From c4ee06251d4212a0d55e2371f2db464f6a1e0901 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Mon, 7 Aug 2017 21:05:15 +0800 Subject: [PATCH 01/13] perf report: Calculate the average cycles of iterations The branch history code has a loop detection function. With this, we can get the number of iterations by calculating the removed loops. While it would be nice for knowing the average cycles of iterations. This patch adds up the cycles in branch entries of removed loops and save the result to the next branch entry (e.g. branch entry A). Finally it will display the iteration number and average cycles at the "from" of branch entry A. For example: perf record -g -j any,save_type ./div perf report --branch-history --no-children --stdio --22.63%--main div.c:42 (RET CROSS_2M) compute_flag div.c:28 (cycles:2 iter:173115 avg_cycles:2) | --10.73%--compute_flag div.c:27 (RET CROSS_2M) rand rand.c:28 (cycles:1) rand rand.c:28 (RET CROSS_2M) __random random.c:298 (cycles:1) __random random.c:297 (COND_BWD CROSS_2M) __random random.c:295 (cycles:1) __random random.c:295 (COND_BWD CROSS_2M) __random random.c:295 (cycles:1) __random random.c:295 (RET CROSS_2M) Signed-off-by: Yao Jin Reviewed-by: Andi Kleen Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1502111115-18305-1-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 8 +--- tools/perf/ui/stdio/hist.c | 10 ++-- tools/perf/util/callchain.c | 49 +++++++++---------- tools/perf/util/callchain.h | 9 ++-- tools/perf/util/machine.c | 88 ++++++++++++++++++++-------------- 5 files changed, 85 insertions(+), 79 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f4bc2462bc2c..13dfb0a0bdeb 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -931,12 +931,8 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, 
browser->show_dso); if (symbol_conf.show_branchflag_count) { - if (need_percent) - callchain_list_counts__printf_value(node, chain, NULL, - buf, sizeof(buf)); - else - callchain_list_counts__printf_value(NULL, chain, NULL, - buf, sizeof(buf)); + callchain_list_counts__printf_value(chain, NULL, + buf, sizeof(buf)); if (asprintf(&alloc_str2, "%s%s", str, buf) < 0) str = "Not enough memory!"; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 5c95b8301c67..8bdb7a500181 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -124,12 +124,8 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node, str = callchain_list__sym_name(chain, bf, sizeof(bf), false); if (symbol_conf.show_branchflag_count) { - if (!period) - callchain_list_counts__printf_value(node, chain, NULL, - buf, sizeof(buf)); - else - callchain_list_counts__printf_value(NULL, chain, NULL, - buf, sizeof(buf)); + callchain_list_counts__printf_value(chain, NULL, + buf, sizeof(buf)); if (asprintf(&alloc_str, "%s%s", str, buf) < 0) str = "Not enough memory!"; @@ -313,7 +309,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, if (symbol_conf.show_branchflag_count) ret += callchain_list_counts__printf_value( - NULL, chain, fp, NULL, 0); + chain, fp, NULL, 0); ret += fprintf(fp, "\n"); if (++entries_printed == callchain_param.print_limit) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index f320b0777e0d..510b513e0f01 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -588,7 +588,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->cycles_count = cursor_node->branch_flags.cycles; call->iter_count = cursor_node->nr_loop_iter; - call->samples_count = cursor_node->samples; + call->iter_cycles = cursor_node->iter_cycles; } } @@ -722,7 +722,7 @@ static enum match_result match_chain(struct callchain_cursor_node *node, cnode->cycles_count += 
node->branch_flags.cycles; cnode->iter_count += node->nr_loop_iter; - cnode->samples_count += node->samples; + cnode->iter_cycles += node->iter_cycles; } } @@ -998,7 +998,7 @@ int callchain_merge(struct callchain_cursor *cursor, int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, int samples, u64 branch_from) + int nr_loop_iter, u64 iter_cycles, u64 branch_from) { struct callchain_cursor_node *node = *cursor->last; @@ -1016,7 +1016,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor, node->sym = sym; node->branch = branch; node->nr_loop_iter = nr_loop_iter; - node->samples = samples; + node->iter_cycles = iter_cycles; if (flags) memcpy(&node->branch_flags, flags, @@ -1306,7 +1306,7 @@ static int branch_to_str(char *bf, int bfsize, static int branch_from_str(char *bf, int bfsize, u64 branch_count, u64 cycles_count, u64 iter_count, - u64 samples_count) + u64 iter_cycles) { int printed = 0, i = 0; u64 cycles; @@ -1318,9 +1318,13 @@ static int branch_from_str(char *bf, int bfsize, bf + printed, bfsize - printed); } - if (iter_count && samples_count) { - printed += count_pri64_printf(i++, "iterations", - iter_count / samples_count, + if (iter_count) { + printed += count_pri64_printf(i++, "iter", + iter_count, + bf + printed, bfsize - printed); + + printed += count_pri64_printf(i++, "avg_cycles", + iter_cycles / iter_count, bf + printed, bfsize - printed); } @@ -1333,7 +1337,7 @@ static int branch_from_str(char *bf, int bfsize, static int counts_str_build(char *bf, int bfsize, u64 branch_count, u64 predicted_count, u64 abort_count, u64 cycles_count, - u64 iter_count, u64 samples_count, + u64 iter_count, u64 iter_cycles, struct branch_type_stat *brtype_stat) { int printed; @@ -1346,7 +1350,7 @@ static int counts_str_build(char *bf, int bfsize, predicted_count, abort_count, brtype_stat); } else { printed = branch_from_str(bf, bfsize, branch_count, 
- cycles_count, iter_count, samples_count); + cycles_count, iter_count, iter_cycles); } if (!printed) @@ -1358,14 +1362,14 @@ static int counts_str_build(char *bf, int bfsize, static int callchain_counts_printf(FILE *fp, char *bf, int bfsize, u64 branch_count, u64 predicted_count, u64 abort_count, u64 cycles_count, - u64 iter_count, u64 samples_count, + u64 iter_count, u64 iter_cycles, struct branch_type_stat *brtype_stat) { char str[256]; counts_str_build(str, sizeof(str), branch_count, predicted_count, abort_count, cycles_count, - iter_count, samples_count, brtype_stat); + iter_count, iter_cycles, brtype_stat); if (fp) return fprintf(fp, "%s", str); @@ -1373,31 +1377,23 @@ static int callchain_counts_printf(FILE *fp, char *bf, int bfsize, return scnprintf(bf, bfsize, "%s", str); } -int callchain_list_counts__printf_value(struct callchain_node *node, - struct callchain_list *clist, +int callchain_list_counts__printf_value(struct callchain_list *clist, FILE *fp, char *bf, int bfsize) { u64 branch_count, predicted_count; u64 abort_count, cycles_count; - u64 iter_count = 0, samples_count = 0; + u64 iter_count, iter_cycles; branch_count = clist->branch_count; predicted_count = clist->predicted_count; abort_count = clist->abort_count; cycles_count = clist->cycles_count; - - if (node) { - struct callchain_list *call; - - list_for_each_entry(call, &node->val, list) { - iter_count += call->iter_count; - samples_count += call->samples_count; - } - } + iter_count = clist->iter_count; + iter_cycles = clist->iter_cycles; return callchain_counts_printf(fp, bf, bfsize, branch_count, predicted_count, abort_count, - cycles_count, iter_count, samples_count, + cycles_count, iter_count, iter_cycles, &clist->brtype_stat); } @@ -1523,7 +1519,8 @@ int callchain_cursor__copy(struct callchain_cursor *dst, rc = callchain_cursor_append(dst, node->ip, node->map, node->sym, node->branch, &node->branch_flags, - node->nr_loop_iter, node->samples, + node->nr_loop_iter, + node->iter_cycles, 
node->branch_from); if (rc) break; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 97738201464a..1ed6fc61d0a5 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -119,7 +119,7 @@ struct callchain_list { u64 abort_count; u64 cycles_count; u64 iter_count; - u64 samples_count; + u64 iter_cycles; struct branch_type_stat brtype_stat; char *srcline; struct list_head list; @@ -139,7 +139,7 @@ struct callchain_cursor_node { struct branch_flags branch_flags; u64 branch_from; int nr_loop_iter; - int samples; + u64 iter_cycles; struct callchain_cursor_node *next; }; @@ -201,7 +201,7 @@ static inline void callchain_cursor_reset(struct callchain_cursor *cursor) int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, int samples, u64 branch_from); + int nr_loop_iter, u64 iter_cycles, u64 branch_from); /* Close a cursor writing session. Initialize for the reader */ static inline void callchain_cursor_commit(struct callchain_cursor *cursor) @@ -282,8 +282,7 @@ char *callchain_node__scnprintf_value(struct callchain_node *node, int callchain_node__fprintf_value(struct callchain_node *node, FILE *fp, u64 total); -int callchain_list_counts__printf_value(struct callchain_node *node, - struct callchain_list *clist, +int callchain_list_counts__printf_value(struct callchain_list *clist, FILE *fp, char *bf, int bfsize); void free_callchain(struct callchain_root *root); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 5c8eacaca4f4..9eaa95302c86 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1675,6 +1675,11 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, return mi; } +struct iterations { + int nr_loop_iter; + u64 cycles; +}; + static int add_callchain_ip(struct thread *thread, struct callchain_cursor *cursor, struct symbol **parent, @@ -1683,11 +1688,12 @@ 
static int add_callchain_ip(struct thread *thread, u64 ip, bool branch, struct branch_flags *flags, - int nr_loop_iter, - int samples, + struct iterations *iter, u64 branch_from) { struct addr_location al; + int nr_loop_iter = 0; + u64 iter_cycles = 0; al.filtered = 0; al.sym = NULL; @@ -1737,9 +1743,15 @@ static int add_callchain_ip(struct thread *thread, if (symbol_conf.hide_unresolved && al.sym == NULL) return 0; + + if (iter) { + nr_loop_iter = iter->nr_loop_iter; + iter_cycles = iter->cycles; + } + return callchain_cursor_append(cursor, al.addr, al.map, al.sym, - branch, flags, nr_loop_iter, samples, - branch_from); + branch, flags, nr_loop_iter, + iter_cycles, branch_from); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -1760,6 +1772,18 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, return bi; } +static void save_iterations(struct iterations *iter, + struct branch_entry *be, int nr) +{ + int i; + + iter->nr_loop_iter = nr; + iter->cycles = 0; + + for (i = 0; i < nr; i++) + iter->cycles += be[i].flags.cycles; +} + #define CHASHSZ 127 #define CHASHBITS 7 #define NO_ENTRY 0xff @@ -1767,7 +1791,8 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, #define PERF_MAX_BRANCH_DEPTH 127 /* Remove loops. 
*/ -static int remove_loops(struct branch_entry *l, int nr) +static int remove_loops(struct branch_entry *l, int nr, + struct iterations *iter) { int i, j, off; unsigned char chash[CHASHSZ]; @@ -1792,8 +1817,18 @@ static int remove_loops(struct branch_entry *l, int nr) break; } if (is_loop) { - memmove(l + i, l + i + off, - (nr - (i + off)) * sizeof(*l)); + j = nr - (i + off); + if (j > 0) { + save_iterations(iter + i + off, + l + i, off); + + memmove(iter + i, iter + i + off, + j * sizeof(*iter)); + + memmove(l + i, l + i + off, + j * sizeof(*l)); + } + nr -= off; } } @@ -1883,7 +1918,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, - branch, flags, 0, 0, + branch, flags, NULL, branch_from); if (err) return (err < 0) ? err : 0; @@ -1909,7 +1944,6 @@ static int thread__resolve_callchain_sample(struct thread *thread, int i, j, err, nr_entries; int skip_idx = -1; int first_call = 0; - int nr_loop_iter; if (chain) chain_nr = chain->nr; @@ -1942,6 +1976,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, if (branch && callchain_param.branch_callstack) { int nr = min(max_stack, (int)branch->nr); struct branch_entry be[nr]; + struct iterations iter[nr]; if (branch->nr > PERF_MAX_BRANCH_DEPTH) { pr_warning("corrupted branch chain. skipping...\n"); @@ -1972,38 +2007,21 @@ static int thread__resolve_callchain_sample(struct thread *thread, be[i] = branch->entries[branch->nr - i - 1]; } - nr_loop_iter = nr; - nr = remove_loops(be, nr); - - /* - * Get the number of iterations. - * It's only approximation, but good enough in practice. 
- */ - if (nr_loop_iter > nr) - nr_loop_iter = nr_loop_iter - nr + 1; - else - nr_loop_iter = 0; + memset(iter, 0, sizeof(struct iterations) * nr); + nr = remove_loops(be, nr, iter); for (i = 0; i < nr; i++) { - if (i == nr - 1) - err = add_callchain_ip(thread, cursor, parent, - root_al, - NULL, be[i].to, - true, &be[i].flags, - nr_loop_iter, 1, - be[i].from); - else - err = add_callchain_ip(thread, cursor, parent, - root_al, - NULL, be[i].to, - true, &be[i].flags, - 0, 0, be[i].from); + err = add_callchain_ip(thread, cursor, parent, + root_al, + NULL, be[i].to, + true, &be[i].flags, + NULL, be[i].from); if (!err) err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].from, true, &be[i].flags, - 0, 0, 0); + &iter[i], 0); if (err == -EINVAL) break; if (err) @@ -2037,7 +2055,7 @@ check_calls: err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, - false, NULL, 0, 0, 0); + false, NULL, NULL, 0); if (err) return (err < 0) ? err : 0; From 89be3f8ab701180fc0329eff1b076528d64ac56b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Aug 2017 11:46:49 -0300 Subject: [PATCH 02/13] perf syscalltbl: Support glob matching on syscall names With two new methods, one to find the first match, returning its syscall id and its index in whatever internal database it keeps the syscall into, then one to find the next match, if any. Implemented only on arches where we actually read the syscall table from the kernel sources, i.e. x86-64 for now, all the others use the libaudit method for which this returns -1, i.e. just stubs were added, with the actual implementation using whatever libaudit functions for matching that may be available. 
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i0sj4rxk1a63pfe9gl8z8irs@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/syscalltbl.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/syscalltbl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index bbb4c1957578..19e5db90394c 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -19,6 +19,7 @@ #ifdef HAVE_SYSCALL_TABLE #include #include +#include "string2.h" #include "util.h" #if defined(__x86_64__) @@ -105,6 +106,27 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name) return sc ? sc->id : -1; } +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + int i; + struct syscall *syscalls = tbl->syscalls.entries; + + for (i = *idx + 1; i < tbl->syscalls.nr_entries; ++i) { + if (strglobmatch(syscalls[i].name, syscall_glob)) { + *idx = i; + return syscalls[i].id; + } + } + + return -1; +} + +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + *idx = -1; + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); +} + #else /* HAVE_SYSCALL_TABLE */ #include @@ -131,4 +153,15 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name) { return audit_name_to_syscall(name, tbl->audit_machine); } + +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused, + const char *syscall_glob __maybe_unused, int *idx __maybe_unused) +{ + return -1; +} + +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); +} #endif /* HAVE_SYSCALL_TABLE */ diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h index e2951510484f..e9fb8786da7c 100644 --- a/tools/perf/util/syscalltbl.h +++ b/tools/perf/util/syscalltbl.h @@ -17,4 +17,7 
@@ void syscalltbl__delete(struct syscalltbl *tbl); const char *syscalltbl__name(const struct syscalltbl *tbl, int id); int syscalltbl__id(struct syscalltbl *tbl, const char *name); +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx); +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx); + #endif /* __PERF_SYSCALLTBL_H */ From 27702bcfe8a125a1feeeb5f07526d63b20cac47f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Aug 2017 11:50:04 -0300 Subject: [PATCH 03/13] perf trace: Support syscall name globbing So now we can use: # perf trace -e pkey_* 532.784 ( 0.006 ms): pkey/16018 pkey_alloc(init_val: DISABLE_WRITE) = -1 EINVAL Invalid argument 532.795 ( 0.004 ms): pkey/16018 pkey_mprotect(start: 0x7f380d0a6000, len: 4096, prot: READ|WRITE, pkey: -1) = 0 532.801 ( 0.002 ms): pkey/16018 pkey_free(pkey: -1 ) = -1 EINVAL Invalid argument ^C[root@jouet ~]# Or '-e epoll*', '-e *msg*', etc. Combining syscall names with perf events, tracepoints, etc, continues to be valid, i.e. this is possible: # perf probe -L sys_nanosleep 0 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { struct timespec64 tu; 5 if (get_timespec64(&tu, rqtp)) 6 return -EFAULT; if (!timespec64_valid(&tu)) 9 return -EINVAL; 11 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; 12 current->restart_block.nanosleep.rmtp = rmtp; 13 return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } # perf probe my_probe="sys_nanosleep:12 rmtp" Added new event: probe:my_probe (on sys_nanosleep:12 with rmtp) You can now use it in all perf tools, such as: perf record -e probe:my_probe -aR sleep 1 # # perf trace -e probe:my_probe/max-stack=5/,*sleep sleep 1 0.427 ( 0.003 ms): sleep/16690 nanosleep(rqtp: 0x7ffefc245090) ... 
0.430 ( ): probe:my_probe:(ffffffffbd112923) rmtp=0) sys_nanosleep ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) return_from_SYSCALL_64 ([kernel.kallsyms]) __nanosleep_nocancel (/usr/lib64/libc-2.25.so) 0.427 (1000.208 ms): sleep/16690 ... [continued]: nanosleep()) = 0 # Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-elycoi8wy6y0w9dkj7ox1mzz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 2 +- tools/perf/builtin-trace.c | 39 ++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index c1e3288a2dfb..d53bea6bd571 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -37,7 +37,7 @@ OPTIONS --expr:: --event:: List of syscalls and other perf events (tracepoints, HW cache events, - etc) to show. + etc) to show. Globbing is supported, e.g.: "epoll_*", "*msg*", etc. See 'perf list' for a complete list of events. Prefixing with ! shows all syscalls but the ones specified. You may need to escape it. 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d59cdadf3a79..771ddab94bb0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1261,6 +1261,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) static int trace__validate_ev_qualifier(struct trace *trace) { int err = 0, i; + size_t nr_allocated; struct str_node *pos; trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); @@ -1274,13 +1275,18 @@ static int trace__validate_ev_qualifier(struct trace *trace) goto out; } + nr_allocated = trace->ev_qualifier_ids.nr; i = 0; strlist__for_each_entry(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = syscalltbl__id(trace->sctbl, sc); + int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; if (id < 0) { + id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); + if (id >= 0) + goto matches; + if (err == 0) { fputs("Error:\tInvalid syscall ", trace->output); err = -EINVAL; @@ -1290,13 +1296,37 @@ static int trace__validate_ev_qualifier(struct trace *trace) fputs(sc, trace->output); } - +matches: trace->ev_qualifier_ids.entries[i++] = id; + if (match_next == -1) + continue; + + while (1) { + id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); + if (id < 0) + break; + if (nr_allocated == trace->ev_qualifier_ids.nr) { + void *entries; + + nr_allocated += 8; + entries = realloc(trace->ev_qualifier_ids.entries, + nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0])); + if (entries == NULL) { + err = -ENOMEM; + fputs("\nError:\t Not enough memory for parsing\n", trace->output); + goto out_free; + } + trace->ev_qualifier_ids.entries = entries; + } + trace->ev_qualifier_ids.nr++; + trace->ev_qualifier_ids.entries[i++] = id; + } } if (err < 0) { fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" "\nHint:\tand: 'man syscalls'\n", trace->output); +out_free: zfree(&trace->ev_qualifier_ids.entries); trace->ev_qualifier_ids.nr = 0; } @@ -2814,7 +2844,7 @@ 
static int trace__parse_events_option(const struct option *opt, const char *str, struct trace *trace = (struct trace *)opt->value; const char *s = str; char *sep = NULL, *lists[2] = { NULL, NULL, }; - int len = strlen(str) + 1, err = -1, list; + int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; @@ -2831,7 +2861,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str, *sep = '\0'; list = 0; - if (syscalltbl__id(trace->sctbl, s) >= 0) { + if (syscalltbl__id(trace->sctbl, s) >= 0 || + syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { list = 1; } else { path__join(group_name, sizeof(group_name), strace_groups_dir, s); From 9a805d8648ee09c136130fe4114a09574bc0b1ef Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 31 Aug 2017 14:44:56 +0530 Subject: [PATCH 04/13] perf test powerpc: Fix 'Object code reading' test 'Object code reading' test always fails on powerpc guest. Two reasons for the failure are: 1. When elf section is too big (size beyond 'unsigned int' max value). objdump fails to disassemble from such section. This was fixed with commit 0f6329bd7fc ("binutils/objdump: Fix disassemble for huge elf sections") in binutils. 2. When the sample is from hypervisor. Hypervisor symbols can not be resolved within guest and thus thread__find_addr_map() fails for such symbols. Fix this by ignoring hypervisor symbols in the test. 
Signed-off-by: Ravi Bangoria Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1504170896-7876-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 761c5a448c56..466a462b26d1 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -237,6 +237,11 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al); if (!al.map || !al.map->dso) { + if (cpumode == PERF_RECORD_MISC_HYPERVISOR) { + pr_debug("Hypervisor address can not be resolved - skipping\n"); + return 0; + } + pr_debug("thread__find_addr_map failed\n"); return -1; } From 4fb205392022ba99a45dd01a62c6e2df046e400a Mon Sep 17 00:00:00 2001 From: Jack Henschel Date: Thu, 31 Aug 2017 10:05:35 +0200 Subject: [PATCH 05/13] perf intel-pt: Fix syntax in documentation of config option As specified in tools/perf/Documentation/perf-config.txt, perf configuration items must be in 'key = value' format, otherwise the following error message occurs: $ perf record -e intel_pt//u -- ls bad config file line 2 in ~/.perfconfig $ cat .perfconfig [intel-pt] mispred-all Changing to assigning a value to the key 'mispred-all' fixes the issue: $ perf record -e intel_pt//u -- ls [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.031 MB perf.data] $ cat .perfconfig [intel-pt] mispred-all = true Signed-off-by: Jack Henschel Cc: Alexander Shishkin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170831080535.2157-1-jackdev@mailbox.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/intel-pt.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index ab1b0825130a..76971d2e4164 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -873,7 +873,7 @@ amended to take the number of elements as a parameter. $ cat ~/.perfconfig [intel-pt] - mispred-all + mispred-all = on $ perf record -e intel_pt//u ./sort 3000 Bubble sorting array of 3000 elements From 2a118e1bd22cad57318520d37e3a184b8846c6a2 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 30 Aug 2017 21:42:23 -0400 Subject: [PATCH 06/13] perf vendor events powerpc: Remove duplicate events Some POWER PMU event names have multiple/alternate event codes. These alternate event codes were listed in the POWER9 JSON files for reference. But the perf tool does not seem to handle duplicates cleanly. 'perf list' shows such duplicate events only once, but 'perf stat' ends up counting the first event code twice, multiplexing if necessary and we end up with double the event counts. Remove the duplicate event codes from the JSON files for now. 
Reported-by: Michael Petlan Signed-off-by: Sukadev Bhattiprolu Cc: Andi Kleen Cc: Anton Blanchard Cc: Jiri Olsa Cc: Michael Ellerman Link: http://lkml.kernel.org/r/20170830231506.GB20351@us.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/powerpc/power9/frontend.json | 7 +- .../pmu-events/arch/powerpc/power9/other.json | 120 ------------------ .../arch/powerpc/power9/pipeline.json | 7 +- .../pmu-events/arch/powerpc/power9/pmc.json | 7 +- 4 files changed, 3 insertions(+), 138 deletions(-) diff --git a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json index 7e62c46d7a20..c63a919eda98 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json @@ -79,11 +79,6 @@ "EventName": "PM_LD_MISS_L1", "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." }, - {, - "EventCode": "0x400F0", - "EventName": "PM_LD_MISS_L1", - "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." 
- }, {, "EventCode": "0x2E01A", "EventName": "PM_CMPLU_STALL_LSU_FLUSH_NEXT", @@ -374,4 +369,4 @@ "EventName": "PM_IPTEG_FROM_L31_ECO_MOD", "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request" } -] \ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/powerpc/power9/other.json b/tools/perf/pmu-events/arch/powerpc/power9/other.json index 00f3d2a21f31..54cc3be00fc2 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/other.json @@ -604,11 +604,6 @@ "EventName": "PM_L2_RTY_LD", "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" }, - {, - "EventCode": "0x3689E", - "EventName": "PM_L2_RTY_LD", - "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" - }, {, "EventCode": "0xE08C", "EventName": "PM_LSU0_ERAT_HIT", @@ -714,11 +709,6 @@ "EventName": "PM_L3_RD0_BUSY", "BriefDescription": "Lifetime, sample of RD machine 0 valid" }, - {, - "EventCode": "0x468B4", - "EventName": "PM_L3_RD0_BUSY", - "BriefDescription": "Lifetime, sample of RD machine 0 valid" - }, {, "EventCode": "0x46080", "EventName": "PM_L2_DISP_ALL_L2MISS", @@ -849,21 +839,11 @@ "EventName": "PM_RC0_BUSY", "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" }, - {, - "EventCode": "0x2608C", - "EventName": "PM_RC0_BUSY", - "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" - }, {, "EventCode": "0x36082", "EventName": "PM_L2_LD_DISP", "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)." 
}, - {, - "EventCode": "0x1609E", - "EventName": "PM_L2_LD_DISP", - "BriefDescription": "All successful D side load dispatches for this thread (L2 miss + L2 hits)" - }, {, "EventCode": "0xF8B0", "EventName": "PM_L3_SW_PREF", @@ -1039,11 +1019,6 @@ "EventName": "PM_L3_CO_MEPF", "BriefDescription": "L3 castouts in Mepf state for this thread" }, - {, - "EventCode": "0x168A0", - "EventName": "PM_L3_CO_MEPF", - "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory). The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request" - }, {, "EventCode": "0x460A2", "EventName": "PM_L3_LAT_CI_HIT", @@ -1149,11 +1124,6 @@ "EventName": "PM_L2_RTY_ST", "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" }, - {, - "EventCode": "0x4689E", - "EventName": "PM_L2_RTY_ST", - "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" - }, {, "EventCode": "0x24040", "EventName": "PM_INST_FROM_L2_MEPF", @@ -1254,11 +1224,6 @@ "EventName": "PM_CO0_BUSY", "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)" }, - {, - "EventCode": "0x4608C", - "EventName": "PM_CO0_BUSY", - "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)" - }, {, "EventCode": "0x2C122", "EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC", @@ -1394,11 +1359,6 @@ "EventName": "PM_IPTEG_FROM_LMEM", "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request" }, - {, - "EventCode": "0x40006", - "EventName": "PM_ISLB_MISS", - "BriefDescription": "Number of ISLB misses for this thread" - }, {, "EventCode": "0xD8A8", "EventName": "PM_ISLB_MISS", @@ -1514,11 +1474,6 @@ "EventName": "PM_L2_INST", "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)." 
}, - {, - "EventCode": "0x3609E", - "EventName": "PM_L2_INST", - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" - }, {, "EventCode": "0x3504C", "EventName": "PM_IPTEG_FROM_DL4", @@ -1689,11 +1644,6 @@ "EventName": "PM_L2_LD_HIT", "BriefDescription": "All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs)" }, - {, - "EventCode": "0x2609E", - "EventName": "PM_L2_LD_HIT", - "BriefDescription": "All successful D side load dispatches for this thread that were L2 hits for this thread" - }, {, "EventCode": "0x168AC", "EventName": "PM_L3_CI_USAGE", @@ -1794,21 +1744,11 @@ "EventName": "PM_L3_WI0_BUSY", "BriefDescription": "Rotating sample of 8 WI valid" }, - {, - "EventCode": "0x260B6", - "EventName": "PM_L3_WI0_BUSY", - "BriefDescription": "Rotating sample of 8 WI valid (duplicate)" - }, {, "EventCode": "0x368AC", "EventName": "PM_L3_CO0_BUSY", "BriefDescription": "Lifetime, sample of CO machine 0 valid" }, - {, - "EventCode": "0x468AC", - "EventName": "PM_L3_CO0_BUSY", - "BriefDescription": "Lifetime, sample of CO machine 0 valid" - }, {, "EventCode": "0x2E040", "EventName": "PM_DPTEG_FROM_L2_MEPF", @@ -1839,11 +1779,6 @@ "EventName": "PM_L3_P0_PF_RTY", "BriefDescription": "L3 PF received retry port 0, every retry counted" }, - {, - "EventCode": "0x260AE", - "EventName": "PM_L3_P0_PF_RTY", - "BriefDescription": "L3 PF received retry port 0, every retry counted" - }, {, "EventCode": "0x268B2", "EventName": "PM_L3_LOC_GUESS_WRONG", @@ -1894,11 +1829,6 @@ "EventName": "PM_L3_SN0_BUSY", "BriefDescription": "Lifetime, sample of snooper machine 0 valid" }, - {, - "EventCode": "0x460AC", - "EventName": "PM_L3_SN0_BUSY", - "BriefDescription": "Lifetime, sample of snooper machine 0 valid" - }, {, "EventCode": "0x3005C", "EventName": "PM_BFU_BUSY", @@ -1934,11 +1864,6 @@ "EventName": "PM_L3_PF0_BUSY", "BriefDescription": "Lifetime, sample of PF machine 0 
valid" }, - {, - "EventCode": "0x460B4", - "EventName": "PM_L3_PF0_BUSY", - "BriefDescription": "Lifetime, sample of PF machine 0 valid" - }, {, "EventCode": "0xC0B0", "EventName": "PM_LSU_FLUSH_UE", @@ -2084,11 +2009,6 @@ "EventName": "PM_L3_P1_CO_RTY", "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted" }, - {, - "EventCode": "0x468AE", - "EventName": "PM_L3_P1_CO_RTY", - "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted" - }, {, "EventCode": "0xC0AC", "EventName": "PM_LSU_FLUSH_EMSH", @@ -2194,11 +2114,6 @@ "EventName": "PM_L2_SN_M_WR_DONE", "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" }, - {, - "EventCode": "0x46886", - "EventName": "PM_L2_SN_M_WR_DONE", - "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" - }, {, "EventCode": "0x489C", "EventName": "PM_BR_CORECT_PRED_TAKEN_CMPL", @@ -2289,21 +2204,11 @@ "EventName": "PM_SN0_BUSY", "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)" }, - {, - "EventCode": "0x26090", - "EventName": "PM_SN0_BUSY", - "BriefDescription": "SN mach 0 Busy. 
Used by PMU to sample ave SN lifetime (mach0 used as sample point)" - }, {, "EventCode": "0x360AE", "EventName": "PM_L3_P0_CO_RTY", "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" }, - {, - "EventCode": "0x460AE", - "EventName": "PM_L3_P0_CO_RTY", - "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" - }, {, "EventCode": "0x168A8", "EventName": "PM_L3_WI_USAGE", @@ -2339,26 +2244,11 @@ "EventName": "PM_L3_P1_PF_RTY", "BriefDescription": "L3 PF received retry port 1, every retry counted" }, - {, - "EventCode": "0x268AE", - "EventName": "PM_L3_P1_PF_RTY", - "BriefDescription": "L3 PF received retry port 3, every retry counted" - }, {, "EventCode": "0x46082", "EventName": "PM_L2_ST_DISP", "BriefDescription": "All successful D-side store dispatches for this thread " }, - {, - "EventCode": "0x1689E", - "EventName": "PM_L2_ST_DISP", - "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)" - }, - {, - "EventCode": "0x36880", - "EventName": "PM_L2_INST_MISS", - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" - }, {, "EventCode": "0x4609E", "EventName": "PM_L2_INST_MISS", @@ -2429,11 +2319,6 @@ "EventName": "PM_INST_DISP", "BriefDescription": "# PPC Dispatched" }, - {, - "EventCode": "0x300F2", - "EventName": "PM_INST_DISP", - "BriefDescription": "# PPC Dispatched" - }, {, "EventCode": "0x4E05E", "EventName": "PM_TM_OUTER_TBEGIN_DISP", @@ -2459,11 +2344,6 @@ "EventName": "PM_L2_ST_HIT", "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits" }, - {, - "EventCode": "0x2689E", - "EventName": "PM_L2_ST_HIT", - "BriefDescription": "All successful D-side store dispatches that were L2 hits for this thread" - }, {, "EventCode": "0x360A8", "EventName": "PM_L3_CO", diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json 
b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json index 47a82568a8df..bc2db636dabf 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json @@ -419,11 +419,6 @@ "EventName": "PM_INST_GRP_PUMP_MPRED_RTY", "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch" }, - {, - "EventCode": "0x10016", - "EventName": "PM_DSLB_MISS", - "BriefDescription": "Data SLB Miss - Total of all segment sizes" - }, {, "EventCode": "0xD0A8", "EventName": "PM_DSLB_MISS", @@ -554,4 +549,4 @@ "EventName": "PM_MRK_DATA_FROM_L21_SHR_CYC", "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load" } -] \ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json index a2c95a99e168..3ef8a10aac86 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json @@ -4,11 +4,6 @@ "EventName": "PM_BR_2PATH", "BriefDescription": "Branches that are not strongly biased" }, - {, - "EventCode": "0x40036", - "EventName": "PM_BR_2PATH", - "BriefDescription": "Branches that are not strongly biased" - }, {, "EventCode": "0x40056", "EventName": "PM_MEM_LOC_THRESH_LSU_HIGH", @@ -124,4 +119,4 @@ "EventName": "PM_1FLOP_CMPL", "BriefDescription": "one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed" } -] \ No newline at end of file +] From 3b0a5daa061076b2b75ffc294e74483ad9bf241a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:08 -0400 Subject: [PATCH 07/13] perf tools: Support new sample type for physical address Support new sample type PERF_SAMPLE_PHYS_ADDR for physical address. Add new option --phys-data to record sample physical address. 
Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-2-git-send-email-kan.liang@intel.com [ Added missing printing in evsel.c patch sent by Jiri Olsa ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 4 +++- tools/perf/Documentation/perf-record.txt | 5 ++++- tools/perf/builtin-record.c | 2 ++ tools/perf/perf.h | 1 + tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 19 ++++++++++++++++++- 6 files changed, 29 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 2a37ae925d85..140ae638cfd6 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 9bdea047c5db..e397453e5a46 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -249,7 +249,10 @@ OPTIONS -d:: --data:: - Record the sample addresses. + Record the sample virtual addresses. + +--phys-data:: + Record the sample physical addresses. 
-T:: --timestamp:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 36d7117a7562..56f8142ff97f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1604,6 +1604,8 @@ static struct option __record_options[] = { OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, "per thread counts"), OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), + OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, + "Record the sample physical addresses"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2c010dd6a79d..dc442ba21bf6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -43,6 +43,7 @@ struct record_opts { bool no_samples; bool raw_samples; bool sample_address; + bool sample_phys_addr; bool sample_weight; bool sample_time; bool sample_time_set; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 423ac82605f3..ee7bcc898d35 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -200,6 +200,7 @@ struct perf_sample { u32 cpu; u32 raw_size; u64 data_src; + u64 phys_addr; u32 flags; u16 insn_len; u8 cpumode; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d9bd632ed7db..4bb89373eb52 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -955,6 +955,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, if (opts->sample_address) perf_evsel__set_sample_bit(evsel, DATA_SRC); + if (opts->sample_phys_addr) + perf_evsel__set_sample_bit(evsel, PHYS_ADDR); + if (opts->no_buffering) { attr->watermark = 0; attr->wakeup_events = 1; @@ -1464,7 +1467,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), 
bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), - bit_name(WEIGHT), + bit_name(WEIGHT), bit_name(PHYS_ADDR), { .name = NULL, } }; #undef bit_name @@ -2206,6 +2209,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event, } } + data->phys_addr = 0; + if (type & PERF_SAMPLE_PHYS_ADDR) { + data->phys_addr = *array; + array++; + } + return 0; } @@ -2311,6 +2320,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) + result += sizeof(u64); + return result; } @@ -2500,6 +2512,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) { + *array = sample->phys_addr; + array++; + } + return 0; } From 8780fb25ab060bafa5a8149e79b703e0fc7ee847 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:09 -0400 Subject: [PATCH 08/13] perf sort: Add sort option for physical address Add a new sort option "phys_daddr" for --mem-mode sort. With this option applied, perf can sort and report by sample's physical address. 
Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/util/hist.c | 4 +++ tools/perf/util/hist.h | 1 + tools/perf/util/machine.c | 8 +++-- tools/perf/util/session.c | 3 ++ tools/perf/util/sort.c | 42 ++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + tools/perf/util/symbol.h | 1 + 8 files changed, 59 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 9fa84617181e..383a98d992ed 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -137,6 +137,7 @@ OPTIONS - mem: type of memory access for the data at the time of the sample - snoop: type of snoop (if any) for the data at the time of the sample - dcacheline: the cacheline the data address is on at the time of the sample + - phys_daddr: physical address of data being executed on at the time of sample And the default sort keys are changed to local_weight, mem, sym, dso, symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'. 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9453b2e27015..e60d8d8ea4c2 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -167,6 +167,10 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen = unresolved_col_width + 4 + 2; hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO); } + + hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR, + unresolved_col_width + 4 + 2); + } else { symlen = unresolved_col_width + 4 + 2; hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index ee3670a388df..e60dda26a920 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -47,6 +47,7 @@ enum hist_column { HISTC_GLOBAL_WEIGHT, HISTC_MEM_DADDR_SYMBOL, HISTC_MEM_DADDR_DSO, + HISTC_MEM_PHYS_DADDR, HISTC_MEM_LOCKED, HISTC_MEM_TLB, HISTC_MEM_LVL, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 9eaa95302c86..df709363ef69 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1635,10 +1635,12 @@ static void ip__resolve_ams(struct thread *thread, ams->al_addr = al.addr; ams->sym = al.sym; ams->map = al.map; + ams->phys_addr = 0; } static void ip__resolve_data(struct thread *thread, - u8 m, struct addr_map_symbol *ams, u64 addr) + u8 m, struct addr_map_symbol *ams, + u64 addr, u64 phys_addr) { struct addr_location al; @@ -1658,6 +1660,7 @@ static void ip__resolve_data(struct thread *thread, ams->al_addr = al.addr; ams->sym = al.sym; ams->map = al.map; + ams->phys_addr = phys_addr; } struct mem_info *sample__resolve_mem(struct perf_sample *sample, @@ -1669,7 +1672,8 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, return NULL; ip__resolve_ams(al->thread, &mi->iaddr, sample->ip); - ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr); + ip__resolve_data(al->thread, al->cpumode, &mi->daddr, + sample->addr, sample->phys_addr); mi->data_src.val = sample->data_src; return mi; diff 
--git a/tools/perf/util/session.c b/tools/perf/util/session.c index ac863691605f..a7ebd9fe8e40 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1120,6 +1120,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr); + if (sample_type & PERF_SAMPLE_TRANSACTION) printf("... transaction: %" PRIx64 "\n", sample->transaction); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 12359bd986db..eb3ab902a1c0 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1315,6 +1315,47 @@ struct sort_entry sort_mem_dcacheline = { .se_width_idx = HISTC_MEM_DCACHELINE, }; +static int64_t +sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right) +{ + uint64_t l = 0, r = 0; + + if (left->mem_info) + l = left->mem_info->daddr.phys_addr; + if (right->mem_info) + r = right->mem_info->daddr.phys_addr; + + return (int64_t)(r - l); +} + +static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + uint64_t addr = 0; + size_t ret = 0; + size_t len = BITS_PER_LONG / 4; + + addr = he->mem_info->daddr.phys_addr; + + ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level); + + ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, addr); + + ret += repsep_snprintf(bf + ret, size - ret, "%-*s", width - ret, ""); + + if (ret > width) + bf[width] = '\0'; + + return width; +} + +struct sort_entry sort_mem_phys_daddr = { + .se_header = "Data Physical Address", + .se_cmp = sort__phys_daddr_cmp, + .se_snprintf = hist_entry__phys_daddr_snprintf, + .se_width_idx = HISTC_MEM_PHYS_DADDR, +}; + static int64_t sort__abort_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -1547,6 +1588,7 @@ static struct sort_dimension memory_sort_dimensions[] = 
{ DIM(SORT_MEM_LVL, "mem", sort_mem_lvl), DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop), DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline), + DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index b7c75597e18f..f36dc4980a6c 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -245,6 +245,7 @@ enum sort_type { SORT_MEM_SNOOP, SORT_MEM_DCACHELINE, SORT_MEM_IADDR_SYMBOL, + SORT_MEM_PHYS_DADDR, }; /* diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index d00a012cfdfb..2bd6a1f01a1c 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -186,6 +186,7 @@ struct addr_map_symbol { struct symbol *sym; u64 addr; u64 al_addr; + u64 phys_addr; }; struct branch_info { From c35aeb9dfe512422ca9ea28aae692c8f1d052b2d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:10 -0400 Subject: [PATCH 09/13] perf mem: Support physical address Add option phys-data in "perf mem" to record/report physical address. The default mem sort order for physical address is changed accordingly. Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-mem.txt | 4 ++ tools/perf/builtin-mem.c | 99 +++++++++++++++++++-------- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 73496320fca3..4be08a1e3f8d 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -59,6 +59,10 @@ OPTIONS --ldload:: Specify desired latency for loads event. 
+-p:: +--phys-data:: + Record/Report sample physical addresses + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index e001c0290793..0f15634ef82c 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -23,6 +23,7 @@ struct perf_mem { bool hide_unresolved; bool dump_raw; bool force; + bool phys_addr; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -101,6 +102,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = "-d"; + if (mem->phys_addr) + rec_argv[i++] = "--phys-data"; + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { if (!perf_mem_events[j].record) continue; @@ -161,30 +165,60 @@ dump_raw_samples(struct perf_tool *tool, if (al.map != NULL) al.map->dso->hit = 1; - if (symbol_conf.field_sep) { - fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 - "%s0x%"PRIx64"%s%s:%s\n"; - } else { - fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 - "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; - symbol_conf.field_sep = " "; - } + if (mem->phys_addr) { + if (symbol_conf.field_sep) { + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64 + "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n"; + } else { + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 + "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64 + "%s%s:%s\n"; + symbol_conf.field_sep = " "; + } - printf(fmt, - sample->pid, - symbol_conf.field_sep, - sample->tid, - symbol_conf.field_sep, - sample->ip, - symbol_conf.field_sep, - sample->addr, - symbol_conf.field_sep, - sample->weight, - symbol_conf.field_sep, - sample->data_src, - symbol_conf.field_sep, - al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", - al.sym ? 
al.sym->name : "???"); + printf(fmt, + sample->pid, + symbol_conf.field_sep, + sample->tid, + symbol_conf.field_sep, + sample->ip, + symbol_conf.field_sep, + sample->addr, + symbol_conf.field_sep, + sample->phys_addr, + symbol_conf.field_sep, + sample->weight, + symbol_conf.field_sep, + sample->data_src, + symbol_conf.field_sep, + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", + al.sym ? al.sym->name : "???"); + } else { + if (symbol_conf.field_sep) { + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 + "%s0x%"PRIx64"%s%s:%s\n"; + } else { + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 + "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; + symbol_conf.field_sep = " "; + } + + printf(fmt, + sample->pid, + symbol_conf.field_sep, + sample->tid, + symbol_conf.field_sep, + sample->ip, + symbol_conf.field_sep, + sample->addr, + symbol_conf.field_sep, + sample->weight, + symbol_conf.field_sep, + sample->data_src, + symbol_conf.field_sep, + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", + al.sym ? 
al.sym->name : "???"); + } out_put: addr_location__put(&al); return 0; @@ -224,7 +258,10 @@ static int report_raw_events(struct perf_mem *mem) if (ret < 0) goto out_delete; - printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); + if (mem->phys_addr) + printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); + else + printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); ret = perf_session__process_events(session); @@ -254,9 +291,16 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) * there is no weight (cost) associated with stores, so don't print * the column */ - if (!(mem->operation & MEM_OPERATION_LOAD)) - rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," - "dso_daddr,tlb,locked"; + if (!(mem->operation & MEM_OPERATION_LOAD)) { + if (mem->phys_addr) + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," + "dso_daddr,tlb,locked,phys_daddr"; + else + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," + "dso_daddr,tlb,locked"; + } else if (mem->phys_addr) + rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr," + "dso_daddr,snoop,tlb,locked,phys_daddr"; for (j = 1; j < argc; j++, i++) rep_argv[i] = argv[j]; @@ -373,6 +417,7 @@ int cmd_mem(int argc, const char **argv) "separator for columns, no spaces will be added" " between columns '.' is reserved."), OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), + OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), OPT_END() }; const char *const mem_subcommands[] = { "record", "report", NULL }; From 49d58f04eb6cdc18b3747fc4243a7114364f5420 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:11 -0400 Subject: [PATCH 10/13] perf script: Support physical address Display the physical address at the tail if it is available. 
Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-5-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 2 +- tools/perf/builtin-script.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5ee8796be96e..18dfcfa38454 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -117,7 +117,7 @@ OPTIONS Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff, - callindent, insn, insnlen, synth. + callindent, insn, insnlen, synth, phys_addr. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. 
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 378f76cdf923..3d4c3b5e1868 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -87,6 +87,7 @@ enum perf_output_field { PERF_OUTPUT_BRSTACKINSN = 1U << 23, PERF_OUTPUT_BRSTACKOFF = 1U << 24, PERF_OUTPUT_SYNTH = 1U << 25, + PERF_OUTPUT_PHYS_ADDR = 1U << 26, }; struct output_option { @@ -119,6 +120,7 @@ struct output_option { {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN}, {.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF}, {.str = "synth", .field = PERF_OUTPUT_SYNTH}, + {.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR}, }; enum { @@ -175,7 +177,8 @@ static struct { PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR | - PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT, + PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT | + PERF_OUTPUT_PHYS_ADDR, .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, }, @@ -382,6 +385,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_IREGS)) return -EINVAL; + if (PRINT_FIELD(PHYS_ADDR) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", + PERF_OUTPUT_PHYS_ADDR)) + return -EINVAL; + return 0; } @@ -1446,6 +1454,9 @@ static void process_event(struct perf_script *script, if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) print_sample_bpf_output(sample); print_insn(sample, attr, thread, machine); + + if (PRINT_FIELD(PHYS_ADDR)) + printf("%16" PRIx64, sample->phys_addr); printf("\n"); } @@ -2729,7 +2740,7 @@ int cmd_script(int argc, const char **argv) "Valid types: hw,sw,trace,raw,synth. 
" "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," "addr,symoff,period,iregs,brstack,brstacksym,flags," - "bpf-output,callindent,insn,insnlen,brstackinsn,synth", + "bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), From fc33dccba39584e403436b9cda3edc9c34b62bce Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 29 Aug 2017 13:11:12 -0400 Subject: [PATCH 11/13] perf test: Add test case for PERF_SAMPLE_PHYS_ADDR Extend sample-parsing test cases to support new sample type PERF_SAMPLE_PHYS_ADDR. Signed-off-by: Kan Liang Tested-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Andi Kleen Cc: Madhavan Srinivasan Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1504026672-7304-6-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sample-parsing.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 6d028f42b3cf..c3858487159d 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -141,6 +141,9 @@ static bool samples_same(const struct perf_sample *s1, } } + if (type & PERF_SAMPLE_PHYS_ADDR) + COMP(phys_addr); + return true; } @@ -206,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .mask = sample_regs, .regs = regs, }, + .phys_addr = 113, }; struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},}; struct perf_sample sample_out; @@ -305,7 +309,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u * were added. Please actually update the test rather than just change * the condition below. 
*/ - if (PERF_SAMPLE_MAX > PERF_SAMPLE_REGS_INTR << 1) { + if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) { pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n"); return -1; } From 63ce8449bc1081711eef1add68909e9bd758de62 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Aug 2017 15:32:18 -0300 Subject: [PATCH 12/13] perf stat: Only auto-merge events that are PMU aliases Peter reported that when he explicitly asked for multiple events with the same name on the command line it got coalesced into just one line, i.e.: # perf stat -e cycles -e cycles -e cycles usleep 1 Performance counter stats for 'usleep 1': 3,269,652 cycles 0.000884123 seconds time elapsed # And while there is the --no-merges option to disable that auto-merging, this is a blunt change in behaviour for such explicit request, so change the code so that this auto merging is done only when handling the multi PMU aliases with the same name that introduced this coalescing, restoring the previous behaviour for the explicit case: # perf stat -e cycles -e cycles -e cycles usleep 1 Performance counter stats for 'usleep 1': 1,472,837 cycles 1,472,837 cycles 1,472,837 cycles 0.001764870 seconds time elapsed # Reported-by: Peter Zijlstra Acked-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Mark Rutland Cc: Namhyung Kim Cc: Wang Nan Fixes: 430daf2dc7af ("perf stat: Collapse identically named events") Link: http://lkml.kernel.org/r/20170831184122.GK4831@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 24 ++++++++++++++++-------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 866da7aa54bf..85e992d9215b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1257,7 +1257,7 @@ static bool collect_data(struct perf_evsel *counter, if
(counter->merged_stat) return false; cb(counter, data, true); - if (!no_merge) + if (!no_merge && counter->auto_merge_stats) collect_all_aliases(counter, cb, data); return true; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 351d3b2d8887..dd2c4b5112a5 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -131,6 +131,7 @@ struct perf_evsel { bool cmdline_group_boundary; struct list_head config_terms; int bpf_fd; + bool auto_merge_stats; bool merged_stat; const char * metric_expr; const char * metric_name; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f44aeba51d1f..f6257fb4f08c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -310,7 +310,7 @@ static struct perf_evsel * __add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct cpu_map *cpus, - struct list_head *config_terms) + struct list_head *config_terms, bool auto_merge_stats) { struct perf_evsel *evsel; @@ -324,6 +324,7 @@ __add_event(struct list_head *list, int *idx, evsel->cpus = cpu_map__get(cpus); evsel->own_cpus = cpu_map__get(cpus); evsel->system_wide = !!cpus; + evsel->auto_merge_stats = auto_merge_stats; if (name) evsel->name = strdup(name); @@ -339,7 +340,7 @@ static int add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct list_head *config_terms) { - return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM; + return __add_event(list, idx, attr, name, NULL, config_terms, false) ? 
0 : -ENOMEM; } static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) @@ -1209,9 +1210,9 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, get_config_name(head_config), &config_terms); } -int parse_events_add_pmu(struct parse_events_state *parse_state, +static int __parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, char *name, - struct list_head *head_config) + struct list_head *head_config, bool auto_merge_stats) { struct perf_event_attr attr; struct perf_pmu_info info; @@ -1232,7 +1233,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (!head_config) { attr.type = pmu->type; - evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL); + evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL, auto_merge_stats); return evsel ? 0 : -ENOMEM; } @@ -1254,7 +1255,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, evsel = __add_event(list, &parse_state->idx, &attr, get_config_name(head_config), pmu->cpus, - &config_terms); + &config_terms, auto_merge_stats); if (evsel) { evsel->unit = info.unit; evsel->scale = info.scale; @@ -1267,6 +1268,13 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, return evsel ? 
0 : -ENOMEM; } +int parse_events_add_pmu(struct parse_events_state *parse_state, + struct list_head *list, char *name, + struct list_head *head_config) +{ + return __parse_events_add_pmu(parse_state, list, name, head_config, false); +} + int parse_events_multi_pmu_add(struct parse_events_state *parse_state, char *str, struct list_head **listp) { @@ -1296,8 +1304,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, return -1; list_add_tail(&term->list, head); - if (!parse_events_add_pmu(parse_state, list, - pmu->name, head)) { + if (!__parse_events_add_pmu(parse_state, list, + pmu->name, head, true)) { pr_debug("%s -> %s/%s/\n", str, pmu->name, alias->str); ok++; From eba9fac017617e685d648339e29a1453a30cb065 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 1 Sep 2017 14:55:40 -0300 Subject: [PATCH 13/13] perf annotate browser: Help for cycling thru hottest instructions with TAB/shift+TAB The popup help accessed via 'h' wasn't mentioning about TAB and shift-TAB, just about 'H', which goes to the hottest line, while the former two are the hotkeys for actually cycling thru the hottest lines. 
Reported-by: Flavio Bruno Leitner Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Taeung Song Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-5ppym6odizfj1ifa4t7neiku@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ba0aee576a2b..786fecaf578e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -829,7 +829,8 @@ static int annotate_browser__run(struct annotate_browser *browser, "q/ESC/CTRL+C Exit\n\n" "ENTER Go to target\n" "ESC Exit\n" - "H Cycle thru hottest instructions\n" + "H Go to hottest instruction\n" + "TAB/shift+TAB Cycle thru hottest instructions\n" "j Toggle showing jump to target arrows\n" "J Toggle showing number of jump sources on targets\n" "n Search next string\n"