/*
  SPDX-License-Identifier: GPL-2.0-only

  Copyright (C) 2008 Arnaldo Carvalho de Melo <acme@redhat.com>
*/

#include "pahole_strings.h"
#include "gobuffer.h"

#include <search.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#include <bpf/libbpf.h>

#include "dutil.h"
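/*
 * The container is backed by libbpf's struct btf: btf__add_str() and
 * btf__find_str() deduplicate strings internally, which proved
 * significantly faster than the tsearch()-based implementation this
 * replaced.
 */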
struct strings *strings__new(void)
{
	struct strings *strs = malloc(sizeof(*strs));

	if (!strs)
		return NULL;

	strs->btf = btf__new_empty();
	if (libbpf_get_error(strs->btf)) {
		free(strs);
		return NULL;
	}

	return strs;
}
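/* NULL-safe, like free() */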
void strings__delete(struct strings *strs)
{
	if (strs == NULL)
		return;
	btf__free(strs->btf);
	free(strs);
}
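/*
 * Returns the string's offset into the BTF string section. Offset 0 is
 * the empty string, so 0 doubles as the "no string" result for NULL
 * input and for btf__add_str() failures.
 */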
strings_t strings__add(struct strings *strs, const char *str)
{
	strings_t index;

	if (str == NULL)
		return 0;

	index = btf__add_str(strs->btf, str);
	if (index < 0)
		return 0;

	return index;
}
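/*
 * Lookup without insertion: btf__find_str() returns the offset of an
 * already-interned string, or a negative libbpf error code if it was
 * never added.
 */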
strings_t strings__find(struct strings *strs, const char *str)
{
	return btf__find_str(strs->btf, str);
}
/* a horrible and inefficient hack to get string section size out of BTF */
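/*
 * libbpf has no API to report the string section size, so fetch the raw
 * BTF image and read str_len straight from its btf_header.
 */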
strings_t strings__size(const struct strings *strs)
{
	const struct btf_header *p;
	uint32_t sz;

	p = btf__get_raw_data(strs->btf, &sz);
	if (!p)
		return -1;

	return p->str_len;
}
/* similarly horrible hack to copy the string section out of BTF */
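/*
 * dst is assumed to hold at least strings__size(strs) bytes. Note that
 * btf_header section offsets are relative to the end of the header, so
 * the string data lives at hdr_len + str_off bytes into the raw image.
 */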
int strings__copy(const struct strings *strs, void *dst)
{
	const struct btf_header *p;
	uint32_t sz;

	p = btf__get_raw_data(strs->btf, &sz);
	if (!p)
		return -1;

	memcpy(dst, (void *)p + p->hdr_len + p->str_off, p->str_len);
	return 0;
}