#ifndef _BTF_ENCODER_H_
#define _BTF_ENCODER_H_ 1
/*
  SPDX-License-Identifier: GPL-2.0-only

  Copyright (C) 2019 Facebook

  Derived from ctf_encoder.h, which is:

  Copyright (C) Arnaldo Carvalho de Melo <acme@redhat.com>
 */
#include <stdbool.h>
struct btf_encoder;
struct btf;
struct cu;
struct list_head;
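
/*
 * Create an encoder that accumulates BTF generated from DWARF.  The parameter
 * notes below summarize how pahole uses this API and are not normative:
 * base_btf, when not NULL, is used as the base for split BTF; a non-NULL
 * detached_filename asks btf_encoder__encode() to write the BTF to that file
 * instead of adding it to the ELF being processed; skip_encoding_vars and
 * gen_floats control whether variables and BTF_KIND_FLOAT types are emitted.
 */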
struct btf_encoder *btf_encoder__new(struct cu *cu, const char *detached_filename, struct btf *base_btf, bool skip_encoding_vars, bool force, bool gen_floats, bool verbose);
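
/* Free the encoder and any BTF it still holds. */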
void btf_encoder__delete(struct btf_encoder *encoder);
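
/*
 * Finish encoding: deduplicate the accumulated BTF and write it out, either
 * into the input ELF or to the detached file given at creation time.
 * Returns zero on success.
 */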
int btf_encoder__encode(struct btf_encoder *encoder);
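
/*
 * Encode the types, functions and variables of one compile unit into the
 * encoder's BTF; called once per CU as the DWARF loader produces them.
 */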
int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu);
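
/*
 * A minimal sketch of the single-encoder life cycle.  Error handling and the
 * DWARF loading that produces each struct cu are omitted, and the variable
 * names are illustrative only:
 *
 *	struct btf_encoder *enc = btf_encoder__new(cu, NULL, NULL, false,
 *						    false, false, false);
 *
 *	btf_encoder__encode_cu(enc, cu);	// once per compile unit
 *	btf_encoder__encode(enc);		// dedup and write the result
 *	btf_encoder__delete(enc);
 */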
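
/*
 * Per-thread encoder list: with parallel encoding each worker thread owns an
 * encoder and links it into a caller-provided list with btf_encoders__add(),
 * so type info can be added without taking a mutex.
 */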
void btf_encoders__add(struct list_head *encoders, struct btf_encoder *encoder);
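
/* Return the first encoder on the list, e.g. to start a merge pass. */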
struct btf_encoder *btf_encoders__first(struct list_head *encoders);
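
/* Return the encoder following this one on its list. */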
struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
/*
Commit log preserved from the change that switched pahole to per-thread btf
instances (see btf_encoder__btf() below):

pahole: Use per-thread btf instances to avoid mutex locking

Create an instance of btf for each worker thread, and add type info to the
local btf instance in pahole's steal function without taking a mutex.  Once
all worker threads have finished, merge the per-thread btf instances into
the primary btf instance.

Committer testing:

Results with no multithreading, and without further DWARF loading
improvements (not loading things that won't be converted to BTF, etc),
i.e. using pahole 1.21:

# perf stat -r5 pahole --btf_encode /tmp/vmlinux ; btfdiff /tmp/vmlinux

Performance counter stats for 'pahole --btf_encode /tmp/vmlinux' (5 runs):

6,317.41 msec task-clock # 0.985 CPUs utilized ( +- 1.07% )
80 context-switches # 12.478 /sec ( +- 15.25% )
1 cpu-migrations # 0.156 /sec ( +-111.36% )
535,890 page-faults # 83.585 K/sec ( +- 0.00% )
29,789,308,790 cycles # 4.646 GHz ( +- 0.46% ) (83.33%)
97,696,165 stalled-cycles-frontend # 0.33% frontend cycles idle ( +- 4.05% ) (83.34%)
145,554,652 stalled-cycles-backend # 0.49% backend cycles idle ( +- 21.53% ) (83.33%)
78,215,192,264 instructions # 2.61 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.05% ) (83.33%)
18,141,376,637 branches # 2.830 G/sec ( +- 0.06% ) (83.33%)
148,826,657 branch-misses # 0.82% of all branches ( +- 0.65% ) (83.34%)
6.4129 +- 0.0682 seconds time elapsed ( +- 1.06% )
#

Now with pahole 1.23, with just parallel DWARF loading + trimmed DWARF
loading (skipping DWARF tags that won't be converted to BTF, etc):

$ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

10,828.98 msec task-clock:u # 3.539 CPUs utilized ( +- 0.94% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
105,407 page-faults:u # 9.895 K/sec ( +- 0.15% )
24,774,029,571 cycles:u # 2.326 GHz ( +- 0.50% ) (83.49%)
76,895,232 stalled-cycles-frontend:u # 0.31% frontend cycles idle ( +- 4.84% ) (83.50%)
24,821,768 stalled-cycles-backend:u # 0.10% backend cycles idle ( +- 3.66% ) (83.11%)
69,891,360,588 instructions:u # 2.83 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.10% ) (83.20%)
16,966,456,889 branches:u # 1.593 G/sec ( +- 0.21% ) (83.41%)
131,923,443 branch-misses:u # 0.78% of all branches ( +- 0.82% ) (83.42%)
3.0600 +- 0.0140 seconds time elapsed ( +- 0.46% )
$

It is a bit better not to use bare -j, which uses all the CPU threads in the
machine, and instead use just the number of non-hyperthreading cores; on this
machine, a Ryzen 5950x, that is 16 cores:

$ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

10,075.46 msec task-clock:u # 3.431 CPUs utilized ( +- 0.49% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
90,777 page-faults:u # 8.983 K/sec ( +- 0.16% )
22,611,016,624 cycles:u # 2.237 GHz ( +- 0.93% ) (83.34%)
55,760,536 stalled-cycles-frontend:u # 0.24% frontend cycles idle ( +- 2.35% ) (83.25%)
15,985,651 stalled-cycles-backend:u # 0.07% backend cycles idle ( +- 8.79% ) (83.33%)
68,976,319,497 instructions:u # 2.96 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.34% ) (83.39%)
16,770,540,533 branches:u # 1.659 G/sec ( +- 0.31% ) (83.35%)
128,220,385 branch-misses:u # 0.76% of all branches ( +- 0.77% ) (83.37%)
2.9365 +- 0.0284 seconds time elapsed ( +- 0.97% )
$

Then with parallel DWARF loading + parallel BTF encoding (this patch):

$ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

11,063.29 msec task-clock:u # 6.389 CPUs utilized ( +- 0.79% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
163,263 page-faults:u # 14.840 K/sec ( +- 0.48% )
41,892,887,608 cycles:u # 3.808 GHz ( +- 0.96% ) (83.41%)
197,163,158 stalled-cycles-frontend:u # 0.47% frontend cycles idle ( +- 3.23% ) (83.46%)
114,187,423 stalled-cycles-backend:u # 0.27% backend cycles idle ( +- 16.57% ) (83.43%)
74,053,722,204 instructions:u # 1.78 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.18% ) (83.37%)
17,848,238,467 branches:u # 1.622 G/sec ( +- 0.10% ) (83.27%)
180,232,427 branch-misses:u # 1.01% of all branches ( +- 0.86% ) (83.16%)
1.7316 +- 0.0301 seconds time elapsed ( +- 1.74% )
$

Again, it is better not to use bare -j with all the CPU threads:

$ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

6,626.33 msec task-clock:u # 4.421 CPUs utilized ( +- 0.82% )
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
140,919 page-faults:u # 21.240 K/sec ( +- 1.03% )
26,085,701,848 cycles:u # 3.932 GHz ( +- 1.20% ) (83.38%)
98,962,246 stalled-cycles-frontend:u # 0.37% frontend cycles idle ( +- 3.47% ) (83.41%)
102,762,088 stalled-cycles-backend:u # 0.39% backend cycles idle ( +- 17.95% ) (83.38%)
71,193,141,569 instructions:u # 2.69 insn per cycle
# 0.00 stalled cycles per insn ( +- 0.14% ) (83.33%)
17,166,459,728 branches:u # 2.587 G/sec ( +- 0.15% ) (83.27%)
150,984,525 branch-misses:u # 0.87% of all branches ( +- 0.61% ) (83.34%)
1.4989 +- 0.0113 seconds time elapsed ( +- 0.76% )
$

Minor tweaks were made to reduce the patch size, such as not moving the
pthread_mutex_lock(&btf_lock) call to after a comment, etc.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Link: https://lore.kernel.org/r/20220126192039.2840752-4-kuifeng@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
 */
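
/*
 * Access the struct btf being built by this encoder, e.g. so a per-thread
 * BTF can be merged into the primary encoder's BTF.
 */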
struct btf *btf_encoder__btf(struct btf_encoder *encoder);
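
/*
 * A sketch of how the per-thread encoders could be collected and merged.
 * It assumes the kernel-style list helpers pahole ships in list.h, libbpf's
 * btf__add_btf() for the merge, and illustrative names (primary, nthreads,
 * thread_encoder); how the real code walks and merges the list may differ:
 *
 *	struct list_head encoders;
 *	INIT_LIST_HEAD(&encoders);
 *
 *	// in each worker thread, after creating its encoder:
 *	btf_encoders__add(&encoders, thread_encoder);
 *
 *	// once the threads are done, fold every per-thread BTF into the
 *	// primary encoder, then dedup/write it with btf_encoder__encode():
 *	struct btf_encoder *e = btf_encoders__first(&encoders);
 *	for (int i = 0; i < nthreads; i++, e = btf_encoders__next(e))
 *		btf__add_btf(btf_encoder__btf(primary), btf_encoder__btf(e));
 */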
#endif /* _BTF_ENCODER_H_ */