dwarves/btf_encoder.h


#ifndef _BTF_ENCODER_H_
#define _BTF_ENCODER_H_ 1
/*
SPDX-License-Identifier: GPL-2.0-only
Copyright (C) 2019 Facebook
Derived from ctf_encoder.h, which is:
Copyright (C) Arnaldo Carvalho de Melo <acme@redhat.com>
*/
#include <stdbool.h>
struct btf_encoder;
struct btf;
struct cu;
struct list_head;
struct btf_encoder *btf_encoder__new(struct cu *cu, const char *detached_filename, struct btf *base_btf, bool skip_encoding_vars, bool force, bool gen_floats, bool verbose);
void btf_encoder__delete(struct btf_encoder *encoder);
int btf_encoder__encode(struct btf_encoder *encoder);
btf: Allow multiple cu's in dwarf->btf conversion

Currently, the pahole dwarf->btf conversion only supports one compilation
unit. This is not ideal since we would like using pahole to generate BTF
for vmlinux which has a lot of compilation units.

This patch added support to process multiple compilation units per ELF
file. Multiple ELF files are also supported properly.

The following is a demonstration example:

  -bash-4.4$ cat t1.c
  struct t1 { int a1; } g1;
  int main(void) { return 0; }
  -bash-4.4$ cat t2.c
  struct t2 { char a2; } g2;
  int main() { return 0; }
  -bash-4.4$ cat t3.c
  struct t3 { unsigned char a1:4; } g1;
  int main(void) { return 0; }
  -bash-4.4$ cat t4.c
  struct t4 { volatile char a4; } g2;
  int main() { return 0; }
  -bash-4.4$ gcc -O2 -o t1 -g t1.c t2.c
  -bash-4.4$ gcc -O2 -o t3 -g t3.c t4.c

Note that both the binary "t1" and "t3" have two compilation units in
their respective dwarf debug_info sections.

The following is the pahole verbose output for BTF conversion for these
two binaries.

  -bash-4.4$ pahole -JV t1 t3
  File t1:
  [1] STRUCT t1 size=4 vlen=1
          a1 type_id=2 bits_offset=0
  [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [3] STRUCT t2 size=1 vlen=1
          a2 type_id=4 bits_offset=0
  [4] INT char size=1 bit_offset=0 nr_bits=8 encoding=(none)
  [5] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  File t3:
  [1] STRUCT t3 size=1 vlen=1
          a1 type_id=3 bits_offset=0
  [2] INT unsigned char size=1 bit_offset=0 nr_bits=8 encoding=(none)
  [3] INT unsigned char size=1 bit_offset=0 nr_bits=4 encoding=(none)
  [4] INT (anon) size=4 bit_offset=0 nr_bits=32 encoding=(none)
  [5] STRUCT t4 size=1 vlen=1
          a4 type_id=6 bits_offset=0
  [6] VOLATILE (anon) type_id=7
  [7] INT char size=1 bit_offset=0 nr_bits=8 encoding=(none)
  [8] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2018-12-18 23:09:39 +01:00
int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu);
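The multi-CU flow that the commit above describes can be sketched as
follows. This is a minimal, hedged illustration rather than pahole's actual
driver code: the encode_cus_sketch() helper, the struct cu * array and the
all-false option flags are assumptions made for the example; only the
btf_encoder__*() calls come from this header.

/* Minimal sketch: feed several compilation units into one encoder.
 * The cus array, its length and the error handling style are
 * illustrative assumptions; only the btf_encoder__*() calls are real. */
#include "btf_encoder.h"

int encode_cus_sketch(struct cu **cus, int nr_cus)
{
	/* Seed the encoder with the first CU; no detached output file,
	 * no base BTF, all optional features switched off. */
	struct btf_encoder *encoder = btf_encoder__new(cus[0], NULL, NULL,
						       false /* skip_encoding_vars */,
						       false /* force */,
						       false /* gen_floats */,
						       false /* verbose */);
	int err = -1;

	if (encoder == NULL)
		return -1;

	/* Add the types of every compilation unit to the same encoder. */
	for (int i = 0; i < nr_cus; i++) {
		err = btf_encoder__encode_cu(encoder, cus[i]);
		if (err < 0)
			goto out;
	}

	/* Finalize the accumulated BTF (in pahole's flow this is where
	 * deduplication and writing happen). */
	err = btf_encoder__encode(encoder);
out:
	btf_encoder__delete(encoder);
	return err;
}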
void btf_encoders__add(struct list_head *encoders, struct btf_encoder *encoder);
struct btf_encoder *btf_encoders__first(struct list_head *encoders);
struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
pahole: Use per-thread btf instances to avoid mutex locking

Create an instance of btf for each worker thread, and add type info to
the local btf instance in the steal-function of pahole without mutex
acquiring. Once finished with all worker threads, merge all per-thread
btf instances to the primary btf instance.

Committer testing:

Results with no multithreading, and without further DWARF loading
improvements (not loading things that won't be converted to BTF, etc),
i.e. using pahole 1.21:

  # perf stat -r5 pahole --btf_encode /tmp/vmlinux ; btfdiff /tmp/vmlinux

   Performance counter stats for 'pahole --btf_encode /tmp/vmlinux' (5 runs):

          6,317.41 msec task-clock                # 0.985 CPUs utilized            ( +-  1.07% )
                80      context-switches          # 12.478 /sec                    ( +- 15.25% )
                 1      cpu-migrations            # 0.156 /sec                     ( +-111.36% )
           535,890      page-faults               # 83.585 K/sec                   ( +-  0.00% )
    29,789,308,790      cycles                    # 4.646 GHz                      ( +-  0.46% )  (83.33%)
        97,696,165      stalled-cycles-frontend   # 0.33% frontend cycles idle     ( +-  4.05% )  (83.34%)
       145,554,652      stalled-cycles-backend    # 0.49% backend cycles idle      ( +- 21.53% )  (83.33%)
    78,215,192,264      instructions              # 2.61 insn per cycle  # 0.00 stalled cycles per insn  ( +-  0.05% )  (83.33%)
    18,141,376,637      branches                  # 2.830 G/sec                    ( +-  0.06% )  (83.33%)
       148,826,657      branch-misses             # 0.82% of all branches          ( +-  0.65% )  (83.34%)

            6.4129 +- 0.0682 seconds time elapsed  ( +- 1.06% )

  #

Now with pahole 1.23, with just parallel DWARF loading + trimmed DWARF
loading (skipping DWARF tags that won't be converted to BTF, etc):

  $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

         10,828.98 msec task-clock:u              # 3.539 CPUs utilized            ( +-  0.94% )
                 0      context-switches:u        # 0.000 /sec
                 0      cpu-migrations:u          # 0.000 /sec
           105,407      page-faults:u             # 9.895 K/sec                    ( +-  0.15% )
    24,774,029,571      cycles:u                  # 2.326 GHz                      ( +-  0.50% )  (83.49%)
        76,895,232      stalled-cycles-frontend:u # 0.31% frontend cycles idle     ( +-  4.84% )  (83.50%)
        24,821,768      stalled-cycles-backend:u  # 0.10% backend cycles idle      ( +-  3.66% )  (83.11%)
    69,891,360,588      instructions:u            # 2.83 insn per cycle  # 0.00 stalled cycles per insn  ( +-  0.10% )  (83.20%)
    16,966,456,889      branches:u                # 1.593 G/sec                    ( +-  0.21% )  (83.41%)
       131,923,443      branch-misses:u           # 0.78% of all branches          ( +-  0.82% )  (83.42%)

            3.0600 +- 0.0140 seconds time elapsed  ( +- 0.46% )

  $

It is a bit better not to use -j to use all the CPU threads in the
machine, i.e. using just the number of non-hyperthreading cores, in this
machine, a Ryzen 5950x, 16 cores:

  $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

         10,075.46 msec task-clock:u              # 3.431 CPUs utilized            ( +-  0.49% )
                 0      context-switches:u        # 0.000 /sec
                 0      cpu-migrations:u          # 0.000 /sec
            90,777      page-faults:u             # 8.983 K/sec                    ( +-  0.16% )
    22,611,016,624      cycles:u                  # 2.237 GHz                      ( +-  0.93% )  (83.34%)
        55,760,536      stalled-cycles-frontend:u # 0.24% frontend cycles idle     ( +-  2.35% )  (83.25%)
        15,985,651      stalled-cycles-backend:u  # 0.07% backend cycles idle      ( +-  8.79% )  (83.33%)
    68,976,319,497      instructions:u            # 2.96 insn per cycle  # 0.00 stalled cycles per insn  ( +-  0.34% )  (83.39%)
    16,770,540,533      branches:u                # 1.659 G/sec                    ( +-  0.31% )  (83.35%)
       128,220,385      branch-misses:u           # 0.76% of all branches          ( +-  0.77% )  (83.37%)

            2.9365 +- 0.0284 seconds time elapsed  ( +- 0.97% )

  $

Then with parallel DWARF loading + parallel BTF encoding (this patch):

  $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

         11,063.29 msec task-clock:u              # 6.389 CPUs utilized            ( +-  0.79% )
                 0      context-switches:u        # 0.000 /sec
                 0      cpu-migrations:u          # 0.000 /sec
           163,263      page-faults:u             # 14.840 K/sec                   ( +-  0.48% )
    41,892,887,608      cycles:u                  # 3.808 GHz                      ( +-  0.96% )  (83.41%)
       197,163,158      stalled-cycles-frontend:u # 0.47% frontend cycles idle     ( +-  3.23% )  (83.46%)
       114,187,423      stalled-cycles-backend:u  # 0.27% backend cycles idle      ( +- 16.57% )  (83.43%)
    74,053,722,204      instructions:u            # 1.78 insn per cycle  # 0.00 stalled cycles per insn  ( +-  0.18% )  (83.37%)
    17,848,238,467      branches:u                # 1.622 G/sec                    ( +-  0.10% )  (83.27%)
       180,232,427      branch-misses:u           # 1.01% of all branches          ( +-  0.86% )  (83.16%)

            1.7316 +- 0.0301 seconds time elapsed  ( +- 1.74% )

  $

Again it is better not to use -j to use all the CPU threads:

  $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

          6,626.33 msec task-clock:u              # 4.421 CPUs utilized            ( +-  0.82% )
                 0      context-switches:u        # 0.000 /sec
                 0      cpu-migrations:u          # 0.000 /sec
           140,919      page-faults:u             # 21.240 K/sec                   ( +-  1.03% )
    26,085,701,848      cycles:u                  # 3.932 GHz                      ( +-  1.20% )  (83.38%)
        98,962,246      stalled-cycles-frontend:u # 0.37% frontend cycles idle     ( +-  3.47% )  (83.41%)
       102,762,088      stalled-cycles-backend:u  # 0.39% backend cycles idle      ( +- 17.95% )  (83.38%)
    71,193,141,569      instructions:u            # 2.69 insn per cycle  # 0.00 stalled cycles per insn  ( +-  0.14% )  (83.33%)
    17,166,459,728      branches:u                # 2.587 G/sec                    ( +-  0.15% )  (83.27%)
       150,984,525      branch-misses:u           # 0.87% of all branches          ( +-  0.61% )  (83.34%)

            1.4989 +- 0.0113 seconds time elapsed  ( +- 0.76% )

  $

Minor tweaks to reduce the patch size, things like avoiding moving the
pthread_mutex_lock(&btf_lock) to after a comment, etc.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Link: https://lore.kernel.org/r/20220126192039.2840752-4-kuifeng@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2022-01-26 20:20:38 +01:00
struct btf *btf_encoder__btf(struct btf_encoder *encoder);
int btf_encoder__add_encoder(struct btf_encoder *encoder, struct btf_encoder *other);
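The per-thread pattern from the commit message above could look roughly
like the sketch below once the worker threads have finished. It is a
hedged illustration, not pahole's collector code: the
merge_thread_encoders_sketch() helper and the assumption that the first
encoder on the list serves as the primary one are made up for the example;
the list is assumed to have been populated by the workers with
btf_encoders__add(), and only the btf_encoder*/btf_encoders* calls come
from this header.

/* Sketch: fold every per-thread encoder into the primary one, then emit.
 * Assumes the workers registered their encoders on the list with
 * btf_encoders__add() and that the first entry acts as the primary. */
#include "btf_encoder.h"

int merge_thread_encoders_sketch(struct list_head *encoders)
{
	struct btf_encoder *primary = btf_encoders__first(encoders);
	struct btf_encoder *other;

	if (primary == NULL)
		return -1;

	/* Merge each remaining per-thread BTF into the primary instance. */
	for (other = btf_encoders__next(primary); other != NULL;
	     other = btf_encoders__next(other)) {
		int err = btf_encoder__add_encoder(primary, other);

		if (err < 0)
			return err;
	}

	/* Finalize the merged BTF (dedup + write in pahole's flow). */
	return btf_encoder__encode(primary);
}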
#endif /* _BTF_ENCODER_H_ */