2e719cca66
Right now, to encode per-CPU variables in BTF, pahole iterates the complete vmlinux symbol table for each CU. There are 2500 CUs for a typical kernel image, so to encode 287 per-CPU variables pahole spends more than 10% of its CPU budget, which is incredibly wasteful.

This patch revamps how this is done. The symbol table is now pre-processed once, before any per-CU processing starts, recording each per-CPU variable symbol along with its address, size, and name. Then, while processing each CU, a binary search correlates each DWARF variable with the per-CPU symbols to figure out whether the variable belongs to the per-CPU data section. If a match is found, BTF_KIND_VAR is emitted and a var_secinfo is recorded, just like before. At the very end, after all CUs are processed, BTF_KIND_DATASEC is emitted with the sorted variables.

This change makes the per-CPU variable generation overhead negligible and wins back about 10% of CPU usage.

Performance counter stats for './pahole -J /home/andriin/linux-build/default/vmlinux':

BEFORE:

     19.160149000 seconds user
      1.304873000 seconds sys

        24,114.05 msec task-clock            #    0.999 CPUs utilized
               83      context-switches      #    0.003 K/sec
                0      cpu-migrations        #    0.000 K/sec
          622,417      page-faults           #    0.026 M/sec
   72,897,315,125      cycles                #    3.023 GHz                     (25.02%)
  127,807,316,959      instructions          #    1.75  insn per cycle          (25.01%)
   29,087,179,117      branches              # 1206.234 M/sec                   (25.01%)
      464,105,921      branch-misses         #    1.60% of all branches         (25.01%)
   30,252,119,368      L1-dcache-loads       # 1254.543 M/sec                   (25.01%)
    1,156,336,207      L1-dcache-load-misses #    3.82% of all L1-dcache hits   (25.05%)
      343,373,503      LLC-loads             #   14.240 M/sec                   (25.02%)
       12,044,977      LLC-load-misses       #    3.51% of all LL-cache hits    (25.01%)

     24.136198321 seconds time elapsed

     22.729693000 seconds user
      1.384859000 seconds sys

AFTER:

     16.781455000 seconds user
      1.343956000 seconds sys

        23,398.77 msec task-clock            #    1.000 CPUs utilized
               86      context-switches      #    0.004 K/sec
                0      cpu-migrations        #    0.000 K/sec
          622,420      page-faults           #    0.027 M/sec
   68,395,641,468      cycles                #    2.923 GHz                     (25.05%)
  114,241,327,034      instructions          #    1.67  insn per cycle          (25.01%)
   26,330,711,718      branches              # 1125.303 M/sec                   (25.01%)
      465,926,869      branch-misses         #    1.77% of all branches         (25.00%)
   24,662,984,772      L1-dcache-loads       # 1054.029 M/sec                   (25.00%)
    1,054,052,064      L1-dcache-load-misses #    4.27% of all L1-dcache hits   (25.00%)
      340,970,622      LLC-loads             #   14.572 M/sec                   (25.00%)
       16,032,297      LLC-load-misses       #    4.70% of all LL-cache hits    (25.03%)

     23.402259654 seconds time elapsed

     21.916437000 seconds user
      1.482826000 seconds sys

Committer testing:

  $ grep 'model name' -m1 /proc/cpuinfo
  model name      : AMD Ryzen 9 3900X 12-Core Processor
  $

Before:

  $ perf stat -r5 pahole -J vmlinux

   Performance counter stats for 'pahole -J vmlinux' (5 runs):

          9,730.28 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.54% )
                  0      context-switches:u        #    0.000 K/sec
                  0      cpu-migrations:u          #    0.000 K/sec
            353,854      page-faults:u             #    0.036 M/sec                   ( +-  0.00% )
     39,721,726,459      cycles:u                  #    4.082 GHz                     ( +-  0.07% )  (83.33%)
        626,010,654      stalled-cycles-frontend:u #    1.58% frontend cycles idle    ( +-  0.91% )  (83.33%)
      7,518,333,691      stalled-cycles-backend:u  #   18.93% backend cycles idle     ( +-  0.56% )  (83.33%)
     85,477,123,093      instructions:u            #    2.15  insn per cycle
                                                   #    0.09  stalled cycles per insn ( +-  0.02% )  (83.34%)
     19,346,085,683      branches:u                # 1988.235 M/sec                   ( +-  0.03% )  (83.34%)
        237,291,787      branch-misses:u           #    1.23% of all branches         ( +-  0.15% )  (83.33%)

             9.7465 +- 0.0524 seconds time elapsed  ( +-  0.54% )

  $

After:

  $ perf stat -r5 pahole -J vmlinux

   Performance counter stats for 'pahole -J vmlinux' (5 runs):

          8,953.80 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.09% )
                  0      context-switches:u        #    0.000 K/sec
                  0      cpu-migrations:u          #    0.000 K/sec
            353,855      page-faults:u             #    0.040 M/sec                   ( +-  0.00% )
     35,775,730,539      cycles:u                  #    3.996 GHz                     ( +-  0.07% )  (83.33%)
        579,534,836      stalled-cycles-frontend:u #    1.62% frontend cycles idle    ( +-  2.21% )  (83.33%)
      5,719,840,144      stalled-cycles-backend:u  #   15.99% backend cycles idle     ( +-  0.93% )  (83.33%)
     73,035,744,786      instructions:u            #    2.04  insn per cycle
                                                   #    0.08  stalled cycles per insn ( +-  0.02% )  (83.34%)
     16,798,017,844      branches:u                # 1876.077 M/sec                   ( +-  0.05% )  (83.33%)
        237,777,143      branch-misses:u           #    1.42% of all branches         ( +-  0.15% )  (83.34%)

            8.97077 +- 0.00803 seconds time elapsed  ( +-  0.09% )

  $

Indeed, about 10% shaved off, not bad.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Hao Luo <haoluo@google.com>
Cc: Oleg Rombakh <olegrom@google.com>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Cc: kernel-team@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
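The heart of the change is the one-time symbol-table scan plus a per-variable binary search. Below is a minimal sketch of that idea; the names (per_cpu_var, find_percpu_var, percpu_vars) are hypothetical and only illustrate the technique described in the commit message, not the actual pahole code:

#include <stdint.h>
#include <stdlib.h>

/* One entry per per-CPU symbol, filled in a single pass over the ELF
 * symbol table before any CU is processed (hypothetical layout). */
struct per_cpu_var {
	uint64_t    addr;  /* symbol address within .data..percpu */
	uint64_t    sz;    /* symbol size */
	const char *name;  /* symbol name */
};

static struct per_cpu_var *percpu_vars;    /* sorted by addr after the scan */
static int                 percpu_var_cnt;

static int percpu_var_cmp(const void *_a, const void *_b)
{
	const struct per_cpu_var *a = _a, *b = _b;

	if (a->addr < b->addr)
		return -1;
	return a->addr > b->addr;
}

/* Called once, right after the symbol-table scan fills percpu_vars[]. */
static void sort_percpu_vars(void)
{
	qsort(percpu_vars, percpu_var_cnt, sizeof(*percpu_vars), percpu_var_cmp);
}

/* Per-CU lookup: given the address a DWARF variable resolves to, binary
 * search the pre-sorted table instead of rescanning the symbol table. */
static struct per_cpu_var *find_percpu_var(uint64_t addr)
{
	int lo = 0, hi = percpu_var_cnt - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		struct per_cpu_var *v = &percpu_vars[mid];

		if (addr < v->addr)
			hi = mid - 1;
		else if (addr >= v->addr + v->sz)
			lo = mid + 1;
		else
			return v;  /* addr falls inside this symbol */
	}
	return NULL;  /* not a per-CPU variable */
}

With roughly 2500 CUs and only 287 per-CPU symbols, this replaces a full symbol-table walk per CU with a single scan up front plus a logarithmic lookup per DWARF variable.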
/*
  SPDX-License-Identifier: GPL-2.0-only

  Copyright (C) 2019 Facebook
 */

#ifndef _LIBBTF_H
#define _LIBBTF_H

#include "gobuffer.h"

#include <stdbool.h>
#include <stdint.h>
#include "lib/bpf/src/btf.h"

struct btf_elf {
	void		  *priv;
	Elf		  *elf;
	GElf_Ehdr	  ehdr;
	struct elf_symtab *symtab;
	struct gobuffer	  percpu_secinfo;
	char		  *filename;
	int		  in_fd;
	uint8_t		  wordsize;
	bool		  is_big_endian;
	bool		  raw_btf; // "/sys/kernel/btf/vmlinux"
	uint32_t	  percpu_shndx;
	uint64_t	  percpu_base_addr;
	struct btf	  *btf;
};

extern uint8_t btf_elf__verbose;
extern uint8_t btf_elf__force;
#define btf_elf__verbose_log(fmt, ...) { if (btf_elf__verbose) printf(fmt, __VA_ARGS__); }

#define PERCPU_SECTION ".data..percpu"

struct cu;
struct base_type;
struct ftype;

struct btf_elf *btf_elf__new(const char *filename, Elf *elf);
void btf_elf__delete(struct btf_elf *btf);

int32_t btf_elf__add_base_type(struct btf_elf *btf, const struct base_type *bt,
			       const char *name);
int32_t btf_elf__add_ref_type(struct btf_elf *btf, uint16_t kind, uint32_t type,
			      const char *name, bool kind_flag);
int btf_elf__add_member(struct btf_elf *btf, const char *name, uint32_t type,
			uint32_t bitfield_size, uint32_t bit_offset);
int32_t btf_elf__add_struct(struct btf_elf *btf, uint8_t kind, const char *name, uint32_t size);
int32_t btf_elf__add_array(struct btf_elf *btf, uint32_t type, uint32_t index_type,
			   uint32_t nelems);
int32_t btf_elf__add_enum(struct btf_elf *btf, const char *name, uint32_t size);
int btf_elf__add_enum_val(struct btf_elf *btf, const char *name, int32_t value);
int32_t btf_elf__add_func_proto(struct btf_elf *btf, struct cu *cu, struct ftype *ftype,
				uint32_t type_id_off);
int32_t btf_elf__add_var_type(struct btf_elf *btfe, uint32_t type, const char *name,
			      uint32_t linkage);
int32_t btf_elf__add_var_secinfo(struct gobuffer *buf, uint32_t type,
				 uint32_t offset, uint32_t size);
int32_t btf_elf__add_datasec_type(struct btf_elf *btfe, const char *section_name,
				  struct gobuffer *var_secinfo_buf);
int btf_elf__encode(struct btf_elf *btf, uint8_t flags);

const char *btf_elf__string(struct btf_elf *btf, uint32_t ref);
int btf_elf__load(struct btf_elf *btf);

#endif /* _LIBBTF_H */
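In terms of these declarations, the flow described in the commit message might look roughly like the sketch below. The helper names, the "libbtf.h" filename, and the parameters (var_type_id, name, linkage, offset, size) are assumptions standing in for values the real encoder derives from DWARF and the pre-sorted symbol table; this is not the actual btf_encoder code.

#include <stdint.h>

#include "libbtf.h"	/* assumed filename for the header above */

/* While processing a CU: a DWARF variable matched a symbol in the
 * pre-sorted per-CPU table, so emit BTF_KIND_VAR and record its
 * var_secinfo for later. 'linkage' would typically be the UAPI
 * BTF_VAR_GLOBAL_ALLOCATED constant. */
static int encode_percpu_var(struct btf_elf *btfe, uint32_t var_type_id,
			     const char *name, uint32_t linkage,
			     uint32_t offset, uint32_t size)
{
	int32_t id = btf_elf__add_var_type(btfe, var_type_id, name, linkage);

	if (id < 0)
		return id;

	return btf_elf__add_var_secinfo(&btfe->percpu_secinfo, id, offset, size);
}

/* After every CU has been processed: a single BTF_KIND_DATASEC ties the
 * accumulated var_secinfo entries to ".data..percpu". */
static int32_t encode_percpu_datasec(struct btf_elf *btfe)
{
	return btf_elf__add_datasec_type(btfe, PERCPU_SECTION,
					 &btfe->percpu_secinfo);
}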