pahole: Use per-thread btf instances to avoid mutex locking
Create an instance of btf for each worker thread, and add type info to the local btf instance in the steal-function of pahole without mutex acquiring. Once finished with all worker threads, merge all per-thread btf instances to the primary btf instance. Committer testing: Results with no multithreading, and without further DWARF loading improvements (not loading things that won't be converted to BTF, etc), i.e. using pahole 1.21: # perf stat -r5 pahole --btf_encode /tmp/vmlinux ; btfdiff /tmp/vmlinux Performance counter stats for 'pahole --btf_encode /tmp/vmlinux' (5 runs): 6,317.41 msec task-clock # 0.985 CPUs utilized ( +- 1.07% ) 80 context-switches # 12.478 /sec ( +- 15.25% ) 1 cpu-migrations # 0.156 /sec ( +-111.36% ) 535,890 page-faults # 83.585 K/sec ( +- 0.00% ) 29,789,308,790 cycles # 4.646 GHz ( +- 0.46% ) (83.33%) 97,696,165 stalled-cycles-frontend # 0.33% frontend cycles idle ( +- 4.05% ) (83.34%) 145,554,652 stalled-cycles-backend # 0.49% backend cycles idle ( +- 21.53% ) (83.33%) 78,215,192,264 instructions # 2.61 insn per cycle # 0.00 stalled cycles per insn ( +- 0.05% ) (83.33%) 18,141,376,637 branches # 2.830 G/sec ( +- 0.06% ) (83.33%) 148,826,657 branch-misses # 0.82% of all branches ( +- 0.65% ) (83.34%) 6.4129 +- 0.0682 seconds time elapsed ( +- 1.06% ) # Now with pahole 1.23, with just parallel DWARF loading + trimmed DWARF loading (skipping DWARF tags that won't be converted to BTF, etc): $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs): 10,828.98 msec task-clock:u # 3.539 CPUs utilized ( +- 0.94% ) 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 105,407 page-faults:u # 9.895 K/sec ( +- 0.15% ) 24,774,029,571 cycles:u # 2.326 GHz ( +- 0.50% ) (83.49%) 76,895,232 stalled-cycles-frontend:u # 0.31% frontend cycles idle ( +- 4.84% ) (83.50%) 24,821,768 stalled-cycles-backend:u # 0.10% backend cycles idle ( +- 3.66% ) (83.11%) 69,891,360,588 
instructions:u # 2.83 insn per cycle # 0.00 stalled cycles per insn ( +- 0.10% ) (83.20%) 16,966,456,889 branches:u # 1.593 G/sec ( +- 0.21% ) (83.41%) 131,923,443 branch-misses:u # 0.78% of all branches ( +- 0.82% ) (83.42%) 3.0600 +- 0.0140 seconds time elapsed ( +- 0.46% ) $ It is a bit better not to use -j to use all the CPU threads in the machine, i.e. using just the number of non-hyperthreading cores, in this machine, a Ryzen 5950x, 16 cores: $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs): 10,075.46 msec task-clock:u # 3.431 CPUs utilized ( +- 0.49% ) 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 90,777 page-faults:u # 8.983 K/sec ( +- 0.16% ) 22,611,016,624 cycles:u # 2.237 GHz ( +- 0.93% ) (83.34%) 55,760,536 stalled-cycles-frontend:u # 0.24% frontend cycles idle ( +- 2.35% ) (83.25%) 15,985,651 stalled-cycles-backend:u # 0.07% backend cycles idle ( +- 8.79% ) (83.33%) 68,976,319,497 instructions:u # 2.96 insn per cycle # 0.00 stalled cycles per insn ( +- 0.34% ) (83.39%) 16,770,540,533 branches:u # 1.659 G/sec ( +- 0.31% ) (83.35%) 128,220,385 branch-misses:u # 0.76% of all branches ( +- 0.77% ) (83.37%) 2.9365 +- 0.0284 seconds time elapsed ( +- 0.97% ) $ Then with parallel DWARF loading + parallel BTF encoding (this patch): $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs): 11,063.29 msec task-clock:u # 6.389 CPUs utilized ( +- 0.79% ) 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 163,263 page-faults:u # 14.840 K/sec ( +- 0.48% ) 41,892,887,608 cycles:u # 3.808 GHz ( +- 0.96% ) (83.41%) 197,163,158 stalled-cycles-frontend:u # 0.47% frontend cycles idle ( +- 3.23% ) (83.46%) 114,187,423 stalled-cycles-backend:u # 0.27% backend cycles idle ( +- 16.57% ) (83.43%) 74,053,722,204 instructions:u # 1.78 insn per cycle # 0.00 stalled cycles per insn ( +- 0.18% 
) (83.37%) 17,848,238,467 branches:u # 1.622 G/sec ( +- 0.10% ) (83.27%) 180,232,427 branch-misses:u # 1.01% of all branches ( +- 0.86% ) (83.16%) 1.7316 +- 0.0301 seconds time elapsed ( +- 1.74% ) $ Again it is better not to use -j to use all the CPU threads: $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs): 6,626.33 msec task-clock:u # 4.421 CPUs utilized ( +- 0.82% ) 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 140,919 page-faults:u # 21.240 K/sec ( +- 1.03% ) 26,085,701,848 cycles:u # 3.932 GHz ( +- 1.20% ) (83.38%) 98,962,246 stalled-cycles-frontend:u # 0.37% frontend cycles idle ( +- 3.47% ) (83.41%) 102,762,088 stalled-cycles-backend:u # 0.39% backend cycles idle ( +- 17.95% ) (83.38%) 71,193,141,569 instructions:u # 2.69 insn per cycle # 0.00 stalled cycles per insn ( +- 0.14% ) (83.33%) 17,166,459,728 branches:u # 2.587 G/sec ( +- 0.15% ) (83.27%) 150,984,525 branch-misses:u # 0.87% of all branches ( +- 0.61% ) (83.34%) 1.4989 +- 0.0113 seconds time elapsed ( +- 0.76% ) $ Minor tweaks to reduce the patch size, things like avoiding moving the pthread_mutex_lock(&btf_lock) to after a comment, etc. Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> Acked-by: Andrii Nakryiko <andrii@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: bpf@vger.kernel.org Cc: dwarves@vger.kernel.org Link: https://lore.kernel.org/r/20220126192039.2840752-4-kuifeng@fb.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
96d2c5c323
commit
2135275318
|
@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
|
|||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Return the btf instance of @encoder, so callers (e.g. pahole's
 * threads_collect handler) can merge per-thread btf data with
 * btf__add_btf() without reaching into struct btf_encoder internals.
 */
struct btf *btf_encoder__btf(struct btf_encoder *encoder)
{
	return encoder->btf;
}
|
||||
|
|
|
@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
|
|||
|
||||
struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
|
||||
|
||||
struct btf *btf_encoder__btf(struct btf_encoder *encoder);
|
||||
|
||||
#endif /* _BTF_ENCODER_H_ */
|
||||
|
|
117
pahole.c
117
pahole.c
|
@ -2798,6 +2798,72 @@ out:
|
|||
|
||||
static struct type_instance *header;

/*
 * Per-worker-thread state: each worker encodes DWARF CUs into its own btf
 * instance so the hot path needs no mutex; the instances are merged into
 * the primary encoder afterwards (see pahole_threads_collect()).
 */
struct thread_data {
	struct btf *btf;
	struct btf_encoder *encoder;
};

/*
 * Allocate one struct thread_data per worker thread as a single contiguous,
 * zero-initialized array and hand each thread its slot via @thr_data.  The
 * whole array is released later with free(threads[0]) in
 * pahole_threads_collect().
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
{
	int i;
	/*
	 * calloc(nmemb, size): the original transposed the two arguments
	 * (harmless numerically, but wrong per the API contract) and never
	 * checked the result, leading to NULL dereferences downstream on OOM.
	 */
	struct thread_data *threads = calloc(nr_threads, sizeof(struct thread_data));

	if (threads == NULL)
		return -ENOMEM;

	for (i = 0; i < nr_threads; i++)
		thr_data[i] = threads + i;

	return 0;
}
|
||||
|
||||
/*
 * Per-thread teardown callback invoked by the DWARF loader when a worker
 * thread finishes.  Currently a no-op: merging of the per-thread btf
 * instances happens later, on the main thread, in pahole_threads_collect().
 */
static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
{
	struct thread_data *thread = thr_data;

	if (thread == NULL)
		return 0;

	/*
	 * Here we will call btf__dedup() once we extend
	 * btf__dedup().
	 */

	return 0;
}
|
||||
|
||||
/*
 * Called on the main thread after all worker threads have finished: merge
 * the btf instance of every worker into the primary btf_encoder's btf, then
 * release the per-thread encoders and the thread_data array allocated in
 * pahole_threads_prepare().  @error is non-zero if loading already failed,
 * in which case we skip merging and only clean up.
 */
static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
				  int error)
{
	struct thread_data **threads = (struct thread_data **)thr_data;
	int i;
	int err = 0;

	if (error)
		goto out;

	for (i = 0; i < nr_threads; i++) {
		/*
		 * Merge content of the btf instances of worker threads to the btf
		 * instance of the primary btf_encoder.
		 */
		if (!threads[i]->btf || threads[i]->encoder == btf_encoder)
			continue; /* The primary btf_encoder */
		err = btf__add_btf(btf_encoder__btf(btf_encoder), threads[i]->btf);
		if (err < 0)
			goto out;
		btf_encoder__delete(threads[i]->encoder);
		threads[i]->encoder = NULL;
	}
	/* btf__add_btf() returns a positive type id on success; normalize. */
	err = 0;

out:
	/* On the error path some encoders may not have been deleted above. */
	for (i = 0; i < nr_threads; i++) {
		if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
			btf_encoder__delete(threads[i]->encoder);
	}
	/*
	 * threads_prepare() allocated all thread_data slots as one contiguous
	 * array, so freeing the first slot releases the whole array.
	 */
	free(threads[0]);

	return err;
}
|
||||
|
||||
static enum load_steal_kind pahole_stealer(struct cu *cu,
|
||||
struct conf_load *conf_load,
|
||||
void *thr_data)
|
||||
|
@ -2819,6 +2885,7 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
|
|||
|
||||
if (btf_encode) {
|
||||
static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
struct btf_encoder *encoder;
|
||||
|
||||
pthread_mutex_lock(&btf_lock);
|
||||
/*
|
||||
|
@ -2828,21 +2895,58 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
|
|||
* point we'll have cu->elf setup...
|
||||
*/
|
||||
if (!btf_encoder) {
|
||||
/*
|
||||
* btf_encoder is the primary encoder.
|
||||
* And, it is used by the thread
|
||||
* create it.
|
||||
*/
|
||||
btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
|
||||
btf_encode_force, btf_gen_floats, global_verbose);
|
||||
if (btf_encoder == NULL) {
|
||||
ret = LSK__STOP_LOADING;
|
||||
goto out_btf;
|
||||
if (btf_encoder && thr_data) {
|
||||
struct thread_data *thread = thr_data;
|
||||
|
||||
thread->encoder = btf_encoder;
|
||||
thread->btf = btf_encoder__btf(btf_encoder);
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&btf_lock);
|
||||
|
||||
if (btf_encoder__encode_cu(btf_encoder, cu)) {
|
||||
if (!btf_encoder) {
|
||||
ret = LSK__STOP_LOADING;
|
||||
goto out_btf;
|
||||
}
|
||||
|
||||
/*
|
||||
* thr_data keeps per-thread data for worker threads. Each worker thread
|
||||
* has an encoder. The main thread will merge the data collected by all
|
||||
* these encoders to btf_encoder. However, the first thread reaching this
|
||||
* function creates btf_encoder and reuses it as its local encoder. It
|
||||
* avoids copying the data collected by the first thread.
|
||||
*/
|
||||
if (thr_data) {
|
||||
struct thread_data *thread = thr_data;
|
||||
|
||||
if (thread->encoder == NULL) {
|
||||
thread->encoder =
|
||||
btf_encoder__new(cu, detached_btf_filename,
|
||||
NULL,
|
||||
skip_encoding_btf_vars,
|
||||
btf_encode_force,
|
||||
btf_gen_floats,
|
||||
global_verbose);
|
||||
thread->btf = btf_encoder__btf(thread->encoder);
|
||||
}
|
||||
encoder = thread->encoder;
|
||||
} else {
|
||||
encoder = btf_encoder;
|
||||
}
|
||||
|
||||
if (btf_encoder__encode_cu(encoder, cu)) {
|
||||
fprintf(stderr, "Encountered error while encoding BTF.\n");
|
||||
exit(1);
|
||||
}
|
||||
ret = LSK__DELETE;
|
||||
out_btf:
|
||||
pthread_mutex_unlock(&btf_lock);
|
||||
return ret;
|
||||
}
|
||||
#if 0
|
||||
|
@ -3207,6 +3311,9 @@ int main(int argc, char *argv[])
|
|||
memset(tab, ' ', sizeof(tab) - 1);
|
||||
|
||||
conf_load.steal = pahole_stealer;
|
||||
conf_load.thread_exit = pahole_thread_exit;
|
||||
conf_load.threads_prepare = pahole_threads_prepare;
|
||||
conf_load.threads_collect = pahole_threads_collect;
|
||||
|
||||
// Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
|
||||
if (conf.header_type && !class_name && prettify_input) {
|
||||
|
|
Loading…
Reference in New Issue