pahole: Use per-thread btf instances to avoid mutex locking

Create a btf instance for each worker thread, and have pahole's steal
function add type info to that thread-local btf instance without
acquiring a mutex.  Once all worker threads have finished, merge the
per-thread btf instances into the primary btf instance.

Committer testing:

Results with no multithreading, and without further DWARF loading
improvements (not loading things that won't be converted to BTF, etc),
i.e. using pahole 1.21:

  # perf stat -r5 pahole --btf_encode /tmp/vmlinux ; btfdiff /tmp/vmlinux

   Performance counter stats for 'pahole --btf_encode /tmp/vmlinux' (5 runs):

            6,317.41 msec task-clock                #    0.985 CPUs utilized            ( +-  1.07% )
                  80      context-switches          #   12.478 /sec                     ( +- 15.25% )
                   1      cpu-migrations            #    0.156 /sec                     ( +-111.36% )
             535,890      page-faults               #   83.585 K/sec                    ( +-  0.00% )
      29,789,308,790      cycles                    #    4.646 GHz                      ( +-  0.46% )  (83.33%)
          97,696,165      stalled-cycles-frontend   #    0.33% frontend cycles idle     ( +-  4.05% )  (83.34%)
         145,554,652      stalled-cycles-backend    #    0.49% backend cycles idle      ( +- 21.53% )  (83.33%)
      78,215,192,264      instructions              #    2.61  insn per cycle
                                                    #    0.00  stalled cycles per insn  ( +-  0.05% )  (83.33%)
      18,141,376,637      branches                  #    2.830 G/sec                    ( +-  0.06% )  (83.33%)
         148,826,657      branch-misses             #    0.82% of all branches          ( +-  0.65% )  (83.34%)

              6.4129 +- 0.0682 seconds time elapsed  ( +-  1.06% )

  #

Now with pahole 1.23, with just parallel DWARF loading + trimmed DWARF
loading (skipping DWARF tags that won't be converted to BTF, etc):

  $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

           10,828.98 msec task-clock:u              #    3.539 CPUs utilized            ( +-  0.94% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
             105,407      page-faults:u             #    9.895 K/sec                    ( +-  0.15% )
      24,774,029,571      cycles:u                  #    2.326 GHz                      ( +-  0.50% )  (83.49%)
          76,895,232      stalled-cycles-frontend:u #    0.31% frontend cycles idle     ( +-  4.84% )  (83.50%)
          24,821,768      stalled-cycles-backend:u  #    0.10% backend cycles idle      ( +-  3.66% )  (83.11%)
      69,891,360,588      instructions:u            #    2.83  insn per cycle
                                                    #    0.00  stalled cycles per insn  ( +-  0.10% )  (83.20%)
      16,966,456,889      branches:u                #    1.593 G/sec                    ( +-  0.21% )  (83.41%)
         131,923,443      branch-misses:u           #    0.78% of all branches          ( +-  0.82% )  (83.42%)

              3.0600 +- 0.0140 seconds time elapsed  ( +-  0.46% )

  $

It is a bit better not to use a bare -j, which uses all the CPU threads
in the machine, and instead to use just the number of non-hyperthreading
cores; on this machine, a Ryzen 5950x, that is 16 cores:

  $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

           10,075.46 msec task-clock:u              #    3.431 CPUs utilized            ( +-  0.49% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
              90,777      page-faults:u             #    8.983 K/sec                    ( +-  0.16% )
      22,611,016,624      cycles:u                  #    2.237 GHz                      ( +-  0.93% )  (83.34%)
          55,760,536      stalled-cycles-frontend:u #    0.24% frontend cycles idle     ( +-  2.35% )  (83.25%)
          15,985,651      stalled-cycles-backend:u  #    0.07% backend cycles idle      ( +-  8.79% )  (83.33%)
      68,976,319,497      instructions:u            #    2.96  insn per cycle
                                                    #    0.00  stalled cycles per insn  ( +-  0.34% )  (83.39%)
      16,770,540,533      branches:u                #    1.659 G/sec                    ( +-  0.31% )  (83.35%)
         128,220,385      branch-misses:u           #    0.76% of all branches          ( +-  0.77% )  (83.37%)

              2.9365 +- 0.0284 seconds time elapsed  ( +-  0.97% )

  $

Then with parallel DWARF loading + parallel BTF encoding (this patch):

  $ perf stat -r5 pahole -j --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j --btf_encode /tmp/vmlinux' (5 runs):

           11,063.29 msec task-clock:u              #    6.389 CPUs utilized            ( +-  0.79% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
             163,263      page-faults:u             #   14.840 K/sec                    ( +-  0.48% )
      41,892,887,608      cycles:u                  #    3.808 GHz                      ( +-  0.96% )  (83.41%)
         197,163,158      stalled-cycles-frontend:u #    0.47% frontend cycles idle     ( +-  3.23% )  (83.46%)
         114,187,423      stalled-cycles-backend:u  #    0.27% backend cycles idle      ( +- 16.57% )  (83.43%)
      74,053,722,204      instructions:u            #    1.78  insn per cycle
                                                    #    0.00  stalled cycles per insn  ( +-  0.18% )  (83.37%)
      17,848,238,467      branches:u                #    1.622 G/sec                    ( +-  0.10% )  (83.27%)
         180,232,427      branch-misses:u           #    1.01% of all branches          ( +-  0.86% )  (83.16%)

              1.7316 +- 0.0301 seconds time elapsed  ( +-  1.74% )

  $

Again it is better not to use -j to use all the CPU threads:

  $ perf stat -r5 pahole -j16 --btf_encode /tmp/vmlinux

   Performance counter stats for 'pahole -j16 --btf_encode /tmp/vmlinux' (5 runs):

            6,626.33 msec task-clock:u              #    4.421 CPUs utilized            ( +-  0.82% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
             140,919      page-faults:u             #   21.240 K/sec                    ( +-  1.03% )
      26,085,701,848      cycles:u                  #    3.932 GHz                      ( +-  1.20% )  (83.38%)
          98,962,246      stalled-cycles-frontend:u #    0.37% frontend cycles idle     ( +-  3.47% )  (83.41%)
         102,762,088      stalled-cycles-backend:u  #    0.39% backend cycles idle      ( +- 17.95% )  (83.38%)
      71,193,141,569      instructions:u            #    2.69  insn per cycle
                                                    #    0.00  stalled cycles per insn  ( +-  0.14% )  (83.33%)
      17,166,459,728      branches:u                #    2.587 G/sec                    ( +-  0.15% )  (83.27%)
         150,984,525      branch-misses:u           #    0.87% of all branches          ( +-  0.61% )  (83.34%)

              1.4989 +- 0.0113 seconds time elapsed  ( +-  0.76% )

  $

Minor tweaks to reduce the patch size, things like avoiding moving the
pthread_mutex_lock(&btf_lock) to after a comment, etc.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Link: https://lore.kernel.org/r/20220126192039.2840752-4-kuifeng@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Kui-Feng Lee 2022-01-26 11:20:38 -08:00 committed by Arnaldo Carvalho de Melo
parent 96d2c5c323
commit 2135275318
3 changed files with 119 additions and 5 deletions

View File

@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
out:
return err;
}
/*
 * btf_encoder__btf - accessor for the BTF instance held by an encoder.
 * @encoder: encoder to query; it is dereferenced unconditionally, so it
 *           must not be NULL.
 *
 * Returns the encoder's ->btf member.
 */
struct btf *btf_encoder__btf(struct btf_encoder *encoder)
{
	struct btf *btf = encoder->btf;

	return btf;
}

View File

@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
struct btf *btf_encoder__btf(struct btf_encoder *encoder);
#endif /* _BTF_ENCODER_H_ */

117
pahole.c
View File

@ -2798,6 +2798,72 @@ out:
static struct type_instance *header;
/*
 * Per-worker-thread state, handed to the loader callbacks via thr_data.
 * Instances are zero-allocated by pahole_threads_prepare(), so both
 * members start out as NULL.
 */
struct thread_data {
/* BTF instance of ->encoder, obtained with btf_encoder__btf() */
struct btf *btf;
/*
 * Encoder this thread encodes CUs with: either the primary btf_encoder
 * (for the thread that created it) or an encoder private to this thread.
 */
struct btf_encoder *encoder;
};
/*
 * pahole_threads_prepare - allocate the per-thread data for all workers.
 * @conf:       load configuration (not used here)
 * @nr_threads: number of worker threads, i.e. number of slots in @thr_data
 * @thr_data:   output array with room for @nr_threads pointers; slot i is
 *              set to point at the i-th struct thread_data
 *
 * All entries live in a single zero-initialized allocation, so the whole
 * array is released by freeing thr_data[0], which is what
 * pahole_threads_collect() does.
 *
 * Returns 0 on success, -1 if the allocation failed.
 * NOTE(review): assumes the caller of the threads_prepare hook treats a
 * non-zero return as failure -- confirm against the loader.
 */
static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
{
	/* calloc() takes (count, size); it also checks the product for overflow */
	struct thread_data *threads = calloc(nr_threads, sizeof(*threads));
	int i;

	/*
	 * Without this check a failed allocation would publish bogus
	 * "NULL + i" pointers that the workers would later dereference.
	 */
	if (threads == NULL && nr_threads > 0)
		return -1;

	for (i = 0; i < nr_threads; i++)
		thr_data[i] = threads + i;

	return 0;
}
/*
 * pahole_thread_exit - per-thread exit hook (installed as
 * conf_load.thread_exit).
 * @conf:     load configuration (not used here)
 * @thr_data: per-thread data of the exiting worker, may be NULL
 *
 * Currently a placeholder that does no work.  Always returns 0.
 */
static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
{
	if (thr_data != NULL) {
		/*
		 * btf__dedup() will be called here on this thread's data
		 * once btf__dedup() has been extended.
		 */
	}

	return 0;
}
/*
 * pahole_threads_collect - merge the per-thread BTF data and release the
 * per-thread state.
 * @conf:       load configuration (not used here)
 * @nr_threads: number of entries in @thr_data
 * @thr_data:   array of struct thread_data pointers filled in by
 *              pahole_threads_prepare()
 * @error:      non-zero to skip the merge and only clean up
 *
 * When @error is zero, the BTF of every worker that has one and is not
 * using the primary btf_encoder is appended to the primary encoder's BTF
 * with btf__add_btf(), and that worker's encoder is deleted.  In every
 * case, any remaining non-primary encoders are deleted and the
 * thread_data array is freed through thr_data[0].
 *
 * Returns 0, or the negative value from btf__add_btf() if a merge failed.
 * Note that a non-zero @error is not propagated: the function returns 0
 * in that case.
 */
static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
				  int error)
{
	struct thread_data **workers = (struct thread_data **)thr_data;
	int ret = 0;
	int idx;

	if (!error) {
		for (idx = 0; idx < nr_threads; idx++) {
			struct thread_data *worker = workers[idx];
			int added;

			/* Nothing to merge, or this worker owns the primary encoder */
			if (worker->btf == NULL || worker->encoder == btf_encoder)
				continue;

			added = btf__add_btf(btf_encoder__btf(btf_encoder), worker->btf);
			if (added < 0) {
				ret = added;
				break;
			}

			/* Merged: the worker's own encoder is no longer needed */
			btf_encoder__delete(worker->encoder);
			worker->encoder = NULL;
		}
	}

	/* Drop whatever non-primary encoders are still around */
	for (idx = 0; idx < nr_threads; idx++) {
		struct btf_encoder *leftover = workers[idx]->encoder;

		if (leftover != NULL && leftover != btf_encoder)
			btf_encoder__delete(leftover);
	}

	/* All thread_data entries share one allocation starting at slot 0 */
	free(workers[0]);

	return ret;
}
static enum load_steal_kind pahole_stealer(struct cu *cu,
struct conf_load *conf_load,
void *thr_data)
@ -2819,6 +2885,7 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
if (btf_encode) {
static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
struct btf_encoder *encoder;
pthread_mutex_lock(&btf_lock);
/*
@ -2828,21 +2895,58 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
* point we'll have cu->elf setup...
*/
if (!btf_encoder) {
/*
* btf_encoder is the primary encoder.
* It is also used as the local encoder
* of the thread that creates it.
*/
btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
btf_encode_force, btf_gen_floats, global_verbose);
if (btf_encoder == NULL) {
ret = LSK__STOP_LOADING;
goto out_btf;
if (btf_encoder && thr_data) {
struct thread_data *thread = thr_data;
thread->encoder = btf_encoder;
thread->btf = btf_encoder__btf(btf_encoder);
}
}
pthread_mutex_unlock(&btf_lock);
if (btf_encoder__encode_cu(btf_encoder, cu)) {
if (!btf_encoder) {
ret = LSK__STOP_LOADING;
goto out_btf;
}
/*
* thr_data keeps per-thread data for worker threads. Each worker thread
* has an encoder. The main thread will merge the data collected by all
* these encoders to btf_encoder. However, the first thread reaching this
* function creates btf_encoder and reuses it as its local encoder. It
* avoids copying the data collected by the first thread.
*/
if (thr_data) {
struct thread_data *thread = thr_data;
if (thread->encoder == NULL) {
thread->encoder =
btf_encoder__new(cu, detached_btf_filename,
NULL,
skip_encoding_btf_vars,
btf_encode_force,
btf_gen_floats,
global_verbose);
thread->btf = btf_encoder__btf(thread->encoder);
}
encoder = thread->encoder;
} else {
encoder = btf_encoder;
}
if (btf_encoder__encode_cu(encoder, cu)) {
fprintf(stderr, "Encountered error while encoding BTF.\n");
exit(1);
}
ret = LSK__DELETE;
out_btf:
pthread_mutex_unlock(&btf_lock);
return ret;
}
#if 0
@ -3207,6 +3311,9 @@ int main(int argc, char *argv[])
memset(tab, ' ', sizeof(tab) - 1);
conf_load.steal = pahole_stealer;
conf_load.thread_exit = pahole_thread_exit;
conf_load.threads_prepare = pahole_threads_prepare;
conf_load.threads_collect = pahole_threads_collect;
// Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
if (conf.header_type && !class_name && prettify_input) {