dwarf_loader: Parallel DWARF loading

Tested so far with a typical Linux kernel vmlinux file.

Testing it:

  ⬢[acme@toolbox pahole]$ perf stat -r5 pahole -F dwarf vmlinux > /dev/null

   Performance counter stats for 'pahole -F dwarf vmlinux' (5 runs):

            5,675.97 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.36% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
             736,865      page-faults:u             #  129.898 K/sec                    ( +-  0.00% )
      21,921,617,854      cycles:u                  #    3.864 GHz                      ( +-  0.23% )  (83.34%)
         206,308,275      stalled-cycles-frontend:u #    0.95% frontend cycles idle     ( +-  4.59% )  (83.33%)
       2,186,772,169      stalled-cycles-backend:u  #   10.02% backend cycles idle      ( +-  0.46% )  (83.33%)
      62,272,507,248      instructions:u            #    2.85  insn per cycle
                                                    #    0.03  stalled cycles per insn  ( +-  0.03% )  (83.34%)
      14,967,758,961      branches:u                #    2.639 G/sec                    ( +-  0.03% )  (83.33%)
          65,688,710      branch-misses:u           #    0.44% of all branches          ( +-  0.29% )  (83.33%)

              5.6750 +- 0.0203 seconds time elapsed  ( +-  0.36% )

  ⬢[acme@toolbox pahole]$ perf stat -r5 pahole -F dwarf -j12 vmlinux > /dev/null

   Performance counter stats for 'pahole -F dwarf -j12 vmlinux' (5 runs):

           18,015.77 msec task-clock:u              #    7.669 CPUs utilized            ( +-  2.49% )
                   0      context-switches:u        #    0.000 /sec
                   0      cpu-migrations:u          #    0.000 /sec
             739,157      page-faults:u             #   40.726 K/sec                    ( +-  0.01% )
      26,673,502,570      cycles:u                  #    1.470 GHz                      ( +-  0.44% )  (83.12%)
         734,106,744      stalled-cycles-frontend:u #    2.80% frontend cycles idle     ( +-  2.30% )  (83.65%)
       2,258,159,917      stalled-cycles-backend:u  #    8.60% backend cycles idle      ( +-  1.51% )  (83.62%)
      63,347,827,742      instructions:u            #    2.41  insn per cycle
                                                    #    0.04  stalled cycles per insn  ( +-  0.03% )  (83.32%)
      15,242,840,672      branches:u                #  839.841 M/sec                    ( +-  0.03% )  (83.22%)
          73,860,851      branch-misses:u           #    0.48% of all branches          ( +-  0.51% )  (83.09%)

               2.349 +- 0.116 seconds time elapsed  ( +-  4.93% )

  ⬢[acme@toolbox pahole]$

Since this is done in 12 threads and pahole prints as it finishes
processing each CU, the output is no longer deterministic, i.e. it is
not the same for all runs.

I'll add a mode where one can ask for the structures to be kept in a
data structure and sorted before printing, so that btfdiff can use it
with -j and continue working.

Also since it prints the first struct with a given name, and there are
multiple structures with a given name in the kernel, we get differences
even when we ask just for the sizes (so that we get just one line per
struct):

  ⬢[acme@toolbox pahole]$ pahole -F dwarf --sizes vmlinux > /tmp/pahole--sizes.txt
  ⬢[acme@toolbox pahole]$ pahole -F dwarf -j12 --sizes vmlinux > /tmp/pahole--sizes-j12.txt
  ⬢[acme@toolbox pahole]$ diff -u /tmp/pahole--sizes.txt /tmp/pahole--sizes-j12.txt | head
  --- /tmp/pahole--sizes.txt	2021-07-01 21:56:49.260958678 -0300
  +++ /tmp/pahole--sizes-j12.txt	2021-07-01 21:57:00.322209241 -0300
  @@ -1,20 +1,9 @@
  -list_head	16	0
  -hlist_head	8	0
  -hlist_node	16	0
  -callback_head	16	0
  -file_system_type	72	1
  -qspinlock	4	0
  -qrwlock	8	0
  ⬢[acme@toolbox pahole]$

We can't compare them that way, so let's sort both and then try again:

  ⬢[acme@toolbox pahole]$ sort /tmp/pahole--sizes.txt > /tmp/pahole--sizes.txt.sorted
  ⬢[acme@toolbox pahole]$ sort /tmp/pahole--sizes-j12.txt > /tmp/pahole--sizes-j12.txt.sorted
  ⬢[acme@toolbox pahole]$ diff -u /tmp/pahole--sizes.txt.sorted /tmp/pahole--sizes-j12.txt.sorted
  --- /tmp/pahole--sizes.txt.sorted	2021-07-01 21:57:13.841515467 -0300
  +++ /tmp/pahole--sizes-j12.txt.sorted	2021-07-01 21:57:16.771581840 -0300
  @@ -1116,7 +1116,7 @@
   child_latency_info	48	1
   chipset	32	1
   chksum_ctx	4	0
  -chksum_desc_ctx	4	0
  +chksum_desc_ctx	2	0
   cipher_alg	32	0
   cipher_context	16	0
   cipher_test_sglists	1184	0
  @@ -1589,7 +1589,7 @@
   ddebug_query	40	0
   ddebug_table	40	1
   deadline_data	120	1
  -debug_buffer	72	0
  +debug_buffer	64	0
   debugfs_blob_wrapper	16	0
   debugfs_devm_entry	16	0
   debugfs_fsdata	48	1
  @@ -3291,7 +3291,7 @@
   integrity_sysfs_entry	32	0
   intel_agp_driver_description	24	1
   intel_community	96	1
  -intel_community_context	68	0
  +intel_community_context	16	0
   intel_early_ops	16	0
   intel_excl_cntrs	536	0
   intel_excl_states	260	0
  @@ -3619,7 +3619,7 @@
   irqtime	24	0
   irq_work	24	0
   ir_table	16	0
  -irte	4	0
  +irte	16	0
   irte_ga	16	0
   irte_ga_hi	8	0
   irte_ga_lo	8	0
  @@ -4909,7 +4909,7 @@
   pci_platform_pm_ops	64	0
   pci_pme_device	24	0
   pci_raw_ops	16	0
  -pci_root_info	104	0
  +pci_root_info	120	1
   pci_root_res	80	0
   pci_saved_state	64	0
   pciserial_board	24	0
  @@ -5132,10 +5132,10 @@
   pmc_clk	24	0
   pmc_clk_data	24	0
   pmc_data	16	0
  -pmc_dev	144	4
  +pmc_dev	40	1
   pm_clk_notifier_block	32	0
   pm_clock_entry	40	0
  -pmc_reg_map	136	0
  +pmc_reg_map	40	0
   pmic_table	12	0
   pm_message	4	0
   pm_nl_pernet	80	1
  @@ -6388,7 +6388,7 @@
   sw842_hlist_node2	24	0
   sw842_hlist_node4	24	0
   sw842_hlist_node8	32	0
  -sw842_param	59496	2
  +sw842_param	48	1
   swait_queue	24	0
   swait_queue_head	24	1
   swap_cgroup	2	0
  @@ -7942,7 +7942,7 @@
   uprobe_trace_entry_head	8	0
   uprobe_xol_ops	32	0
   urb	184	0
  -urb_priv	32	1
  +urb_priv	8	0
   usb2_lpm_parameters	8	0
   usb3_lpm_parameters	16	0
   usb_anchor	56	0
  ⬢[acme@toolbox pahole]$

I'll check one by one, but looks kinda legit.

Now to fiddle with thread affinities. And then move to threaded BTF
encoding, which in a first test with a single btf_lock in the pahole
stealer ended up producing corrupt BTF, valid just up to a point.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Arnaldo Carvalho de Melo 2021-07-01 21:35:10 -03:00
parent 75d4748861
commit fb99cad539
1 changed files with 79 additions and 1 deletions

View File

@ -12,6 +12,7 @@
#include <fcntl.h>
#include <fnmatch.h>
#include <libelf.h>
#include <pthread.h>
#include <search.h>
#include <stdio.h>
#include <stdlib.h>
@ -2647,8 +2648,77 @@ static int dwarf_cus__create_and_process_cu(struct dwarf_cus *dcus, Dwarf_Die *c
return DWARF_CB_OK;
}
/*
 * dwarf_cus__nextcu - fetch the next compile unit to be processed
 *
 * @dcus: shared iteration state (dw, off, error and the cus used for locking)
 * @die_mem: caller provided storage handed to dwarf_offdie()
 * @cu_die: set to the CU DIE found, or to NULL if dwarf_offdie() fails
 * @pointer_size: filled in by dwarf_nextcu()
 * @offset_size: filled in by dwarf_nextcu()
 *
 * Everything is done with the cus lock held, as dcus->off and dcus->error
 * are shared by all the callers.
 *
 * Returns dcus->error if it is non-zero, otherwise whatever dwarf_nextcu()
 * returned. *cu_die is only written when dwarf_nextcu() returns 0, so callers
 * must check the return value before looking at it.
 */
static int dwarf_cus__nextcu(struct dwarf_cus *dcus, Dwarf_Die *die_mem, Dwarf_Die **cu_die, uint8_t *pointer_size, uint8_t *offset_size)
{
	Dwarf_Off noff;
	size_t cuhl;
	int ret;

	cus__lock(dcus->cus);

	/* An error was already recorded: hand it out instead of more CUs */
	if (dcus->error) {
		ret = dcus->error;
		goto out_unlock;
	}

	ret = dwarf_nextcu(dcus->dw, dcus->off, &noff, &cuhl, NULL, pointer_size, offset_size);
	if (ret == 0) {
		/* The CU DIE sits right after the CU header */
		*cu_die = dwarf_offdie(dcus->dw, dcus->off + cuhl, die_mem);
		/* Only advance to the next CU if this one could be looked up */
		if (*cu_die != NULL)
			dcus->off = noff;
	}

out_unlock:
	cus__unlock(dcus->cus);
	return ret;
}
/*
 * dwarf_cus__process_cu_thread - worker loop, used as a pthread start routine
 *
 * @arg: the struct dwarf_cus shared by all the workers
 *
 * Keeps asking dwarf_cus__nextcu() for a CU and processes it, stopping when
 * dwarf_cus__nextcu() returns non-zero or hands back a NULL CU DIE.
 *
 * Returns DWARF_CB_ABORT cast to a pointer as soon as processing a CU
 * aborts, DWARF_CB_OK cast to a pointer otherwise.
 */
static void *dwarf_cus__process_cu_thread(void *arg)
{
	struct dwarf_cus *dcus = arg;
	uint8_t ptr_size, off_size;
	Dwarf_Die die_storage, *die;

	for (;;) {
		if (dwarf_cus__nextcu(dcus, &die_storage, &die, &ptr_size, &off_size) != 0)
			break;

		/* No DIE could be obtained for this CU, nothing more to do */
		if (die == NULL)
			break;

		if (dwarf_cus__create_and_process_cu(dcus, die, ptr_size) == DWARF_CB_ABORT)
			return (void *)DWARF_CB_ABORT;
	}

	return (void *)DWARF_CB_OK;
}
static int dwarf_cus__threaded_process_cus(struct dwarf_cus *dcus)
{
pthread_t threads[dcus->conf->nr_jobs];
int i;
for (i = 0; i < dcus->conf->nr_jobs; ++i) {
dcus->error = pthread_create(&threads[i], NULL, dwarf_cus__process_cu_thread, dcus);
if (dcus->error)
goto out_join;
}
dcus->error = 0;
out_join:
while (--i >= 0) {
void *res;
int err = pthread_join(threads[i], &res);
if (err == 0 && res != NULL)
dcus->error = (long)res;
}
return dcus->error;
}
static int __dwarf_cus__process_cus(struct dwarf_cus *dcus)
{
uint8_t pointer_size, offset_size;
Dwarf_Off noff;
@ -2670,6 +2740,14 @@ static int dwarf_cus__process_cus(struct dwarf_cus *dcus)
return 0;
}
/*
 * dwarf_cus__process_cus - process all CUs, serially or in parallel
 *
 * Uses the threaded variant only when more than one job was asked for via
 * dcus->conf->nr_jobs, otherwise the serial one. Returns whatever the
 * selected variant returns.
 */
static int dwarf_cus__process_cus(struct dwarf_cus *dcus)
{
	if (dcus->conf->nr_jobs <= 1)
		return __dwarf_cus__process_cus(dcus);

	return dwarf_cus__threaded_process_cus(dcus);
}
static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
Dwfl_Module *mod, Dwarf *dw, Elf *elf,
const char *filename,