From fb99cad539e58638d4259b671bcb7146785bc36e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 1 Jul 2021 21:35:10 -0300
Subject: [PATCH] dwarf_loader: Parallel DWARF loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tested so far with a typical Linux kernel vmlinux file.

Testing it:

⬢[acme@toolbox pahole]$ perf stat -r5 pahole -F dwarf vmlinux > /dev/null

 Performance counter stats for 'pahole -F dwarf vmlinux' (5 runs):

          5,675.97 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.36% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
           736,865      page-faults:u             #  129.898 K/sec                    ( +-  0.00% )
    21,921,617,854      cycles:u                  #    3.864 GHz                      ( +-  0.23% )  (83.34%)
       206,308,275      stalled-cycles-frontend:u #    0.95% frontend cycles idle     ( +-  4.59% )  (83.33%)
     2,186,772,169      stalled-cycles-backend:u  #   10.02% backend cycles idle      ( +-  0.46% )  (83.33%)
    62,272,507,248      instructions:u            #    2.85  insn per cycle
                                                  #    0.03  stalled cycles per insn  ( +-  0.03% )  (83.34%)
    14,967,758,961      branches:u                #    2.639 G/sec                    ( +-  0.03% )  (83.33%)
        65,688,710      branch-misses:u           #    0.44% of all branches          ( +-  0.29% )  (83.33%)

            5.6750 +- 0.0203 seconds time elapsed  ( +-  0.36% )

⬢[acme@toolbox pahole]$ perf stat -r5 pahole -F dwarf -j12 vmlinux > /dev/null

 Performance counter stats for 'pahole -F dwarf -j12 vmlinux' (5 runs):

         18,015.77 msec task-clock:u              #    7.669 CPUs utilized            ( +-  2.49% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
           739,157      page-faults:u             #   40.726 K/sec                    ( +-  0.01% )
    26,673,502,570      cycles:u                  #    1.470 GHz                      ( +-  0.44% )  (83.12%)
       734,106,744      stalled-cycles-frontend:u #    2.80% frontend cycles idle     ( +-  2.30% )  (83.65%)
     2,258,159,917      stalled-cycles-backend:u  #    8.60% backend cycles idle      ( +-  1.51% )  (83.62%)
    63,347,827,742      instructions:u            #    2.41  insn per cycle
                                                  #    0.04  stalled cycles per insn  ( +-  0.03% )  (83.32%)
    15,242,840,672      branches:u                #  839.841 M/sec                    ( +-  0.03% )  (83.22%)
        73,860,851      branch-misses:u           #    0.48% of all branches          ( +-  0.51% )  (83.09%)

             2.349 +- 0.116 seconds time elapsed  ( +-  4.93% )

⬢[acme@toolbox pahole]$

Since this is now done in 12 threads and pahole prints each CU as it finishes
processing it, the output is no longer deterministic across runs.

I'll add a mode where one can ask for the structures to be kept in a data
structure and sorted before printing, so that btfdiff can use -j and continue
working.
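A rough sketch of what that mode could look like, just to illustrate the idea
(illustrative only: none of these names exist in pahole, and the real thing
would be fed by the per-CU stealer callbacks rather than a static table):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct size_entry {
	char	 name[128];	/* struct name */
	size_t	 size;		/* its byte size */
	unsigned nr_holes;	/* number of alignment holes */
};

static int size_entry__cmp(const void *a, const void *b)
{
	return strcmp(((const struct size_entry *)a)->name,
		      ((const struct size_entry *)b)->name);
}

int main(void)
{
	/* Filled in whatever order the worker threads finish. */
	struct size_entry entries[] = {
		{ "list_head",	   16, 0 },
		{ "hlist_head",	    8, 0 },
		{ "callback_head", 16, 0 },
	};
	size_t i, n = sizeof(entries) / sizeof(entries[0]);

	/* Sort once all CUs were processed, then print deterministically. */
	qsort(entries, n, sizeof(entries[0]), size_entry__cmp);

	for (i = 0; i < n; ++i)
		printf("%s\t%zu\t%u\n", entries[i].name,
		       entries[i].size, entries[i].nr_holes);

	return 0;
}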
Also, since pahole prints the first struct it finds with a given name, and
there are multiple structs with the same name in the kernel, we get
differences even when we ask just for the sizes (so that we get just one line
per struct):

⬢[acme@toolbox pahole]$ pahole -F dwarf --sizes vmlinux > /tmp/pahole--sizes.txt
⬢[acme@toolbox pahole]$ pahole -F dwarf -j12 --sizes vmlinux > /tmp/pahole--sizes-j12.txt
⬢[acme@toolbox pahole]$ diff -u /tmp/pahole--sizes.txt /tmp/pahole--sizes-j12.txt | head
--- /tmp/pahole--sizes.txt	2021-07-01 21:56:49.260958678 -0300
+++ /tmp/pahole--sizes-j12.txt	2021-07-01 21:57:00.322209241 -0300
@@ -1,20 +1,9 @@
-list_head	16	0
-hlist_head	8	0
-hlist_node	16	0
-callback_head	16	0
-file_system_type	72	1
-qspinlock	4	0
-qrwlock	8	0
⬢[acme@toolbox pahole]$

We can't compare the outputs that way, so let's sort both and try again:

⬢[acme@toolbox pahole]$ sort /tmp/pahole--sizes.txt > /tmp/pahole--sizes.txt.sorted
⬢[acme@toolbox pahole]$ sort /tmp/pahole--sizes-j12.txt > /tmp/pahole--sizes-j12.txt.sorted
⬢[acme@toolbox pahole]$ diff -u /tmp/pahole--sizes.txt.sorted /tmp/pahole--sizes-j12.txt.sorted
--- /tmp/pahole--sizes.txt.sorted	2021-07-01 21:57:13.841515467 -0300
+++ /tmp/pahole--sizes-j12.txt.sorted	2021-07-01 21:57:16.771581840 -0300
@@ -1116,7 +1116,7 @@
 child_latency_info	48	1
 chipset	32	1
 chksum_ctx	4	0
-chksum_desc_ctx	4	0
+chksum_desc_ctx	2	0
 cipher_alg	32	0
 cipher_context	16	0
 cipher_test_sglists	1184	0
@@ -1589,7 +1589,7 @@
 ddebug_query	40	0
 ddebug_table	40	1
 deadline_data	120	1
-debug_buffer	72	0
+debug_buffer	64	0
 debugfs_blob_wrapper	16	0
 debugfs_devm_entry	16	0
 debugfs_fsdata	48	1
@@ -3291,7 +3291,7 @@
 integrity_sysfs_entry	32	0
 intel_agp_driver_description	24	1
 intel_community	96	1
-intel_community_context	68	0
+intel_community_context	16	0
 intel_early_ops	16	0
 intel_excl_cntrs	536	0
 intel_excl_states	260	0
@@ -3619,7 +3619,7 @@
 irqtime	24	0
 irq_work	24	0
 ir_table	16	0
-irte	4	0
+irte	16	0
 irte_ga	16	0
 irte_ga_hi	8	0
 irte_ga_lo	8	0
@@ -4909,7 +4909,7 @@
 pci_platform_pm_ops	64	0
 pci_pme_device	24	0
 pci_raw_ops	16	0
-pci_root_info	104	0
+pci_root_info	120	1
 pci_root_res	80	0
 pci_saved_state	64	0
 pciserial_board	24	0
@@ -5132,10 +5132,10 @@
 pmc_clk	24	0
 pmc_clk_data	24	0
 pmc_data	16	0
-pmc_dev	144	4
+pmc_dev	40	1
 pm_clk_notifier_block	32	0
 pm_clock_entry	40	0
-pmc_reg_map	136	0
+pmc_reg_map	40	0
 pmic_table	12	0
 pm_message	4	0
 pm_nl_pernet	80	1
@@ -6388,7 +6388,7 @@
 sw842_hlist_node2	24	0
 sw842_hlist_node4	24	0
 sw842_hlist_node8	32	0
-sw842_param	59496	2
+sw842_param	48	1
 swait_queue	24	0
 swait_queue_head	24	1
 swap_cgroup	2	0
@@ -7942,7 +7942,7 @@
 uprobe_trace_entry_head	8	0
 uprobe_xol_ops	32	0
 urb	184	0
-urb_priv	32	1
+urb_priv	8	0
 usb2_lpm_parameters	8	0
 usb3_lpm_parameters	16	0
 usb_anchor	56	0
⬢[acme@toolbox pahole]$

I'll check these one by one, but they look legit.

Now to fiddle with thread affinities, and then move on to threaded BTF
encoding, which in a first test with a single btf_lock in the pahole stealer
ended up producing corrupt BTF, valid only up to a point.
Signed-off-by: Arnaldo Carvalho de Melo
---
 dwarf_loader.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/dwarf_loader.c b/dwarf_loader.c
index 41f7643..ba3320c 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <pthread.h>
 #include
 #include
 #include
@@ -2647,8 +2648,77 @@ static int dwarf_cus__create_and_process_cu(struct dwarf_cus *dcus, Dwarf_Die *c
 
 	return DWARF_CB_OK;
 }
+
+static int dwarf_cus__nextcu(struct dwarf_cus *dcus, Dwarf_Die *die_mem, Dwarf_Die **cu_die, uint8_t *pointer_size, uint8_t *offset_size)
+{
+	Dwarf_Off noff;
+	size_t cuhl;
+	int ret;
 
-static int dwarf_cus__process_cus(struct dwarf_cus *dcus)
+	cus__lock(dcus->cus);
+
+	if (dcus->error) {
+		ret = dcus->error;
+		goto out_unlock;
+	}
+
+	ret = dwarf_nextcu(dcus->dw, dcus->off, &noff, &cuhl, NULL, pointer_size, offset_size);
+	if (ret == 0) {
+		*cu_die = dwarf_offdie(dcus->dw, dcus->off + cuhl, die_mem);
+		if (*cu_die != NULL)
+			dcus->off = noff;
+	}
+out_unlock:
+	cus__unlock(dcus->cus);
+
+	return ret;
+}
+
+static void *dwarf_cus__process_cu_thread(void *arg)
+{
+	struct dwarf_cus *dcus = arg;
+	uint8_t pointer_size, offset_size;
+	Dwarf_Die die_mem, *cu_die;
+
+	while (dwarf_cus__nextcu(dcus, &die_mem, &cu_die, &pointer_size, &offset_size) == 0) {
+		if (cu_die == NULL)
+			break;
+
+		if (dwarf_cus__create_and_process_cu(dcus, cu_die, pointer_size) == DWARF_CB_ABORT)
+			goto out_abort;
+	}
+
+	return (void *)DWARF_CB_OK;
+out_abort:
+	return (void *)DWARF_CB_ABORT;
+}
+
+static int dwarf_cus__threaded_process_cus(struct dwarf_cus *dcus)
+{
+	pthread_t threads[dcus->conf->nr_jobs];
+	int i;
+
+	for (i = 0; i < dcus->conf->nr_jobs; ++i) {
+		dcus->error = pthread_create(&threads[i], NULL, dwarf_cus__process_cu_thread, dcus);
+		if (dcus->error)
+			goto out_join;
+	}
+
+	dcus->error = 0;
+
+out_join:
+	while (--i >= 0) {
+		void *res;
+		int err = pthread_join(threads[i], &res);
+
+		if (err == 0 && res != NULL)
+			dcus->error = (long)res;
+	}
+
+	return dcus->error;
+}
+
+static int __dwarf_cus__process_cus(struct dwarf_cus *dcus)
 {
 	uint8_t pointer_size, offset_size;
 	Dwarf_Off noff;
@@ -2670,6 +2740,14 @@ static int dwarf_cus__process_cus(struct dwarf_cus *dcus)
 	return 0;
 }
 
+static int dwarf_cus__process_cus(struct dwarf_cus *dcus)
+{
+	if (dcus->conf->nr_jobs > 1)
+		return dwarf_cus__threaded_process_cus(dcus);
+
+	return __dwarf_cus__process_cus(dcus);
+}
+
 static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
 				     Dwfl_Module *mod, Dwarf *dw, Elf *elf,
 				     const char *filename,
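For reference, the pattern the patch uses can be boiled down to the standalone
sketch below. It is illustrative only, not pahole code, and all names in it
are made up: the only serialized step is handing out the next CU under a lock
(dwarf_cus__nextcu() in the patch), the expensive per-CU work runs in
parallel, and a worker's non-NULL return value is turned into dcus->error at
join time. Build with 'cc -pthread' to play with it.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_JOBS 4
#define NR_CUS	32

struct cu_cursor {
	pthread_mutex_t lock;
	int		next;	/* next CU to hand out */
	int		error;	/* first error seen, mirrors dcus->error */
};

/* Counterpart of dwarf_cus__nextcu(): only this part holds the lock. */
static int cursor__next(struct cu_cursor *c)
{
	int cu = -1;

	pthread_mutex_lock(&c->lock);
	if (!c->error && c->next < NR_CUS)
		cu = c->next++;
	pthread_mutex_unlock(&c->lock);

	return cu;
}

/* Counterpart of dwarf_cus__process_cu_thread(). */
static void *worker(void *arg)
{
	struct cu_cursor *c = arg;
	int cu;

	while ((cu = cursor__next(c)) >= 0) {
		/* Stand-in for dwarf_cus__create_and_process_cu(). */
		printf("processing CU %d\n", cu);
	}

	return NULL;	/* a non-NULL return would signal an abort */
}

int main(void)
{
	struct cu_cursor c = { .lock = PTHREAD_MUTEX_INITIALIZER };
	pthread_t threads[NR_JOBS];
	int i;

	for (i = 0; i < NR_JOBS; ++i)
		if (pthread_create(&threads[i], NULL, worker, &c))
			exit(EXIT_FAILURE);

	for (i = 0; i < NR_JOBS; ++i)
		pthread_join(threads[i], NULL);

	return c.error ? EXIT_FAILURE : EXIT_SUCCESS;
}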