dwarf_loader: Permit merging all DWARF CU's for clang LTO built binary

For vmlinux built with clang thin-LTO or LTO (Link Time Optimization),
there exist cross-CU type references. For example, this can happen:

  compile unit 1:
     tag 10:  type A
  compile unit 2:
     ...
       refer to type A (tag 10 in compile unit 1)

I only checked a few but have seen type A may be a simple type like
"unsigned char" or a complex type like an array of base types.

To resolve this issue, the DW_AT_producer attribute of the first few
DW_TAG_compile_unit DIEs is checked. If the binary is built with clang LTO,
all debuginfo DWARF CU's will be merged into one pahole CU which will
resolve the above cross-CU tag reference issue. To test whether a binary
is built with clang LTO or not, the "clang version" and "-flto" will be
checked against DW_AT_producer string for the first 5 debuginfo CU's.
The reason is that a few linux objects disabled LTO for various reasons.

Merging CU's will create a single CU with lots of types, tags and
functions. For example with clang thin-LTO built vmlinux, I saw 9M
entries in types table, 5.2M in tags table. The below are pahole
wallclock time for different hashbits:

command line: time pahole -J vmlinux

      # of hashbits            wallclock time in seconds
          15                       460
          16                       255
          17                       131
          18                       97
          19                       75
          20                       69
          21                       64
          22                       62
          23                       58
          24                       64

The problem is with hashtags__find(), esp. the loop

    uint32_t bucket = hashtags__fn(id);
    const struct hlist_head *head = hashtable + bucket;
    hlist_for_each_entry(tpos, pos, head, hash_node) {
            if (tpos->id == id)
                    return tpos;
    }

Say we have 9M types and (1 << 15) buckets, that means each bucket will
have roughly 275 elements. So each lookup will traverse the loop about 137
iterations on average.

If we have 1 << 21 buckets, then each bucket will have roughly 4 elements,
and the average number of loop iterations for hashtags__find() will be 2.

Note that the number of hashbits 24 makes performance worse than 23. The
reason could be that 23 hashbits can cover 8M buckets (close to 9M for
the number of entries in types table).  Higher number of hash bits
allocates more memory and becomes less cache efficient compared to 23
hashbits.

This patch picks 21 hashbits as the starting value and tries to allocate
memory based on that; if the allocation fails, it retries with fewer
hashbits until it reaches 15, which is the default for the non-merge-CU
case.

Committer notes:

To test this we need this patch to be applied on bpf-next/master:

  https://lore.kernel.org/bpf/20210328064121.2062927-1-yhs@fb.com/

Signed-off-by: Yonghong Song <yhs@fb.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Cc: kernel-team@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Yonghong Song 2021-03-28 13:14:15 -07:00 committed by Arnaldo Carvalho de Melo
parent 763475ca11
commit 39227909db
1 changed files with 120 additions and 0 deletions

View File

@ -51,6 +51,7 @@ struct strings *strings;
#endif
/* Number of hash bits currently in use; 15 is the starting (default) value. */
static uint32_t hashtags__bits = 15;
/* Highest number of hash bits tried first when all CUs are merged into one. */
static uint32_t max_hashtags__bits = 21;
static uint32_t hashtags__fn(Dwarf_Off key)
{
@ -2500,6 +2501,115 @@ static int cus__load_debug_types(struct cus *cus, struct conf_load *conf,
return 0;
}
/*
 * Decide whether the DWARF CUs in @dw should be merged into a single CU.
 *
 * Walks at most the first 5 compile units and inspects each one's
 * DW_AT_producer string.
 *
 * Returns true as soon as a producer string contains both "clang version"
 * and "-flto"; returns false otherwise, including when no CU DIE can be
 * obtained.
 */
static bool cus__merging_cu(Dwarf *dw)
{
	uint8_t pointer_size, offset_size;
	Dwarf_Off off = 0, noff;
	size_t cuhl;
	int cnt = 0;

	/*
	 * Just checking the first cu is not enough.
	 * In linux, some C files may have LTO disabled, e.g.,
	 *   e242db40be27  x86, vdso: disable LTO only for vDSO
	 *   d2dcd3e37475  x86, cpu: disable LTO for cpu.c
	 * Fortunately, disabling LTO for a particular file in a LTO build
	 * is rather an exception. Iterating 5 cu's to check whether
	 * LTO is used or not should be enough.
	 */
	while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
			    &offset_size) == 0) {
		Dwarf_Die die_mem;
		Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem);

		if (cu_die == NULL)
			break;

		if (++cnt > 5)
			break;

		const char *producer = attr_string(cu_die, DW_AT_producer);

		/*
		 * A CU without a usable DW_AT_producer cannot be identified
		 * as clang LTO; skip it instead of passing NULL to strstr().
		 */
		if (producer != NULL &&
		    strstr(producer, "clang version") != NULL &&
		    strstr(producer, "-flto") != NULL)
			return true;

		off = noff;
	}

	return false;
}
/*
 * Merge every DWARF CU found in @dw into one cu and process it.
 *
 * The merged cu and its dwarf_cu are created when the first CU DIE is seen;
 * the cu's language is read from that first DIE's DW_AT_language. The
 * children of every CU DIE are then passed to die__process_unit() against
 * that single cu. Once all CUs have been walked, the types are recoded and
 * the cu is handed to finalize_cu_immediately().
 *
 * Side effect: hashtags__bits is set to the number of bits for which
 * dwarf_cu__init() succeeded, trying max_hashtags__bits first and going
 * down to the value hashtags__bits had on entry.
 *
 * Returns 0 on success, or when no CU DIE was found; DWARF_CB_ABORT on
 * failure.
 */
static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
				     Dwfl_Module *mod, Dwarf *dw, Elf *elf,
				     const char *filename,
				     const unsigned char *build_id,
				     int build_id_len,
				     struct dwarf_cu *type_dcu)
{
	uint8_t pointer_size, offset_size;
	struct dwarf_cu *dcu = NULL;
	Dwarf_Off off = 0, noff;
	struct cu *cu = NULL;
	size_t cuhl;

	while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
			    &offset_size) == 0) {
		Dwarf_Die die_mem;
		Dwarf_Die *cu_die = dwarf_offdie(dw, off + cuhl, &die_mem);

		if (cu_die == NULL)
			break;

		if (cu == NULL) {
			cu = cu__new("", pointer_size, build_id, build_id_len,
				     filename);
			if (cu == NULL || cu__set_common(cu, conf, mod, elf) != 0)
				return DWARF_CB_ABORT;

			dcu = malloc(sizeof(*dcu));
			if (dcu == NULL)
				return DWARF_CB_ABORT;

			/* Merged cu tends to need a lot more memory.
			 * Let us start with max_hashtags__bits and
			 * go down to find a proper hashtag bit value.
			 */
			uint32_t default_hbits = hashtags__bits;

			for (hashtags__bits = max_hashtags__bits;
			     hashtags__bits >= default_hbits;
			     hashtags__bits--) {
				if (dwarf_cu__init(dcu) == 0)
					break;
			}

			if (hashtags__bits < default_hbits) {
				/*
				 * No size worked: put the global back to the
				 * value it had on entry instead of leaving it
				 * one below, and release the unused dcu.
				 */
				hashtags__bits = default_hbits;
				free(dcu);
				return DWARF_CB_ABORT;
			}

			dcu->cu = cu;
			dcu->type_unit = type_dcu;
			cu->priv = dcu;
			cu->dfops = &dwarf__ops;
			cu->language = attr_numeric(cu_die, DW_AT_language);
		}

		Dwarf_Die child;

		if (dwarf_child(cu_die, &child) == 0) {
			if (die__process_unit(&child, cu) != 0)
				return DWARF_CB_ABORT;
		}

		off = noff;
	}

	/*
	 * No CU DIE was found, so there is nothing to recode or finalize.
	 * Return without touching the NULL cu.
	 */
	if (cu == NULL)
		return 0;

	/* process merged cu */
	if (cu__recode_dwarf_types(cu) != LSK__KEEPIT)
		return DWARF_CB_ABORT;

	if (finalize_cu_immediately(cus, cu, dcu, conf) == LSK__STOP_LOADING)
		return DWARF_CB_ABORT;

	return 0;
}
static int cus__load_module(struct cus *cus, struct conf_load *conf,
Dwfl_Module *mod, Dwarf *dw, Elf *elf,
const char *filename)
@ -2534,6 +2644,15 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
}
}
if (cus__merging_cu(dw)) {
res = cus__merge_and_process_cu(cus, conf, mod, dw, elf, filename,
build_id, build_id_len,
type_cu ? &type_dcu : NULL);
if (res)
return res;
goto out;
}
while (dwarf_nextcu(dw, off, &noff, &cuhl, NULL, &pointer_size,
&offset_size) == 0) {
Dwarf_Die die_mem;
@ -2572,6 +2691,7 @@ static int cus__load_module(struct cus *cus, struct conf_load *conf,
off = noff;
}
out:
if (type_lsk == LSK__DELETE)
cu__delete(type_cu);