btf_encoder: revamp how per-CPU variables are encoded

Right now, to encode per-CPU variables in BTF, pahole iterates over the complete
vmlinux symbol table for each CU. There are 2500 CUs for a typical kernel image.
Overall, to encode 287 per-CPU variables, pahole spends more than 10% of its CPU
budget, which is incredibly wasteful.

This patch revamps how this is done. Now it pre-processes the symbol table once,
before any per-CU processing starts. It remembers each per-CPU variable
symbol, including its address, size, and name. Then, while processing each CU,
binary search is used to correlate each DWARF variable with the per-CPU symbols
and figure out whether the variable belongs to the per-CPU data section. If a
match is found, BTF_KIND_VAR is emitted and var_secinfo is recorded, just like
before. At the very end, after all CUs are processed, BTF_KIND_DATASEC is
emitted with sorted variables.

This change makes the overhead of generating per-CPU variables pretty negligible
and wins back about 10% of CPU usage.

Performance counter stats for './pahole -J /home/andriin/linux-build/default/vmlinux':

BEFORE:
      19.160149000 seconds user
       1.304873000 seconds sys

         24,114.05 msec task-clock                #    0.999 CPUs utilized
                83      context-switches          #    0.003 K/sec
                 0      cpu-migrations            #    0.000 K/sec
           622,417      page-faults               #    0.026 M/sec
    72,897,315,125      cycles                    #    3.023 GHz                      (25.02%)
   127,807,316,959      instructions              #    1.75  insn per cycle           (25.01%)
    29,087,179,117      branches                  # 1206.234 M/sec                    (25.01%)
       464,105,921      branch-misses             #    1.60% of all branches          (25.01%)
    30,252,119,368      L1-dcache-loads           # 1254.543 M/sec                    (25.01%)
     1,156,336,207      L1-dcache-load-misses     #    3.82% of all L1-dcache hits    (25.05%)
       343,373,503      LLC-loads                 #   14.240 M/sec                    (25.02%)
        12,044,977      LLC-load-misses           #    3.51% of all LL-cache hits     (25.01%)

      24.136198321 seconds time elapsed

      22.729693000 seconds user
       1.384859000 seconds sys

AFTER:
      16.781455000 seconds user
       1.343956000 seconds sys

         23,398.77 msec task-clock                #    1.000 CPUs utilized
                86      context-switches          #    0.004 K/sec
                 0      cpu-migrations            #    0.000 K/sec
           622,420      page-faults               #    0.027 M/sec
    68,395,641,468      cycles                    #    2.923 GHz                      (25.05%)
   114,241,327,034      instructions              #    1.67  insn per cycle           (25.01%)
    26,330,711,718      branches                  # 1125.303 M/sec                    (25.01%)
       465,926,869      branch-misses             #    1.77% of all branches          (25.00%)
    24,662,984,772      L1-dcache-loads           # 1054.029 M/sec                    (25.00%)
     1,054,052,064      L1-dcache-load-misses     #    4.27% of all L1-dcache hits    (25.00%)
       340,970,622      LLC-loads                 #   14.572 M/sec                    (25.00%)
        16,032,297      LLC-load-misses           #    4.70% of all LL-cache hits     (25.03%)

      23.402259654 seconds time elapsed

      21.916437000 seconds user
       1.482826000 seconds sys

Committer testing:

  $ grep 'model name' -m1 /proc/cpuinfo
  model name	: AMD Ryzen 9 3900X 12-Core Processor
  $

Before:

  $ perf stat -r5 pahole -J vmlinux

   Performance counter stats for 'pahole -J vmlinux' (5 runs):

            9,730.28 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.54% )
                   0      context-switches:u        #    0.000 K/sec
                   0      cpu-migrations:u          #    0.000 K/sec
             353,854      page-faults:u             #    0.036 M/sec                    ( +-  0.00% )
      39,721,726,459      cycles:u                  #    4.082 GHz                      ( +-  0.07% )  (83.33%)
         626,010,654      stalled-cycles-frontend:u #    1.58% frontend cycles idle     ( +-  0.91% )  (83.33%)
       7,518,333,691      stalled-cycles-backend:u  #   18.93% backend cycles idle      ( +-  0.56% )  (83.33%)
      85,477,123,093      instructions:u            #    2.15  insn per cycle
                                                    #    0.09  stalled cycles per insn  ( +-  0.02% )  (83.34%)
      19,346,085,683      branches:u                # 1988.235 M/sec                    ( +-  0.03% )  (83.34%)
         237,291,787      branch-misses:u           #    1.23% of all branches          ( +-  0.15% )  (83.33%)

              9.7465 +- 0.0524 seconds time elapsed  ( +-  0.54% )

  $

After:

  $ perf stat -r5 pahole -J vmlinux

   Performance counter stats for 'pahole -J vmlinux' (5 runs):

            8,953.80 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.09% )
                   0      context-switches:u        #    0.000 K/sec
                   0      cpu-migrations:u          #    0.000 K/sec
             353,855      page-faults:u             #    0.040 M/sec                    ( +-  0.00% )
      35,775,730,539      cycles:u                  #    3.996 GHz                      ( +-  0.07% )  (83.33%)
         579,534,836      stalled-cycles-frontend:u #    1.62% frontend cycles idle     ( +-  2.21% )  (83.33%)
       5,719,840,144      stalled-cycles-backend:u  #   15.99% backend cycles idle      ( +-  0.93% )  (83.33%)
      73,035,744,786      instructions:u            #    2.04  insn per cycle
                                                    #    0.08  stalled cycles per insn  ( +-  0.02% )  (83.34%)
      16,798,017,844      branches:u                # 1876.077 M/sec                    ( +-  0.05% )  (83.33%)
         237,777,143      branch-misses:u           #    1.42% of all branches          ( +-  0.15% )  (83.34%)

             8.97077 +- 0.00803 seconds time elapsed  ( +-  0.09% )

  $

Indeed, about 10% shaved, not bad.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Hao Luo <haoluo@google.com>
Cc: Oleg Rombakh <olegrom@google.com>
Cc: bpf@vger.kernel.org
Cc: dwarves@vger.kernel.org
Cc: kernel-team@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Andrii Nakryiko 2020-10-08 16:39:57 -07:00 committed by Arnaldo Carvalho de Melo
parent 0258a47ef9
commit 2e719cca66
3 changed files with 147 additions and 106 deletions

View File

@ -17,6 +17,7 @@
#include "btf_encoder.h"
#include <ctype.h> /* for isalpha() and isalnum() */
#include <stdlib.h> /* for qsort() and bsearch() */
#include <inttypes.h>
/*
@ -53,18 +54,18 @@ static bool btf_name_valid(const char *p)
return !*p;
}
static void dump_invalid_symbol(const char *msg, const char *sym, const char *cu,
static void dump_invalid_symbol(const char *msg, const char *sym,
int verbose, bool force)
{
if (force) {
if (verbose)
fprintf(stderr, "PAHOLE: Warning: %s, ignored (sym: '%s', cu: '%s').\n",
msg, sym, cu);
fprintf(stderr, "PAHOLE: Warning: %s, ignored (sym: '%s').\n",
msg, sym);
return;
}
fprintf(stderr, "PAHOLE: Error: %s (sym: '%s', cu: '%s').\n", msg, sym, cu);
fprintf(stderr, "PAHOLE: Error: Use '-j' or '--force' to ignore such symbols and force emit the btf.\n");
fprintf(stderr, "PAHOLE: Error: %s (sym: '%s').\n", msg, sym);
fprintf(stderr, "PAHOLE: Error: Use '--btf_encode_force' to ignore such symbols and force emit the btf.\n");
}
extern struct debug_fmt_ops *dwarves__active_loader;
@ -202,6 +203,9 @@ int btf_encoder__encode()
{
int err;
if (gobuffer__size(&btfe->percpu_secinfo) != 0)
btf_elf__add_datasec_type(btfe, PERCPU_SECTION, &btfe->percpu_secinfo);
err = btf_elf__encode(btfe, 0);
btf_elf__delete(btfe);
btfe = NULL;
@ -209,24 +213,117 @@ int btf_encoder__encode()
return err;
}
#define HASHADDR__BITS 8
#define HASHADDR__SIZE (1UL << HASHADDR__BITS)
#define hashaddr__fn(key) hash_64(key, HASHADDR__BITS)
#define MAX_PERCPU_VAR_CNT 4096
static struct variable *hashaddr__find_variable(const struct hlist_head hashtable[],
const uint64_t addr)
/* One per-CPU ELF symbol, as recorded by find_all_percpu_vars(). */
struct var_info {
uint64_t addr; /* symbol value; the key percpu_var_cmp() orders and matches on */
uint32_t sz; /* symbol size as reported by elf_sym__size() */
const char *name; /* symbol name as returned by elf_sym__name() */
};
static struct var_info percpu_vars[MAX_PERCPU_VAR_CNT];
static int percpu_var_cnt;
static int percpu_var_cmp(const void *_a, const void *_b)
{
struct variable *variable;
struct hlist_node *pos;
uint16_t bucket = hashaddr__fn(addr);
const struct hlist_head *head = &hashtable[bucket];
const struct var_info *a = _a;
const struct var_info *b = _b;
hlist_for_each_entry(variable, pos, head, tool_hnode) {
if (variable->ip.addr == addr)
return variable;
if (a->addr == b->addr)
return 0;
return a->addr < b->addr ? -1 : 1;
}
/*
 * Look up a per-CPU variable by exact address.
 *
 * Binary-searches the first percpu_var_cnt entries of percpu_vars[] using
 * percpu_var_cmp(), which compares entries by their addr field only.
 *
 * @addr: address to look for; only an entry whose addr equals it matches.
 * @sz:   on a match, receives the matching entry's size.
 * @name: on a match, receives the matching entry's name pointer.
 *
 * Returns true when an entry was found. On a miss returns false and leaves
 * *sz and *name untouched.
 */
static bool percpu_var_exists(uint64_t addr, uint32_t *sz, const char **name)
{
const struct var_info *p;
/* Only .addr matters for the lookup; the other key fields stay zeroed. */
struct var_info key = { .addr = addr };
/* bsearch() needs sorted input: find_all_percpu_vars() qsort()s the array with this same comparator. */
p = bsearch(&key, percpu_vars, percpu_var_cnt,
sizeof(percpu_vars[0]), percpu_var_cmp);
if (!p)
return false;
*sz = p->sz;
*name = p->name;
return true;
}
static int find_all_percpu_vars(struct btf_elf *btfe)
{
uint32_t core_id;
GElf_Sym sym;
/* cache variables' addresses, preparing for searching in symtab. */
percpu_var_cnt = 0;
/* search within symtab for percpu variables */
elf_symtab__for_each_symbol(btfe->symtab, core_id, sym) {
const char *sym_name;
uint64_t addr;
uint32_t size;
/* compare a symbol's shndx to determine if it's a percpu variable */
if (elf_sym__section(&sym) != btfe->percpu_shndx)
continue;
if (elf_sym__type(&sym) != STT_OBJECT)
continue;
addr = elf_sym__value(&sym);
/*
* Store only those symbols that have allocated space in the percpu section.
* This excludes the following three types of symbols:
*
* 1. __ADDRESSABLE(sym), which are forcely emitted as symbols.
* 2. __UNIQUE_ID(prefix), which are introduced to generate unique ids.
* 3. __exitcall(fn), functions which are labeled as exit calls.
*
* In addition, the variables defined using DEFINE_PERCPU_FIRST are
* also not included, which currently includes:
*
* 1. fixed_percpu_data
*/
if (!addr)
continue;
sym_name = elf_sym__name(&sym, btfe->symtab);
if (!btf_name_valid(sym_name)) {
dump_invalid_symbol("Found symbol of invalid name when encoding btf",
sym_name, btf_elf__verbose, btf_elf__force);
if (btf_elf__force)
continue;
return -1;
}
size = elf_sym__size(&sym);
if (!size) {
dump_invalid_symbol("Found symbol of zero size when encoding btf",
sym_name, btf_elf__verbose, btf_elf__force);
if (btf_elf__force)
continue;
return -1;
}
if (btf_elf__verbose)
printf("Found per-CPU symbol '%s' at address 0x%lx\n", sym_name, addr);
if (percpu_var_cnt == MAX_PERCPU_VAR_CNT) {
fprintf(stderr, "Reached the limit of per-CPU variables: %d\n",
MAX_PERCPU_VAR_CNT);
return -1;
}
percpu_vars[percpu_var_cnt].addr = addr;
percpu_vars[percpu_var_cnt].sz = size;
percpu_vars[percpu_var_cnt].name = sym_name;
percpu_var_cnt++;
}
return NULL;
if (percpu_var_cnt)
qsort(percpu_vars, percpu_var_cnt, sizeof(percpu_vars[0]), percpu_var_cmp);
if (btf_elf__verbose)
printf("Found %d per-CPU variables!\n", percpu_var_cnt);
return 0;
}
int cu__encode_btf(struct cu *cu, int verbose, bool force,
@ -234,13 +331,10 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
{
uint32_t type_id_off;
uint32_t core_id;
struct variable *var;
struct function *fn;
struct tag *pos;
int err = 0;
struct hlist_head hash_addr[HASHADDR__SIZE];
struct variable *var;
bool has_global_var = false;
GElf_Sym sym;
if (btfe && strcmp(btfe->filename, cu->filename)) {
err = btf_encoder__encode();
@ -257,6 +351,9 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
if (!btfe)
return -1;
if (!skip_encoding_vars && find_all_percpu_vars(btfe))
goto out;
has_index_type = false;
need_index_type = false;
array_index_id = 0;
@ -278,6 +375,7 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
}
btf_elf__verbose = verbose;
btf_elf__force = force;
type_id_off = btf__get_nr_types(btfe->btf);
cu__for_each_type(cu, core_id, pos) {
@ -325,12 +423,11 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
if (verbose)
printf("search cu '%s' for percpu global variables.\n", cu->name);
/* cache variables' addresses, preparing for searching in symtab. */
for (core_id = 0; core_id < HASHADDR__SIZE; ++core_id)
INIT_HLIST_HEAD(&hash_addr[core_id]);
cu__for_each_variable(cu, core_id, pos) {
struct hlist_head *head;
uint32_t size, type, linkage, offset;
const char *name;
uint64_t addr;
int id;
var = tag__variable(pos);
if (var->declaration && !var->spec)
@ -338,89 +435,37 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
/* percpu variables are allocated in global space */
if (variable__scope(var) != VSCOPE_GLOBAL && !var->spec)
continue;
has_global_var = true;
head = &hash_addr[hashaddr__fn(var->ip.addr)];
hlist_add_head(&var->tool_hnode, head);
}
if (!has_global_var) {
if (verbose)
printf("cu has no global variable defined, skip.\n");
goto out;
}
/* search within symtab for percpu variables */
elf_symtab__for_each_symbol(btfe->symtab, core_id, sym) {
uint32_t linkage, type, size, offset;
int32_t btf_var_id, btf_var_secinfo_id;
uint64_t addr;
const char *sym_name;
/* compare a symbol's shndx to determine if it's a percpu variable */
if (elf_sym__section(&sym) != btfe->percpu_shndx)
continue;
if (elf_sym__type(&sym) != STT_OBJECT)
continue;
addr = elf_sym__value(&sym);
/*
* Store only those symbols that have allocated space in the percpu section.
* This excludes the following three types of symbols:
*
* 1. __ADDRESSABLE(sym), which are forcely emitted as symbols.
* 2. __UNIQUE_ID(prefix), which are introduced to generate unique ids.
* 3. __exitcall(fn), functions which are labeled as exit calls.
*
* In addition, the variables defined using DEFINE_PERCPU_FIRST are
* also not included, which currently includes:
*
* 1. fixed_percpu_data
*/
if (!addr)
continue;
var = hashaddr__find_variable(hash_addr, addr);
if (var == NULL)
continue;
/* addr has to be recorded before we follow spec */
addr = var->ip.addr;
if (var->spec)
var = var->spec;
sym_name = elf_sym__name(&sym, btfe->symtab);
if (!btf_name_valid(sym_name)) {
dump_invalid_symbol("Found symbol of invalid name when encoding btf",
sym_name, cu->name, verbose, force);
if (force)
continue;
err = -1;
break;
}
if (var->ip.tag.type == 0) {
dump_invalid_symbol("Found symbol of void type when encoding btf",
sym_name, cu->name, verbose, force);
if (force)
continue;
err = -1;
break;
}
type = type_id_off + var->ip.tag.type;
size = elf_sym__size(&sym);
if (!size) {
dump_invalid_symbol("Found symbol of zero size when encoding btf",
sym_name, cu->name, verbose, force);
fprintf(stderr, "error: found variable in CU '%s' that has void type\n",
cu->name);
if (force)
continue;
err = -1;
break;
}
if (verbose)
printf("symbol '%s' of address 0x%lx encoded\n",
sym_name, addr);
type = var->ip.tag.type + type_id_off;
linkage = var->external ? BTF_VAR_GLOBAL_ALLOCATED : BTF_VAR_STATIC;
if (!percpu_var_exists(addr, &size, &name))
continue; /* not a per-CPU variable */
if (btf_elf__verbose) {
printf("Variable '%s' from CU '%s' at address 0x%lx encoded\n",
name, cu->name, addr);
}
/* add a BTF_KIND_VAR in btfe->types */
linkage = var->external ? BTF_VAR_GLOBAL_ALLOCATED : BTF_VAR_STATIC;
btf_var_id = btf_elf__add_var_type(btfe, type, sym_name, linkage);
if (btf_var_id < 0) {
id = btf_elf__add_var_type(btfe, type, name, linkage);
if (id < 0) {
err = -1;
printf("error: failed to encode variable '%s'\n", sym_name);
fprintf(stderr, "error: failed to encode variable '%s' at addr 0x%lx\n",
name, addr);
break;
}
@ -428,13 +473,12 @@ int cu__encode_btf(struct cu *cu, int verbose, bool force,
* add a BTF_VAR_SECINFO in btfe->percpu_secinfo, which will be added into
* btfe->types later when we add BTF_VAR_DATASEC.
*/
type = btf_var_id;
offset = addr - btfe->percpu_base_addr;
btf_var_secinfo_id = btf_elf__add_var_secinfo(&btfe->percpu_secinfo,
type, offset, size);
if (btf_var_secinfo_id < 0) {
id = btf_elf__add_var_secinfo(&btfe->percpu_secinfo, id, offset, size);
if (id < 0) {
err = -1;
printf("error: failed to encode var secinfo '%s'\n", sym_name);
fprintf(stderr, "error: failed to encode section info for variable '%s' at addr 0x%lx\n",
name, addr);
break;
}
}

View File

@ -28,6 +28,7 @@
#include "elf_symtab.h"
uint8_t btf_elf__verbose;
uint8_t btf_elf__force;
static int btf_var_secinfo_cmp(const void *a, const void *b)
{
@ -62,7 +63,6 @@ int btf_elf__load(struct btf_elf *btfe)
return 0;
}
struct btf_elf *btf_elf__new(const char *filename, Elf *elf)
{
struct btf_elf *btfe = zalloc(sizeof(*btfe));
@ -771,10 +771,6 @@ int btf_elf__encode(struct btf_elf *btfe, uint8_t flags)
{
struct btf *btf = btfe->btf;
if (gobuffer__size(&btfe->percpu_secinfo) != 0)
btf_elf__add_datasec_type(btfe, PERCPU_SECTION,
&btfe->percpu_secinfo);
/* Empty file, nothing to do, so... done! */
if (btf__get_nr_types(btf) == 0)
return 0;

View File

@ -30,6 +30,7 @@ struct btf_elf {
};
extern uint8_t btf_elf__verbose;
extern uint8_t btf_elf__force;
#define btf_elf__verbose_log(fmt, ...) { if (btf_elf__verbose) printf(fmt, __VA_ARGS__); }
#define PERCPU_SECTION ".data..percpu"