tools: bpftool: add simple perf event output reader

Users of BPF sooner or later discover perf_event_output() helpers
and BPF_MAP_TYPE_PERF_EVENT_ARRAY.  Dumping this array type is
not possible; however, we can add simple reading of perf events.
Create a new event_pipe subcommand for maps. This subcommand
will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.

Parts of the code are taken from samples/bpf/trace_output_user.c.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:
Jakub Kicinski 2018-05-03 18:37:16 -07:00 committed by Daniel Borkmann
parent e64d52569f
commit f412eed9df
8 changed files with 444 additions and 19 deletions

View File

@ -22,12 +22,13 @@ MAP COMMANDS
=============
| **bpftool** **map { show | list }** [*MAP*]
| **bpftool** **map dump** *MAP*
| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*]
| **bpftool** **map lookup** *MAP* **key** *DATA*
| **bpftool** **map getnext** *MAP* [**key** *DATA*]
| **bpftool** **map delete** *MAP* **key** *DATA*
| **bpftool** **map pin** *MAP* *FILE*
| **bpftool** **map dump** *MAP*
| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*]
| **bpftool** **map lookup** *MAP* **key** *DATA*
| **bpftool** **map getnext** *MAP* [**key** *DATA*]
| **bpftool** **map delete** *MAP* **key** *DATA*
| **bpftool** **map pin** *MAP* *FILE*
| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
| **bpftool** **map help**
|
| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@ -76,6 +77,22 @@ DESCRIPTION
Note: *FILE* must be located in *bpffs* mount.
**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
Install perf rings into a perf event array map and dump
output of any bpf_perf_event_output() call in the kernel.
By default read the number of CPUs on the system and
install perf ring for each CPU in the corresponding index
in the array.
If **cpu** and **index** are specified, install perf ring
for given **cpu** at **index** in the array (single ring).
Note that installing a perf ring into an array will silently
replace any existing ring. Any other application will stop
receiving events if it installed its rings earlier.
**bpftool map help**
Print short help message.

View File

@ -23,7 +23,7 @@ SYNOPSIS
*MAP-COMMANDS* :=
{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
| **pin** | **help** }
| **pin** | **event_pipe** | **help** }
*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
| **load** | **help** }

View File

@ -39,7 +39,12 @@ CC = gcc
CFLAGS += -O2
CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
-I$(srctree)/kernel/bpf/ \
-I$(srctree)/tools/include \
-I$(srctree)/tools/include/uapi \
-I$(srctree)/tools/lib/bpf \
-I$(srctree)/tools/perf
CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
LIBS = -lelf -lbfd -lopcodes $(LIBBPF)

View File

@ -1,6 +1,6 @@
# bpftool(8) bash completion -*- shell-script -*-
#
# Copyright (C) 2017 Netronome Systems, Inc.
# Copyright (C) 2017-2018 Netronome Systems, Inc.
#
# This software is dual licensed under the GNU General License
# Version 2, June 1991 as shown in the file COPYING in the top-level
@ -79,6 +79,14 @@ _bpftool_get_map_ids()
command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
}
# Append to COMPREPLY the IDs of maps whose type is perf_event_array,
# filtered by the word currently being completed ($cur).
# The IDs are scraped from the pretty-printed JSON of "bpftool -jp map":
# grep keeps two lines of context around every "perf_event_array" match and
# sed extracts the value of each "id" line found there.
# NOTE(review): this relies on "id" being printed within two lines of the
# map type in the pretty JSON output - confirm if the JSON layout changes.
_bpftool_get_perf_map_ids()
{
COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \
command grep -C2 perf_event_array | \
command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
}
_bpftool_get_prog_ids()
{
COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
@ -359,10 +367,34 @@ _bpftool()
fi
return 0
;;
event_pipe)
case $prev in
$command)
COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
return 0
;;
id)
_bpftool_get_perf_map_ids
return 0
;;
cpu)
return 0
;;
index)
return 0
;;
*)
_bpftool_once_attr 'cpu'
_bpftool_once_attr 'index'
return 0
;;
esac
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'delete dump getnext help \
lookup pin show list update' -- "$cur" ) )
lookup pin event_pipe show list update' -- \
"$cur" ) )
;;
esac
;;

View File

@ -331,6 +331,16 @@ char *get_fdinfo(int fd, const char *key)
return NULL;
}
/*
 * Emit @len bytes starting at @data as a JSON array of decimal numbers,
 * one array element per byte, using the global JSON writer.
 */
void print_data_json(uint8_t *data, size_t len)
{
	/* size_t index so the loop cannot wrap before reaching a large @len */
	size_t i;

	jsonw_start_array(json_wtr);
	for (i = 0; i < len; i++)
		jsonw_printf(json_wtr, "%d", data[i]);
	jsonw_end_array(json_wtr);
}
void print_hex_data_json(uint8_t *data, size_t len)
{
unsigned int i;
@ -421,6 +431,15 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
}
}
/*
 * Return the system memory page size in bytes.
 *
 * The value is queried on the first call and cached in a function-local
 * static, so later calls do not repeat the query.
 */
unsigned int get_page_size(void)
{
	static unsigned int result;

	/*
	 * sysconf(_SC_PAGESIZE) is the POSIX interface; getpagesize() is a
	 * legacy call that is not declared under strict feature-test settings.
	 */
	if (!result)
		result = sysconf(_SC_PAGESIZE);
	return result;
}
unsigned int get_possible_cpus(void)
{
static unsigned int result;

View File

@ -117,14 +117,18 @@ int do_pin_fd(int fd, const char *name);
int do_prog(int argc, char **arg);
int do_map(int argc, char **arg);
int do_event_pipe(int argc, char **argv);
int do_cgroup(int argc, char **arg);
int prog_parse_fd(int *argc, char ***argv);
int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
const char *arch);
void print_data_json(uint8_t *data, size_t len);
void print_hex_data_json(uint8_t *data, size_t len);
unsigned int get_page_size(void);
unsigned int get_possible_cpus(void);
const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);

View File

@ -130,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv)
return -1;
}
static int
map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
{
int err;
int fd;
@ -817,12 +816,13 @@ static int do_help(int argc, char **argv)
fprintf(stderr,
"Usage: %s %s { show | list } [MAP]\n"
" %s %s dump MAP\n"
" %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n"
" %s %s lookup MAP key DATA\n"
" %s %s getnext MAP [key DATA]\n"
" %s %s delete MAP key DATA\n"
" %s %s pin MAP FILE\n"
" %s %s dump MAP\n"
" %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n"
" %s %s lookup MAP key DATA\n"
" %s %s getnext MAP [key DATA]\n"
" %s %s delete MAP key DATA\n"
" %s %s pin MAP FILE\n"
" %s %s event_pipe MAP [cpu N index M]\n"
" %s %s help\n"
"\n"
" MAP := { id MAP_ID | pinned FILE }\n"
@ -834,7 +834,7 @@ static int do_help(int argc, char **argv)
"",
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2]);
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
return 0;
}
@ -849,6 +849,7 @@ static const struct cmd cmds[] = {
{ "getnext", do_getnext },
{ "delete", do_delete },
{ "pin", do_pin },
{ "event_pipe", do_event_pipe },
{ 0 }
};

View File

@ -0,0 +1,347 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2018 Netronome Systems, Inc. */
/* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <errno.h>
#include <fcntl.h>
#include <libbpf.h>
#include <poll.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <bpf.h>
#include <perf-sys.h>
#include "main.h"
/* Number of data pages mapped per ring; one extra page precedes them. */
#define MMAP_PAGE_CNT	16

/*
 * Set to non-zero to make the event loop terminate.  It is written from a
 * signal handler, hence volatile sig_atomic_t rather than a plain bool.
 */
static volatile sig_atomic_t stop;
/* Bookkeeping for one perf ring installed into the perf event array map. */
struct event_ring_info {
/* perf event file descriptor, as returned by bpf_perf_event_open() */
int fd;
/* index (key) in the perf event array map where the event is installed */
int key;
/* CPU the perf event was opened on */
unsigned int cpu;
/* start of the mmap()ed ring memory, as returned by perf_event_mmap() */
void *mem;
};
/*
 * View of one record read from the ring: generic perf header followed by a
 * length-prefixed payload.
 * NOTE(review): presumably this mirrors the kernel's PERF_RECORD_SAMPLE layout
 * for sample_type == PERF_SAMPLE_RAW - confirm against perf_event_open(2).
 */
struct perf_event_sample {
/* common record header; header.type and header.size drive the parsing */
struct perf_event_header header;
/* number of bytes in data[], as used when printing the sample */
__u32 size;
/* payload bytes */
unsigned char data[];
};
/*
 * Signal handler: announce the shutdown on stderr and ask the event loop to
 * stop.  Only async-signal-safe operations are used here, so write() replaces
 * stdio, and errno is preserved for the interrupted code.
 */
static void int_exit(int signo)
{
	static const char msg[] = "Stopping...\n";
	int saved_errno = errno;
	ssize_t rc;

	rc = write(STDERR_FILENO, msg, sizeof(msg) - 1);
	(void)rc;	/* nothing useful can be done if the write fails */

	stop = true;
	errno = saved_errno;
}
/*
 * Print one record read from a perf ring, either as a JSON object or as
 * plain text depending on json_output.
 *
 * @ring: ring the record came from; supplies the CPU and map index to report
 * @e:    the record; only PERF_RECORD_SAMPLE and PERF_RECORD_LOST are decoded
 *
 * Each record is stamped with the CLOCK_MONOTONIC time at which it is
 * printed.  If the clock cannot be read the record is dropped.
 */
static void
print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
{
	/* Alternative view of the same record, used for PERF_RECORD_LOST */
	struct {
		struct perf_event_header header;
		__u64 id;
		__u64 lost;
	} *lost = (void *)e;
	struct timespec ts;

	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
		perror("Can't read clock for timestamp");
		return;
	}

	if (json_output) {
		jsonw_start_object(json_wtr);
		jsonw_name(json_wtr, "timestamp");
		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
		jsonw_name(json_wtr, "type");
		jsonw_uint(json_wtr, e->header.type);
		jsonw_name(json_wtr, "cpu");
		jsonw_uint(json_wtr, ring->cpu);
		jsonw_name(json_wtr, "index");
		jsonw_uint(json_wtr, ring->key);
		if (e->header.type == PERF_RECORD_SAMPLE) {
			jsonw_name(json_wtr, "data");
			print_data_json(e->data, e->size);
		} else if (e->header.type == PERF_RECORD_LOST) {
			jsonw_name(json_wtr, "lost");
			jsonw_start_object(json_wtr);
			jsonw_name(json_wtr, "id");
			jsonw_uint(json_wtr, lost->id);
			jsonw_name(json_wtr, "count");
			jsonw_uint(json_wtr, lost->lost);
			jsonw_end_object(json_wtr);
		}
		jsonw_end_object(json_wtr);
	} else {
		if (e->header.type == PERF_RECORD_SAMPLE) {
			/*
			 * Zero-pad the nanoseconds to nine digits, otherwise
			 * 1s + 5ns would be shown as "1.5".
			 */
			printf("== @%ld.%09ld CPU: %u index: %d =====\n",
			       (long)ts.tv_sec, (long)ts.tv_nsec,
			       ring->cpu, ring->key);
			fprint_hex(stdout, e->data, e->size, " ");
			printf("\n");
		} else if (e->header.type == PERF_RECORD_LOST) {
			/* the counter is unsigned; cast as __u64 width varies */
			printf("lost %llu events\n",
			       (unsigned long long)lost->lost);
		} else {
			printf("unknown event type=%u size=%u\n",
			       (unsigned int)e->header.type,
			       (unsigned int)e->header.size);
		}
	}
}
/*
 * Drain every record currently available in one perf ring and hand each of
 * them to print_bpf_output().
 *
 * @ring:    ring to read; ring->mem points at the control page, which is
 *           followed by MMAP_PAGE_CNT data pages
 * @buf:     scratch buffer shared between calls, used to linearize a record
 *           that wraps around the end of the data area; (re)allocated here
 * @buf_len: current size of *@buf in bytes
 *
 * On allocation failure the global stop flag is set and the function
 * returns without advancing data_tail.
 */
static void
perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
{
	volatile struct perf_event_mmap_page *header = ring->mem;
	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
	__u64 data_tail = header->data_tail;
	__u64 data_head = header->data_head;
	void *base, *begin, *end;

	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
	if (data_head == data_tail)
		return;

	/* The data area starts one page after the control page */
	base = ((char *)header) + get_page_size();

	begin = base + data_tail % buffer_size;
	end = base + data_head % buffer_size;

	while (begin != end) {
		struct perf_event_sample *e;

		e = begin;
		if (begin + e->header.size > base + buffer_size) {
			/* Record wraps: copy both pieces into the scratch buffer */
			long len = base + buffer_size - begin;

			if (*buf_len < e->header.size) {
				free(*buf);
				*buf = malloc(e->header.size);
				if (!*buf) {
					fprintf(stderr,
						"can't allocate memory\n");
					/*
					 * Keep size and pointer consistent: the
					 * caller may still read other rings with
					 * this buffer before noticing the stop.
					 */
					*buf_len = 0;
					stop = true;
					return;
				}
				*buf_len = e->header.size;
			}

			memcpy(*buf, begin, len);
			memcpy(*buf + len, base, e->header.size - len);
			e = (void *)*buf;
			begin = base + e->header.size - len;
		} else if (begin + e->header.size == base + buffer_size) {
			/* Record ends exactly at the end of the data area */
			begin = base;
		} else {
			begin += e->header.size;
		}

		print_bpf_output(ring, e);
	}

	__sync_synchronize(); /* smp_mb() */
	/* Everything up to data_head has been consumed */
	header->data_tail = data_head;
}
/*
 * Size in bytes of the mapping used for one ring: MMAP_PAGE_CNT pages plus
 * one additional page.
 */
static int perf_mmap_size(void)
{
	unsigned int page_sz = get_page_size();

	return page_sz * (MMAP_PAGE_CNT + 1);
}
/*
 * Map the ring memory of perf event @fd for shared read/write access.
 *
 * Returns the start of the mapping, or NULL after reporting the error.
 */
static void *perf_event_mmap(int fd)
{
	int mmap_size = perf_mmap_size();
	void *base;

	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED) {
		/*
		 * No trailing newline, matching every other p_err() call in
		 * this file.
		 * NOTE(review): presumably p_err() terminates the message
		 * itself - confirm against its definition.
		 */
		p_err("event mmap failed: %s", strerror(errno));
		return NULL;
	}

	return base;
}
/*
 * Release a ring mapping created by perf_event_mmap().  A failure is only
 * reported on stderr; there is nothing to hand back to the caller.
 */
static void perf_event_unmap(void *mem)
{
	int rc = munmap(mem, perf_mmap_size());

	if (rc == 0)
		return;

	fprintf(stderr, "Can't unmap ring memory!\n");
}
/*
 * Open a BPF-output software perf event on @cpu, store its descriptor in
 * the map referred to by @map_fd under @key, and enable the event.
 *
 * Returns the perf event descriptor on success.  On any failure an error is
 * reported, the descriptor (if one was opened) is closed, and -1 is returned.
 */
static int bpf_perf_event_open(int map_fd, int key, int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_BPF_OUTPUT,
		.sample_type = PERF_SAMPLE_RAW,
	};
	int event_fd;

	event_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
	if (event_fd < 0) {
		p_err("failed to open perf event %d for CPU %d", key, cpu);
		return -1;
	}

	if (bpf_map_update_elem(map_fd, &key, &event_fd, BPF_ANY) != 0) {
		p_err("failed to update map for event %d for CPU %d", key, cpu);
	} else if (ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
		p_err("failed to enable event %d for CPU %d", key, cpu);
	} else {
		/* Event is installed in the map and running */
		return event_fd;
	}

	close(event_fd);
	return -1;
}
int do_event_pipe(int argc, char **argv)
{
int i, nfds, map_fd, index = -1, cpu = -1;
struct bpf_map_info map_info = {};
struct event_ring_info *rings;
size_t tmp_buf_sz = 0;
void *tmp_buf = NULL;
struct pollfd *pfds;
__u32 map_info_len;
bool do_all = true;
map_info_len = sizeof(map_info);
map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
if (map_fd < 0)
return -1;
if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
p_err("map is not a perf event array");
goto err_close_map;
}
while (argc) {
if (argc < 2)
BAD_ARG();
if (is_prefix(*argv, "cpu")) {
char *endptr;
NEXT_ARG();
cpu = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as CPU ID", **argv);
goto err_close_map;
}
NEXT_ARG();
} else if (is_prefix(*argv, "index")) {
char *endptr;
NEXT_ARG();
index = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as index", **argv);
goto err_close_map;
}
NEXT_ARG();
} else {
BAD_ARG();
}
do_all = false;
}
if (!do_all) {
if (index == -1 || cpu == -1) {
p_err("cpu and index must be specified together");
goto err_close_map;
}
nfds = 1;
} else {
nfds = min(get_possible_cpus(), map_info.max_entries);
cpu = 0;
index = 0;
}
rings = calloc(nfds, sizeof(rings[0]));
if (!rings)
goto err_close_map;
pfds = calloc(nfds, sizeof(pfds[0]));
if (!pfds)
goto err_free_rings;
for (i = 0; i < nfds; i++) {
rings[i].cpu = cpu + i;
rings[i].key = index + i;
rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
rings[i].cpu);
if (rings[i].fd < 0)
goto err_close_fds_prev;
rings[i].mem = perf_event_mmap(rings[i].fd);
if (!rings[i].mem)
goto err_close_fds_current;
pfds[i].fd = rings[i].fd;
pfds[i].events = POLLIN;
}
signal(SIGINT, int_exit);
signal(SIGHUP, int_exit);
signal(SIGTERM, int_exit);
if (json_output)
jsonw_start_array(json_wtr);
while (!stop) {
poll(pfds, nfds, 200);
for (i = 0; i < nfds; i++)
perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
}
free(tmp_buf);
if (json_output)
jsonw_end_array(json_wtr);
for (i = 0; i < nfds; i++) {
perf_event_unmap(rings[i].mem);
close(rings[i].fd);
}
free(pfds);
free(rings);
close(map_fd);
return 0;
err_close_fds_prev:
while (i--) {
perf_event_unmap(rings[i].mem);
err_close_fds_current:
close(rings[i].fd);
}
free(pfds);
err_free_rings:
free(rings);
err_close_map:
close(map_fd);
return -1;
}