From f3fa412de28ae3cb31d38811d30a77e4e20456cc Mon Sep 17 00:00:00 2001
From: Andrew Melnychenko
Date: Fri, 14 May 2021 14:48:31 +0300
Subject: [PATCH] ebpf: Added eBPF RSS program.

RSS program and Makefile to build it.
The bpftool used to generate '.h' file.
The data in that file may be loaded by libbpf.
EBPF compilation is not required for building qemu.
You can use Makefile if you need to regenerate rss.bpf.skeleton.h.

Signed-off-by: Yuri Benditovich
Signed-off-by: Andrew Melnychenko
Signed-off-by: Jason Wang
---
 tools/ebpf/Makefile.ebpf |  21 ++
 tools/ebpf/rss.bpf.c     | 571 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 592 insertions(+)
 create mode 100755 tools/ebpf/Makefile.ebpf
 create mode 100644 tools/ebpf/rss.bpf.c

diff --git a/tools/ebpf/Makefile.ebpf b/tools/ebpf/Makefile.ebpf
new file mode 100755
index 0000000000..8f327ae3b8
--- /dev/null
+++ b/tools/ebpf/Makefile.ebpf
@@ -0,0 +1,21 @@
+# Builds the eBPF RSS object and generates rss.bpf.skeleton.h from it.
+OBJS = rss.bpf.o
+
+LLC ?= llc
+CLANG ?= clang
+INC_FLAGS = `$(CLANG) -print-file-name=include`
+EXTRA_CFLAGS ?= -O2 -emit-llvm -fno-stack-protector
+
+all: $(OBJS)
+
+# Neither target names a real file; keep stray files called "all" or "clean"
+# from short-circuiting them.
+.PHONY: all clean
+
+# The skeleton header is a generated artifact too, so remove it with the object.
+clean:
+	rm -f $(OBJS) rss.bpf.skeleton.h
+
+# clang writes LLVM bitcode to stdout, llc lowers it to a BPF object, then
+# bpftool turns the object into the skeleton header copied to ../../ebpf/.
+$(OBJS): %.o:%.c
+	$(CLANG) $(INC_FLAGS) \
+		-D__KERNEL__ -D__ASM_SYSREG_H \
+		-I../include $(LINUXINCLUDE) \
+		$(EXTRA_CFLAGS) -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
+	bpftool gen skeleton rss.bpf.o > rss.bpf.skeleton.h
+	cp rss.bpf.skeleton.h ../../ebpf/
diff --git a/tools/ebpf/rss.bpf.c b/tools/ebpf/rss.bpf.c
new file mode 100644
index 0000000000..e85ec55f9b
--- /dev/null
+++ b/tools/ebpf/rss.bpf.c
@@ -0,0 +1,571 @@
+/*
+ * eBPF RSS program
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ *  Andrew Melnychenko
+ *  Yuri Benditovich
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Prepare:
+ *        Requires llvm, clang, bpftool, linux kernel tree
+ *
+ * Build rss.bpf.skeleton.h:
+ *        make -f Makefile.ebpf clean all
+ */
+
+/*
+ * NOTE(review): the header names were missing from the #include lines in
+ * this copy of the patch. The names below are reconstructed from the
+ * identifiers the program uses - confirm against the upstream file.
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <linux/virtio_net.h>
+
+/* Number of entries in the indirection table map. */
+#define INDIRECTION_TABLE_SIZE 128
+/*
+ * Size of the Toeplitz input buffer. The largest input assembled by
+ * calculate_rss_hash() is two IPv6 addresses plus two 16-bit ports.
+ */
+#define HASH_CALCULATION_BUFFER_SIZE 36
+
+/*
+ * Steering configuration, stored as the single entry of
+ * tap_rss_map_configurations.
+ *
+ * redirect         - when zero, every packet goes to default_queue
+ * populate_hash    - not read anywhere in this program
+ * hash_types       - bitmask of VIRTIO_NET_RSS_HASH_TYPE_* values
+ * indirections_len - modulus applied to the hash to pick a table index
+ * default_queue    - queue returned when no hash could be calculated
+ */
+struct rss_config_t {
+    __u8 redirect;
+    __u8 populate_hash;
+    __u32 hash_types;
+    __u16 indirections_len;
+    __u16 default_queue;
+} __attribute__((packed));
+
+/*
+ * Toeplitz key, stored as the single entry of tap_rss_map_toeplitz_key.
+ * leftmost_32_bits is the initial 32-bit key window; next_byte[] supplies
+ * the key bits shifted into that window while the input is consumed.
+ */
+struct toeplitz_key_data_t {
+    __u32 leftmost_32_bits;
+    __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
+};
+
+/*
+ * Fields extracted from one packet by parse_packet(). Addresses and ports
+ * are stored exactly as they were read from the packet headers.
+ */
+struct packet_hash_info_t {
+    __u8 is_ipv4;
+    __u8 is_ipv6;
+    __u8 is_udp;
+    __u8 is_tcp;
+    __u8 is_ipv6_ext_src;
+    __u8 is_ipv6_ext_dst;
+    __u8 is_fragmented;
+
+    __u16 src_port;
+    __u16 dst_port;
+
+    union {
+        struct {
+            __be32 in_src;
+            __be32 in_dst;
+        };
+
+        struct {
+            struct in6_addr in6_src;
+            struct in6_addr in6_dst;
+            struct in6_addr in6_ext_src;
+            struct in6_addr in6_ext_dst;
+        };
+    };
+};
+
+/* One-entry array holding the struct rss_config_t. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_configurations = {
+        .type        = BPF_MAP_TYPE_ARRAY,
+        .key_size    = sizeof(__u32),
+        .value_size  = sizeof(struct rss_config_t),
+        .max_entries = 1,
+};
+
+/* One-entry array holding the struct toeplitz_key_data_t. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_toeplitz_key = {
+        .type        = BPF_MAP_TYPE_ARRAY,
+        .key_size    = sizeof(__u32),
+        .value_size  = sizeof(struct toeplitz_key_data_t),
+        .max_entries = 1,
+};
+
+/* Indirection table: index -> queue number. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_indirection_table = {
+        .type        = BPF_MAP_TYPE_ARRAY,
+        .key_size    = sizeof(__u32),
+        .value_size  = sizeof(__u16),
+        .max_entries = INDIRECTION_TABLE_SIZE,
+};
+
+/*
+ * Append size bytes from ptr to rss_input at offset *bytes_written and
+ * advance *bytes_written. No bounds check is done here; the caller must
+ * keep the total within the buffer it passed in.
+ */
+static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
+                                        const void *ptr, size_t size) {
+    __builtin_memcpy(&rss_input[*bytes_written], ptr, size);
+    *bytes_written += size;
+}
+
+/*
+ * Fold input into the Toeplitz hash accumulated in *result using key.
+ *
+ * len is not used: the loop always walks HASH_CALCULATION_BUFFER_SIZE bytes,
+ * so input must be at least that large. Input bits that are zero never touch
+ * the accumulator, so a zero-padded buffer yields the same value as stopping
+ * after the meaningful bytes.
+ * NOTE(review): presumably the constant bound is there to keep the loop
+ * acceptable to the eBPF verifier - confirm before making it depend on len.
+ */
+static inline
+void net_toeplitz_add(__u32 *result,
+                      __u8 *input,
+                      __u32 len,
+                      struct toeplitz_key_data_t *key) {
+
+    __u32 accumulator = *result;
+    __u32 leftmost_32_bits = key->leftmost_32_bits;
+    __u32 byte;
+
+    for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
+        __u8 input_byte = input[byte];
+        __u8 key_byte = key->next_byte[byte];
+        __u8 bit;
+
+        for (bit = 0; bit < 8; bit++) {
+            /* Most significant input bit first. */
+            if (input_byte & (1 << 7)) {
+                accumulator ^= leftmost_32_bits;
+            }
+
+            /* Slide the key window left by one, pulling in the next key bit. */
+            leftmost_32_bits =
+                (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
+
+            input_byte <<= 1;
+            key_byte <<= 1;
+        }
+    }
+
+    *result = accumulator;
+}
+
+
+/*
+ * Return 1 when hdr_type is one of the protocol numbers that
+ * parse_ipv6_ext() walks through as an IPv6 extension header, 0 otherwise.
+ */
+static inline int ip6_extension_header_type(__u8 hdr_type)
+{
+    switch (hdr_type) {
+    case IPPROTO_HOPOPTS:
+    case IPPROTO_ROUTING:
+    case IPPROTO_FRAGMENT:
+    case IPPROTO_ICMPV6:
+    case IPPROTO_NONE:
+    case IPPROTO_DSTOPTS:
+    case IPPROTO_MH:
+        return 1;
+    default:
+        return 0;
+    }
+}
+/*
+ * According to
+ * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
+ * we expect that there would be no more than 11 extensions in IPv6 header,
+ * also there are 27 TLV options for Destination and Hop-by-hop extensions.
+ * Need to choose reasonable amount of maximum extensions/options we may
+ * check to find ext src/dst.
+ */
+#define IP6_EXTENSIONS_COUNT 11
+#define IP6_OPTIONS_COUNT 30
+
+/*
+ * Bits of the host-order IPv4 frag_off field that mark a fragment: the
+ * "more fragments" flag (0x2000) and the 13-bit fragment offset (0x1FFF).
+ * The "don't fragment" flag (0x4000) is deliberately left out.
+ */
+#define RSS_IP4_FRAG_MASK 0x3FFF
+
+/*
+ * Walk the IPv6 extension header chain that starts at *l4_offset (relative
+ * to the network header).
+ *
+ * On return *l4_protocol and *l4_offset describe the first header that is
+ * not treated as an extension, or wherever the walk stopped after
+ * IP6_EXTENSIONS_COUNT headers. Along the way:
+ *  - a type 2 routing header with one segment left fills info->in6_ext_dst;
+ *  - a Home Address option in a destination options header fills
+ *    info->in6_ext_src;
+ *  - a fragment header sets info->is_fragmented.
+ *
+ * Returns 0 on success or the error from bpf_skb_load_bytes_relative().
+ */
+static inline int parse_ipv6_ext(struct __sk_buff *skb,
+        struct packet_hash_info_t *info,
+        __u8 *l4_protocol, size_t *l4_offset)
+{
+    int err = 0;
+
+    if (!ip6_extension_header_type(*l4_protocol)) {
+        return 0;
+    }
+
+    struct ipv6_opt_hdr ext_hdr = {};
+
+    for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
+
+        err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
+                                    sizeof(ext_hdr), BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+
+        if (*l4_protocol == IPPROTO_ROUTING) {
+            struct ipv6_rt_hdr ext_rt = {};
+
+            err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
+                                        sizeof(ext_rt), BPF_HDR_START_NET);
+            if (err) {
+                return err;
+            }
+
+            if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
+                    (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
+                    (ext_rt.segments_left == 1)) {
+
+                err = bpf_skb_load_bytes_relative(skb,
+                    *l4_offset + offsetof(struct rt2_hdr, addr),
+                    &info->in6_ext_dst, sizeof(info->in6_ext_dst),
+                    BPF_HDR_START_NET);
+                if (err) {
+                    return err;
+                }
+
+                info->is_ipv6_ext_dst = 1;
+            }
+
+        } else if (*l4_protocol == IPPROTO_DSTOPTS) {
+            struct ipv6_opt_t {
+                __u8 type;
+                __u8 length;
+            } __attribute__((packed)) opt = {};
+
+            /* Options start right after the two-byte extension header. */
+            size_t opt_offset = sizeof(ext_hdr);
+
+            for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
+                err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
+                                        &opt, sizeof(opt), BPF_HDR_START_NET);
+                if (err) {
+                    return err;
+                }
+
+                if (opt.type == IPV6_TLV_HAO) {
+                    err = bpf_skb_load_bytes_relative(skb,
+                        *l4_offset + opt_offset
+                        + offsetof(struct ipv6_destopt_hao, addr),
+                        &info->in6_ext_src, sizeof(info->in6_ext_src),
+                        BPF_HDR_START_NET);
+                    if (err) {
+                        return err;
+                    }
+
+                    info->is_ipv6_ext_src = 1;
+                    break;
+                }
+
+                /* Pad1 is a single byte; every other option is type+len+data. */
+                opt_offset += (opt.type == IPV6_TLV_PAD1) ?
+                              1 : opt.length + sizeof(opt);
+
+                if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
+                    break;
+                }
+            }
+        } else if (*l4_protocol == IPPROTO_FRAGMENT) {
+            info->is_fragmented = true;
+        }
+
+        /* hdrlen counts 8-byte units beyond the first 8 bytes. */
+        *l4_protocol = ext_hdr.nexthdr;
+        *l4_offset += (ext_hdr.hdrlen + 1) * 8;
+
+        if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Read the EtherType of the frame, looking past one 802.1Q tag or an
+ * 802.1ad tag followed by one more tag.
+ *
+ * Returns the EtherType as read from the frame (network byte order), or 0
+ * when the bytes cannot be loaded.
+ */
+static __be16 parse_eth_type(struct __sk_buff *skb)
+{
+    /* EtherType follows the two 6-byte MAC addresses. */
+    unsigned int offset = 12;
+    __be16 ret = 0;
+    int err = 0;
+
+    err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
+                                BPF_HDR_START_MAC);
+    if (err) {
+        return 0;
+    }
+
+    switch (bpf_ntohs(ret)) {
+    case ETH_P_8021AD:
+        offset += 4;
+        /* fallthrough: skip the second tag as well */
+    case ETH_P_8021Q:
+        offset += 4;
+        err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
+                                    BPF_HDR_START_MAC);
+        break;
+    default:
+        break;
+    }
+
+    if (err) {
+        return 0;
+    }
+
+    return ret;
+}
+
+/*
+ * Fill info from the L3 and L4 headers of skb.
+ *
+ * IPv4 and IPv6 addresses are always recorded. TCP/UDP ports are recorded
+ * only when the packet is not a fragment. Packets of any other EtherType
+ * leave info untouched and still return 0.
+ *
+ * Returns 0 on success, -1 for NULL arguments or an unreadable EtherType,
+ * or the error from bpf_skb_load_bytes_relative().
+ */
+static inline int parse_packet(struct __sk_buff *skb,
+        struct packet_hash_info_t *info)
+{
+    int err = 0;
+
+    if (!info || !skb) {
+        return -1;
+    }
+
+    size_t l4_offset = 0;
+    __u8 l4_protocol = 0;
+    __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
+    if (l3_protocol == 0) {
+        return -1;
+    }
+
+    if (l3_protocol == ETH_P_IP) {
+        info->is_ipv4 = 1;
+
+        struct iphdr ip = {};
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
+                                    BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+
+        info->in_src = ip.saddr;
+        info->in_dst = ip.daddr;
+        /*
+         * Only MF or a non-zero offset makes this a fragment. Testing the
+         * whole field would also match DF, which whole packets routinely
+         * carry, and would stop their ports from being hashed.
+         */
+        info->is_fragmented =
+            !!(bpf_ntohs(ip.frag_off) & RSS_IP4_FRAG_MASK);
+
+        l4_protocol = ip.protocol;
+        l4_offset = ip.ihl * 4;
+    } else if (l3_protocol == ETH_P_IPV6) {
+        info->is_ipv6 = 1;
+
+        struct ipv6hdr ip6 = {};
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
+                                    BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+
+        info->in6_src = ip6.saddr;
+        info->in6_dst = ip6.daddr;
+
+        l4_protocol = ip6.nexthdr;
+        l4_offset = sizeof(ip6);
+
+        err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
+        if (err) {
+            return err;
+        }
+    }
+
+    if (l4_protocol != 0 && !info->is_fragmented) {
+        if (l4_protocol == IPPROTO_TCP) {
+            info->is_tcp = 1;
+
+            struct tcphdr tcp = {};
+            err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
+                                        BPF_HDR_START_NET);
+            if (err) {
+                return err;
+            }
+
+            info->src_port = tcp.source;
+            info->dst_port = tcp.dest;
+        } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
+            info->is_udp = 1;
+
+            struct udphdr udp = {};
+            err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
+                                        BPF_HDR_START_NET);
+            if (err) {
+                return err;
+            }
+
+            info->src_port = udp.source;
+            info->dst_port = udp.dest;
+        }
+    }
+
+    return 0;
+}
+
+/* Append the IPv4 source and destination addresses to the hash input. */
+static inline void rss_add_ipv4_addrs(__u8 *rss_input, size_t *bytes_written,
+                                      const struct packet_hash_info_t *info)
+{
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->in_src, sizeof(info->in_src));
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->in_dst, sizeof(info->in_dst));
+}
+
+/*
+ * Append the IPv6 source and destination addresses to the hash input.
+ * When use_ext is non-zero, an address found in an extension header
+ * replaces the corresponding address from the fixed header.
+ */
+static inline void rss_add_ipv6_addrs(__u8 *rss_input, size_t *bytes_written,
+                                      const struct packet_hash_info_t *info,
+                                      int use_ext)
+{
+    if (use_ext && info->is_ipv6_ext_src) {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_ext_src, sizeof(info->in6_ext_src));
+    } else {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_src, sizeof(info->in6_src));
+    }
+
+    if (use_ext && info->is_ipv6_ext_dst) {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_ext_dst, sizeof(info->in6_ext_dst));
+    } else {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_dst, sizeof(info->in6_dst));
+    }
+}
+
+/* Append the L4 source and destination ports to the hash input. */
+static inline void rss_add_ports(__u8 *rss_input, size_t *bytes_written,
+                                 const struct packet_hash_info_t *info)
+{
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->src_port, sizeof(info->src_port));
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->dst_port, sizeof(info->dst_port));
+}
+
+/*
+ * Calculate the Toeplitz hash of skb for the hash types enabled in config.
+ *
+ * For each IP version the first enabled type that matches the packet wins,
+ * in the order TCP, UDP, plain IP.
+ *
+ * Returns the hash, or 0 when the packet cannot be parsed or no enabled
+ * hash type matches it.
+ */
+static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
+        struct rss_config_t *config, struct toeplitz_key_data_t *toe)
+{
+    /* Zero-filled: net_toeplitz_add() always reads the whole buffer. */
+    __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
+    size_t bytes_written = 0;
+    __u32 result = 0;
+    int err = 0;
+    struct packet_hash_info_t packet_info = {};
+
+    err = parse_packet(skb, &packet_info);
+    if (err) {
+        return 0;
+    }
+
+    __u32 hash_types = config->hash_types;
+
+    if (packet_info.is_ipv4) {
+        if (packet_info.is_tcp &&
+            (hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) {
+
+            rss_add_ipv4_addrs(rss_input, &bytes_written, &packet_info);
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (packet_info.is_udp &&
+                   (hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) {
+
+            rss_add_ipv4_addrs(rss_input, &bytes_written, &packet_info);
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
+            rss_add_ipv4_addrs(rss_input, &bytes_written, &packet_info);
+        }
+    } else if (packet_info.is_ipv6) {
+        if (packet_info.is_tcp &&
+            (hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6)) {
+
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    !!(hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX));
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (packet_info.is_udp &&
+                   (hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6)) {
+
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    !!(hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX));
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    !!(hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX));
+        }
+    }
+
+    if (bytes_written) {
+        net_toeplitz_add(&result, rss_input, bytes_written, toe);
+    }
+
+    return result;
+}
+
+/*
+ * Steering entry point: pick the queue for skb.
+ *
+ * Returns -1 when the configuration or the Toeplitz key is missing from its
+ * map. Returns config->default_queue when redirection is disabled, the hash
+ * is 0, the indirection length is 0, or the table lookup fails. Otherwise
+ * returns the indirection table entry at (hash % indirections_len).
+ */
+SEC("tun_rss_steering")
+int tun_rss_steering_prog(struct __sk_buff *skb)
+{
+
+    struct rss_config_t *config;
+    struct toeplitz_key_data_t *toe;
+
+    __u32 key = 0;
+    __u32 hash = 0;
+
+    config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
+    toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
+
+    if (!config || !toe) {
+        return -1;
+    }
+
+    if (!config->redirect) {
+        return config->default_queue;
+    }
+
+    /*
+     * Read the length once: the value lives in map memory, and the check
+     * below must guard the very value used as the divisor.
+     */
+    __u16 indirections_len = config->indirections_len;
+
+    hash = calculate_rss_hash(skb, config, toe);
+    if (hash && indirections_len) {
+        __u32 table_idx = hash % indirections_len;
+        __u16 *queue = 0;
+
+        queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
+                                    &table_idx);
+
+        if (queue) {
+            return *queue;
+        }
+    }
+
+    return config->default_queue;
+}
+
+char _license[] SEC("license") = "GPL v2";