Merge branch 'fix-bpf_redirect'

Martin KaFai Lau says:

====================
bpf: Fix bpf_redirect to an ipip/ip6tnl dev

This patch set fixes a bug in bpf_redirect(dev, flags) when dev is an
ipip/ip6tnl.  The current problem is that IP-EthHdr-IP is sent out
instead of IP-IP.

Patch 1 adds a dev->type test similar to dev_is_mac_header_xmit()
in act_mirred.c, which is only available in net-next.  We can consider
refactoring it once this patch is pulled into net-next from net.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2016-11-12 23:38:08 -05:00
commit 79774d6bfa
7 changed files with 567 additions and 19 deletions

View File

@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);
/* Protocol-independent part of forwarding @skb to @dev.
 *
 * Orphans the skb frags and checks that the skb may be forwarded to @dev.
 * On failure the drop is counted in dev->rx_dropped, the skb is freed and
 * NET_RX_DROP is returned.  On success the skb is scrubbed, its priority
 * is cleared and 0 is returned.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
					       struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC))
		goto drop;
	if (unlikely(!is_skb_forwardable(dev, skb)))
		goto drop;

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	return 0;

drop:
	atomic_long_inc(&dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
extern int netdev_budget;

View File

@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
int ret = ____dev_forward_skb(dev, skb);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
skb_scrub_packet(skb, true);
skb->priority = 0;
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
return 0;
return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

View File

@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
return dev_forward_skb(dev, skb);
}
/* Receive-side redirect without any link-layer handling: run the common
 * forwarding checks and, if they pass, attach @skb to @dev and queue it
 * with netif_rx().  Returns the error from ____dev_forward_skb() when the
 * skb was dropped there, otherwise the netif_rx() result.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int err = ____dev_forward_skb(dev, skb);

	if (unlikely(err))
		return err;

	skb->dev = dev;
	return netif_rx(skb);
}
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
/* Redirect path selected by __bpf_redirect() for the tunnel/void/none
 * device types.  Pulls the bytes lying between the mac header and the
 * network header off the skb, drops the mac header, and then hands the
 * skb to the rx path (BPF_F_INGRESS set in @flags) or the tx path of @dev.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
/* skb->mac_len is not set on normal egress */
unsigned int mlen = skb->network_header - skb->mac_header;
__skb_pull(skb, mlen);
/* At ingress, the mac header has already been pulled once.
* At egress, skb_postpull_rcsum has to be done in case that
* the skb is originated from ingress (i.e. a forwarded skb)
* to ensure that rcsum starts at net header.
*/
if (!skb_at_tc_ingress(skb))
skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
skb_pop_mac_header(skb);
skb_reset_mac_len(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
}
/* Default redirect path: push the mac header back (with rcsum fix-up via
 * bpf_push_mac_rcsum()) and pass @skb to the rx path of @dev when
 * BPF_F_INGRESS is set in @flags, or to its tx path otherwise.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	bpf_push_mac_rcsum(skb);

	if (flags & BPF_F_INGRESS)
		return __bpf_rx_skb(dev, skb);

	return __bpf_tx_skb(dev, skb);
}
/* Pick the redirect path from the target device type: the tunnel, void
 * and none types listed below go through __bpf_redirect_no_mac(), every
 * other type through __bpf_redirect_common().
 */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
			  u32 flags)
{
	bool no_mac;

	switch (dev->type) {
	case ARPHRD_TUNNEL:
	case ARPHRD_TUNNEL6:
	case ARPHRD_SIT:
	case ARPHRD_IPGRE:
	case ARPHRD_VOID:
	case ARPHRD_NONE:
		no_mac = true;
		break;
	default:
		no_mac = false;
		break;
	}

	if (no_mac)
		return __bpf_redirect_no_mac(skb, dev, flags);

	return __bpf_redirect_common(skb, dev, flags);
}
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
return -ENOMEM;
}
bpf_push_mac_rcsum(clone);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
return __bpf_redirect(clone, dev, flags);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
bpf_push_mac_rcsum(skb);
return ri->flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
return __bpf_redirect(skb, dev, ri->flags);
}
static const struct bpf_func_proto bpf_redirect_proto = {

View File

@ -27,6 +27,7 @@ hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup
hostprogs-y += trace_event
hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@ -56,6 +57,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
test_current_task_under_cgroup_user.o
trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@ -72,6 +74,7 @@ always += test_probe_write_user_kern.o
always += trace_output_kern.o
always += tcbpf1_kern.o
always += tcbpf2_kern.o
always += tc_l2_redirect_kern.o
always += lathist_kern.o
always += offwaketime_kern.o
always += spintest_kern.o
@ -111,6 +114,7 @@ HOSTLOADLIBES_xdp2 += -lelf
HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
HOSTLOADLIBES_trace_event += -lelf
HOSTLOADLIBES_sampleip += -lelf
HOSTLOADLIBES_tc_l2_redirect += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang

173
samples/bpf/tc_l2_redirect.sh Executable file
View File

@ -0,0 +1,173 @@
#!/bin/bash
# The tc and ip binaries can be overridden through the TC and IP
# environment variables; otherwise the ones found in PATH are used.
[[ -z $TC ]] && TC='tc'
[[ -z $IP ]] && IP='ip'
# Userspace helper that stores an ifindex in the pinned map, and the BPF
# object file whose sections are attached as tc filters below.
REDIRECT_USER='./tc_l2_redirect'
REDIRECT_BPF='./tc_l2_redirect_kern.o'
# Remember the current sysctl values so that cleanup can restore them.
RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)
IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
# config_common <tun_type>
#
# Builds the topology shared by all tests:
#   - namespaces ns1 and ns2, each linked to the current namespace by a
#     veth pair (ve1 <-> vens1, ve2 <-> vens2);
#   - ns2 holds the 10.10.1.102 and 2401:face::66 addresses on lo, an ipip
#     device (ipt2) and an ip6tnl device (ip6t2);
#   - BPF filters on vens2 ingress (section drop_non_tun_vip) and on ve2
#     ingress (section l2_to_iptun_ingress_forward).
# <tun_type> selects which ns2 tunnel device carries the routes back to
# ns1: "ipip" uses ipt2, any other value uses ip6t2.
function config_common {
local tun_type=$1
# Namespaces and veth pairs
$IP netns add ns1
$IP netns add ns2
$IP link add ve1 type veth peer name vens1
$IP link add ve2 type veth peer name vens2
$IP link set dev ve1 up
$IP link set dev ve2 up
$IP link set dev ve1 mtu 1500
$IP link set dev ve2 mtu 1500
$IP link set dev vens1 netns ns1
$IP link set dev vens2 netns ns2
# ns1: addresses on vens1, default routes via the ve1 addresses
$IP -n ns1 link set dev lo up
$IP -n ns1 link set dev vens1 up
$IP -n ns1 addr add 10.1.1.101/24 dev vens1
$IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
$IP -n ns1 route add default via 10.1.1.1 dev vens1
$IP -n ns1 route add default via 2401:db01::1 dev vens1
# ns2: addresses on vens2 and lo, plus both tunnel devices towards ve2
$IP -n ns2 link set dev lo up
$IP -n ns2 link set dev vens2 up
$IP -n ns2 addr add 10.2.1.102/24 dev vens2
$IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
$IP -n ns2 addr add 10.10.1.102 dev lo
$IP -n ns2 addr add 2401:face::66/64 dev lo nodad
$IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
$IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
$IP -n ns2 link set dev ipt2 up
$IP -n ns2 link set dev ip6t2 up
# ns2: BPF filter on vens2 ingress
$IP netns exec ns2 $TC qdisc add dev vens2 clsact
$IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
# ns2: route the ns1 prefixes through the tunnel device under test and
# turn rp_filter off for it
if [[ $tun_type == "ipip" ]]; then
$IP -n ns2 route add 10.1.1.0/24 dev ipt2
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
else
$IP -n ns2 route add 10.1.1.0/24 dev ip6t2
$IP -n ns2 route add 2401:db01::/64 dev ip6t2
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
fi
# Current namespace: addresses on ve1/ve2 and the BPF filter on ve2 ingress
$IP addr add 10.1.1.1/24 dev ve1
$IP addr add 2401:db01::1/64 dev ve1 nodad
$IP addr add 10.2.1.1/24 dev ve2
$IP addr add 2401:db02::1/64 dev ve2 nodad
$TC qdisc add dev ve2 clsact
$TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward
# These two sysctls are restored by cleanup from the values saved at startup
sysctl -q -w net.ipv4.conf.all.rp_filter=0
sysctl -q -w net.ipv6.conf.all.forwarding=1
}
# cleanup
#
# Removes everything a test may have created (namespaces, veth and tunnel
# devices, the pinned tun_iface map) and restores the rp_filter and IPv6
# forwarding sysctls saved at startup.  Errors are ignored while it runs,
# since some of the objects may not exist.  On return 'set -e' is in
# effect, so any later failing command aborts the script.
function cleanup {
set +e
# When DEBUG is set, silence command tracing for the duration of the cleanup
[[ -z $DEBUG ]] || set +x
$IP netns delete ns1 >& /dev/null
$IP netns delete ns2 >& /dev/null
$IP link del ve1 >& /dev/null
$IP link del ve2 >& /dev/null
$IP link del ipt >& /dev/null
$IP link del ip6t >& /dev/null
sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
rm -f /sys/fs/bpf/tc/globals/tun_iface
# When DEBUG is set, turn command tracing (back) on
[[ -z $DEBUG ]] || set -x
set -e
}
# l2_to_ipip <dir>
#
# Tests redirecting from a veth device to an ipip device.  <dir> is
# "egress" to attach the redirect program to ve2 egress; any other value
# attaches it to ve1 ingress.  Prints "OK" after the pings and cleanup
# have completed; with 'set -e' left on by cleanup, a failing command
# ends the script before that.
function l2_to_ipip {
echo -n "l2_to_ipip $1: "
local dir=$1
config_common ipip
# ipip device created with the "external" keyword; its ifindex is the
# redirect target stored in the pinned map below
$IP link add ipt type ipip external
$IP link set dev ipt up
sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
sysctl -q -w net.ipv4.conf.ipt.forwarding=1
if [[ $dir == "egress" ]]; then
# ve2 already has a clsact qdisc from config_common
$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
sysctl -q -w net.ipv4.conf.ve1.forwarding=1
else
$TC qdisc add dev ve1 clsact
$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
fi
# Store the ifindex of ipt in the pinned tun_iface map
$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)
$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
if [[ $dir == "egress" ]]; then
# test direct egress to ve2 (i.e. not forwarding from
# ve1 to ve2).
ping -c1 10.10.1.102 >& /dev/null
fi
cleanup
echo "OK"
}
# l2_to_ip6tnl <dir>
#
# Tests redirecting from a veth device to an ip6tnl device, for both IPv4
# and IPv6 destinations.  <dir> is "egress" to attach the redirect program
# to ve2 egress; any other value attaches it to ve1 ingress.  Prints "OK"
# after the pings and cleanup have completed; with 'set -e' left on by
# cleanup, a failing command ends the script before that.
function l2_to_ip6tnl {
echo -n "l2_to_ip6tnl $1: "
local dir=$1
config_common ip6tnl
# ip6tnl device created with the "external" keyword; its ifindex is the
# redirect target stored in the pinned map below
$IP link add ip6t type ip6tnl mode any external
$IP link set dev ip6t up
sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
sysctl -q -w net.ipv4.conf.ip6t.forwarding=1
if [[ $dir == "egress" ]]; then
# ve2 already has a clsact qdisc from config_common
$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
$IP route add 2401:face::/64 via 2401:db02::66 dev ve2
$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
sysctl -q -w net.ipv4.conf.ve1.forwarding=1
else
$TC qdisc add dev ve1 clsact
$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
fi
# Store the ifindex of ip6t in the pinned tun_iface map
$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)
$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
$IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null
if [[ $dir == "egress" ]]; then
# test direct egress to ve2 (i.e. not forwarding from
# ve1 to ve2).
ping -c1 10.10.1.102 >& /dev/null
ping -6 -c1 2401:face::66 >& /dev/null
fi
cleanup
echo "OK"
}
# Start from a clean state; cleanup also leaves 'set -e' switched on for
# the rest of the run.
cleanup

# Optional positional arguments: $1 = list of test functions to run,
# $2 = list of directions.  ${N-default} only falls back when the argument
# is absent, so an explicitly empty argument is still taken as given.
test_names=${1-"l2_to_ipip l2_to_ip6tnl"}
test_dirs=${2-"ingress egress"}

# Run every selected test once per selected direction.
for test_fn in $test_names; do
	for direction in $test_dirs; do
		$test_fn $direction
	done
done

View File

@ -0,0 +1,236 @@
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/pkt_cls.h>
#include <net/ipv6.h>
#include "bpf_helpers.h"
#define _htonl __builtin_bswap32
#define PIN_GLOBAL_NS 2
/* Description of a BPF map as emitted into the "maps" ELF section (see
 * tun_iface below).
 * NOTE(review): the field layout presumably has to match what the tc ELF
 * loader expects -- confirm before reordering or resizing any member.
 */
struct bpf_elf_map {
__u32 type;
__u32 size_key;
__u32 size_value;
__u32 max_elem;
__u32 flags;
__u32 id;
__u32 pinning;
};
/* copy of 'struct ethhdr' without __packed
 *
 * The programs below overlay it on the start of the packet data to read
 * h_proto.
 */
struct eth_hdr {
unsigned char h_dest[ETH_ALEN];
unsigned char h_source[ETH_ALEN];
unsigned short h_proto;
};
/* One-element array map (int key -> int value).  Element 0 holds the
 * ifindex that the programs below pass to bpf_redirect().  The map is
 * declared with PIN_GLOBAL_NS pinning; the accompanying test script
 * accesses it as /sys/fs/bpf/tc/globals/tun_iface.
 */
struct bpf_elf_map SEC("maps") tun_iface = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(int),
.size_value = sizeof(int),
.pinning = PIN_GLOBAL_NS,
.max_elem = 1,
};
/* Tell whether @daddr belongs to the VIP range for the given ethertype.
 *
 * @eth_proto: ethertype as found in the frame (network byte order).
 * @daddr:     for ETH_P_IP the IPv4 destination address; for ETH_P_IPV6
 *             the first 32 bits of the IPv6 destination address.
 *
 * IPv4 matches 10.10.1.0/24, IPv6 matches addresses starting with
 * 2401:face.  Any other ethertype never matches.
 */
static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr)
{
	if (eth_proto == htons(ETH_P_IP)) {
		const __be32 net_mask = _htonl(0xffffff00);
		const __be32 vip_net = _htonl(0x0a0a0100);

		return (net_mask & daddr) == vip_net;
	}

	if (eth_proto == htons(ETH_P_IPV6))
		return daddr == _htonl(0x2401face);

	return false;
}
/* tc classifier, section "l2_to_iptun_ingress_forward".
 *
 * Reads the target ifindex from tun_iface[0].  Frames carrying an
 * encapsulated packet -- IPv4 with protocol IPIP, or IPv6 whose next
 * header is IPIP or IPV6 -- are logged and redirected to that device with
 * bpf_redirect(ifindex, BPF_F_INGRESS).  Everything else, including
 * frames too short to hold the headers being inspected and the case where
 * the map lookup fails, is let through with TC_ACT_OK.
 */
SEC("l2_to_iptun_ingress_forward")
int _l2_to_iptun_ingress_forward(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	struct eth_hdr *eth = data;
	void *data_end = (void *)(long)skb->data_end;
	int key = 0, *ifindex;

	/* The ethernet header must be inside the packet before it is read. */
	if (data + sizeof(*eth) > data_end)
		return TC_ACT_OK;

	ifindex = bpf_map_lookup_elem(&tun_iface, &key);
	if (!ifindex)
		return TC_ACT_OK;

	if (eth->h_proto == htons(ETH_P_IP)) {
		char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n";
		struct iphdr *iph = data + sizeof(*eth);

		if (data + sizeof(*eth) + sizeof(*iph) > data_end)
			return TC_ACT_OK;

		if (iph->protocol != IPPROTO_IPIP)
			return TC_ACT_OK;

		bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex,
				 _htonl(iph->daddr));
		return bpf_redirect(*ifindex, BPF_F_INGRESS);
	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
		char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n";
		struct ipv6hdr *ip6h = data + sizeof(*eth);

		if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
			return TC_ACT_OK;

		if (ip6h->nexthdr != IPPROTO_IPIP &&
		    ip6h->nexthdr != IPPROTO_IPV6)
			return TC_ACT_OK;

		bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex,
				 _htonl(ip6h->daddr.s6_addr32[0]),
				 _htonl(ip6h->daddr.s6_addr32[3]));
		return bpf_redirect(*ifindex, BPF_F_INGRESS);
	}

	return TC_ACT_OK;
}
/* tc classifier, section "l2_to_iptun_ingress_redirect".
 *
 * Reads the target ifindex from tun_iface[0].  IPv4 frames whose
 * destination is a VIP address (see is_vip_addr()) get a tunnel key with
 * remote address 10.2.1.102 attached and are redirected to that device
 * with bpf_redirect(ifindex, 0).  Everything else, including frames too
 * short to hold the headers being inspected and the case where the map
 * lookup fails, is let through with TC_ACT_OK.
 */
SEC("l2_to_iptun_ingress_redirect")
int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb)
{
	struct bpf_tunnel_key tkey = {};
	void *data = (void *)(long)skb->data;
	struct eth_hdr *eth = data;
	void *data_end = (void *)(long)skb->data_end;
	int key = 0, *ifindex;

	/* The ethernet header must be inside the packet before it is read. */
	if (data + sizeof(*eth) > data_end)
		return TC_ACT_OK;

	ifindex = bpf_map_lookup_elem(&tun_iface, &key);
	if (!ifindex)
		return TC_ACT_OK;

	if (eth->h_proto == htons(ETH_P_IP)) {
		char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
		struct iphdr *iph = data + sizeof(*eth);
		__be32 daddr;

		if (data + sizeof(*eth) + sizeof(*iph) > data_end)
			return TC_ACT_OK;

		/* Load from the IP header only after the bounds check above. */
		daddr = iph->daddr;
		if (!is_vip_addr(eth->h_proto, daddr))
			return TC_ACT_OK;

		bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex);
	} else {
		return TC_ACT_OK;
	}

	tkey.tunnel_id = 10000;
	tkey.tunnel_ttl = 64;
	tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */
	bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
	return bpf_redirect(*ifindex, 0);
}
/* tc classifier, section "l2_to_ip6tun_ingress_redirect".
 *
 * Reads the target ifindex from tun_iface[0].  IPv4 or IPv6 frames whose
 * destination is a VIP address (see is_vip_addr()) get a tunnel key with
 * an IPv6 remote address attached and are redirected to that device with
 * bpf_redirect(ifindex, 0).  Everything else is let through with
 * TC_ACT_OK.
 */
SEC("l2_to_ip6tun_ingress_redirect")
int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb)
{
struct bpf_tunnel_key tkey = {};
void *data = (void *)(long)skb->data;
struct eth_hdr *eth = data;
void *data_end = (void *)(long)skb->data_end;
int key = 0, *ifindex;
/* The ethernet header must be inside the packet before it is read. */
if (data + sizeof(*eth) > data_end)
return TC_ACT_OK;
ifindex = bpf_map_lookup_elem(&tun_iface, &key);
if (!ifindex)
return TC_ACT_OK;
if (eth->h_proto == htons(ETH_P_IP)) {
char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
struct iphdr *iph = data + sizeof(*eth);
if (data + sizeof(*eth) + sizeof(*iph) > data_end)
return TC_ACT_OK;
if (!is_vip_addr(eth->h_proto, iph->daddr))
return TC_ACT_OK;
bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr),
*ifindex);
} else if (eth->h_proto == htons(ETH_P_IPV6)) {
char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n";
struct ipv6hdr *ip6h = data + sizeof(*eth);
if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
return TC_ACT_OK;
/* Only the first 32 bits of the IPv6 destination are compared. */
if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
return TC_ACT_OK;
bpf_trace_printk(fmt6, sizeof(fmt6),
_htonl(ip6h->daddr.s6_addr32[0]), *ifindex);
} else {
return TC_ACT_OK;
}
tkey.tunnel_id = 10000;
tkey.tunnel_ttl = 64;
/* 2401:db02:0:0:0:0:0:66 */
tkey.remote_ipv6[0] = _htonl(0x2401db02);
tkey.remote_ipv6[1] = 0;
tkey.remote_ipv6[2] = 0;
tkey.remote_ipv6[3] = _htonl(0x00000066);
/* BPF_F_TUNINFO_IPV6 goes with the remote_ipv6 fields filled in above. */
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6);
return bpf_redirect(*ifindex, 0);
}
/* tc classifier, section "drop_non_tun_vip".
 *
 * Drops (TC_ACT_SHOT) IPv4 and IPv6 frames whose destination address in
 * the header directly following the ethernet header is a VIP address
 * (see is_vip_addr()).  Everything else, including frames too short to
 * hold the headers being inspected, is let through with TC_ACT_OK.
 */
SEC("drop_non_tun_vip")
int _drop_non_tun_vip(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	struct eth_hdr *eth = data;
	void *data_end = (void *)(long)skb->data_end;

	/* The ethernet header must be inside the packet before it is read. */
	if (data + sizeof(*eth) > data_end)
		return TC_ACT_OK;

	if (eth->h_proto == htons(ETH_P_IP)) {
		struct iphdr *iph = data + sizeof(*eth);

		if (data + sizeof(*eth) + sizeof(*iph) > data_end)
			return TC_ACT_OK;

		if (is_vip_addr(eth->h_proto, iph->daddr))
			return TC_ACT_SHOT;
	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
		struct ipv6hdr *ip6h = data + sizeof(*eth);

		if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
			return TC_ACT_OK;

		/* Only the first 32 bits of the IPv6 destination are compared. */
		if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
			return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/* License string emitted into the "license" ELF section of the object. */
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,73 @@
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/unistd.h>
#include <linux/bpf.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include "libbpf.h"
/* Print the command line summary to stdout.  The program name shown is
 * the one the sample is built and invoked as (tc_l2_redirect).
 */
static void usage(void)
{
	printf("Usage: tc_l2_redirect [...]\n");
	printf(" -U <file> Update an already pinned BPF array\n");
	printf(" -i <ifindex> Interface index\n");
	printf(" -h Display this help\n");
}
/* Store an interface index in element 0 of an already pinned BPF array.
 *
 * Options:
 *   -U <file>     path of the pinned array map (required)
 *   -i <ifindex>  non-negative decimal interface index (required)
 *   -h            print the usage text
 *
 * Returns 0 once the element has been updated.  Every other outcome --
 * usage printed (including -h), malformed ifindex, failure to open or
 * update the map -- returns a non-zero value.
 */
int main(int argc, char **argv)
{
	const char *pinned_file = NULL;
	int ifindex = -1;
	int array_key = 0;
	int array_fd = -1;
	int ret = -1;
	int opt;

	/* Only the options described by usage() are accepted. */
	while ((opt = getopt(argc, argv, "U:i:h")) != -1) {
		switch (opt) {
		/* General args */
		case 'U':
			pinned_file = optarg;
			break;
		case 'i': {
			char *end;
			long val;

			/* strtol() lets us reject empty, non-numeric, trailing
			 * garbage and out-of-range input, which atoi() would
			 * silently turn into some integer.
			 */
			errno = 0;
			val = strtol(optarg, &end, 10);
			ifindex = (int)val;
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    val < 0 || (long)ifindex != val) {
				fprintf(stderr, "invalid ifindex: %s\n", optarg);
				usage();
				goto out;
			}
			break;
		}
		case 'h':
		default:
			usage();
			goto out;
		}
	}

	/* Both -U and -i are mandatory. */
	if (ifindex < 0 || !pinned_file) {
		usage();
		goto out;
	}

	array_fd = bpf_obj_get(pinned_file);
	if (array_fd < 0) {
		fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
			pinned_file, strerror(errno), errno);
		goto out;
	}

	/* Element 0 of the array holds the ifindex to redirect to. */
	ret = bpf_update_elem(array_fd, &array_key, &ifindex, 0);
	if (ret) {
		perror("bpf_update_elem");
		goto out;
	}

out:
	if (array_fd != -1)
		close(array_fd);
	return ret;
}