diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index 7a79b3587dd3..f5d642c01dd3 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -228,6 +228,8 @@ x25.txt - general info on X.25 development. x25-iface.txt - description of the X.25 Packet Layer to LAPB device interface. +xfrm_device.txt + - description of XFRM offload API xfrm_proc.txt - description of the statistics package for XFRM. xfrm_sync.txt diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt new file mode 100644 index 000000000000..2d9d588cd34b --- /dev/null +++ b/Documentation/networking/xfrm_device.txt @@ -0,0 +1,132 @@ + +=============================================== +XFRM device - offloading the IPsec computations +=============================================== +Shannon Nelson + + +Overview +======== + +IPsec is a useful feature for securing network traffic, but the +computational cost is high: a 10Gbps link can easily be brought down +to under 1Gbps, depending on the traffic and link configuration. +Luckily, there are NICs that offer a hardware-based IPsec offload which +can radically increase throughput and decrease CPU utilization. The XFRM +Device interface allows NIC drivers to offer to the stack access to the +hardware offload. + +Userland access to the offload is typically through a system such as +libreswan or KAME/racoon, but the iproute2 'ip xfrm' command set can +be handy when experimenting. An example command might look something +like this: + + ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \ + reqid 0x07 replay-window 32 \ + aead 'rfc4106(gcm(aes))' 0x44434241343332312423222114131211f4f3f2f1 128 \ + sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \ + offload dev eth4 dir in + +Yes, that's ugly, but that's what shell scripts and/or libreswan are for. 
+ + + +Callbacks to implement +====================== + +/* from include/linux/netdevice.h */ +struct xfrmdev_ops { + int (*xdo_dev_state_add) (struct xfrm_state *x); + void (*xdo_dev_state_delete) (struct xfrm_state *x); + void (*xdo_dev_state_free) (struct xfrm_state *x); + bool (*xdo_dev_offload_ok) (struct sk_buff *skb, + struct xfrm_state *x); +}; + +The NIC driver offering IPsec offload will need to implement these +callbacks to make the offload available to the network stack's +XFRM subsystem. Additionally, the feature bits NETIF_F_HW_ESP and +NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload. + + + +Flow +==== + +At probe time and before the call to register_netdev(), the driver should +set up local data structures and XFRM callbacks, and set the feature bits. +The XFRM code's listener will finish the setup on NETDEV_REGISTER. + + adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops; + adapter->netdev->features |= NETIF_F_HW_ESP; + adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP; + +When new SAs are set up with a request for the "offload" feature, the +driver's xdo_dev_state_add() will be given the new SA to be offloaded +and an indication of whether it is for Rx or Tx. The driver should + - verify the algorithm is supported for offloads + - store the SA information (key, salt, target-ip, protocol, etc) + - enable the HW offload of the SA + +The driver can also set an offload_handle in the SA, an opaque void pointer +that can be used to convey context into the fast-path offload requests. + + xs->xso.offload_handle = context; + + +When the network stack is preparing an IPsec packet for an SA that has +been set up for offload, it first calls into xdo_dev_offload_ok() with +the skb and the intended offload state to ask the driver if the offload +will be serviceable. This can check the packet information to be sure the +offload can be supported (e.g. IPv4 or IPv6, no IPv4 options, etc) and +return true or false to signify its support. 
+ +When ready to send, the driver needs to inspect the Tx packet for the +offload information, including the opaque context, and set up the packet +send accordingly. + + xs = xfrm_input_state(skb); + context = xs->xso.offload_handle; + set up HW for send + +The stack has already inserted the appropriate IPsec headers in the +packet data, the offload just needs to do the encryption and fix up the +header values. + + +When a packet is received and the HW has indicated that it offloaded a +decryption, the driver needs to add a reference to the decoded SA into +the packet's skb. At this point the data should be decrypted but the +IPsec headers are still in the packet data; they are removed later up +the stack in xfrm_input(). + + find and hold the SA that was used for the Rx skb + get spi, protocol, and destination IP from packet headers + xs = find xs from (spi, protocol, dest_IP) + xfrm_state_hold(xs); + + store the state information into the skb + skb->sp = secpath_dup(skb->sp); + skb->sp->xvec[skb->sp->len++] = xs; + skb->sp->olen++; + + indicate the success and/or error status of the offload + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; + xo->status = crypto_status; + + hand the packet to napi_gro_receive() as usual + + +When the SA is removed by the user, the driver's xdo_dev_state_delete() +is asked to disable the offload. Later, xdo_dev_state_free() is called +from a garbage collection routine after all reference counts to the state +have been removed and any remaining resources can be cleared for the +offload state. How these are used by the driver will depend on specific +hardware needs. + +As a netdev is set to DOWN the XFRM stack's netdev listener will call +xdo_dev_state_delete() and xdo_dev_state_free() on any remaining offloaded +states. 
+ + diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index c61a7d46b412..00641b611aed 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -67,7 +67,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, /* We don't yet support UDP encapsulation, TFC padding and ESN. */ if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) - return 0; + return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 347ab31574d5..ac277b97e0d7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -231,7 +231,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; - x = xfrm_input_state(skb); family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 22e3350013b4..e3a5aca9cdda 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { - struct net *net = xp_net(pol); + struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 8b23c5bcf8e8..02501817227b 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -666,7 +666,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff if (unlikely(oseq < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; - + replay_esn->oseq_hi = oseq_hi; if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; @@ -678,7 +678,6 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } replay_esn->oseq = oseq; - replay_esn->oseq_hi = oseq_hi; if (xfrm_aevent_is_on(net)) x->repl->notify(x, 
XFRM_REPLAY_UPDATE); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 065d89606888..1b7856be3eeb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2048,6 +2048,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; + if (!optval && !optlen) { + xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); + xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); + __sk_dst_reset(sk); + return 0; + } + if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE;