From 628e341f319f1a64a4639088faba952e4ec8f0a8 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 14 Aug 2013 13:05:23 +0200 Subject: [PATCH 01/44] xfrm: make local error reporting more robust In xfrm4 and xfrm6 we need to take care of sockets of the other address family. This could happen because a 6in4 or 4in6 tunnel could get protected by ipsec. Because we don't want to have a run-time dependency on ipv6 when only using ipv4 xfrm we have to embed a pointer to the correct local_error function in xfrm_state_afinfo and look it up when returning an error depending on the socket address family. Thanks to vi0ss for the great bug report: v2: a) fix two more unsafe interpretations of skb->sk as ipv6 socket (xfrm6_local_dontfrag and __xfrm6_output) v3: a) add an EXPORT_SYMBOL_GPL(xfrm_local_error) to fix a link error when building ipv6 as a module (thanks to Steffen Klassert) Reported-by: Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 6 ++++++ net/ipv4/xfrm4_output.c | 12 ++++++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 10 ++++++---- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_output.c | 13 +++++++++++++ net/xfrm/xfrm_state.c | 7 ++----- 7 files changed, 39 insertions(+), 11 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 94ce082b29dc..e823786e7c66 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -341,10 +341,13 @@ struct xfrm_state_afinfo { struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); + void (*local_error)(struct sk_buff *skb, u32 mtu); }; extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); extern int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); +extern struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); +extern void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); extern void xfrm_state_delete_tunnel(struct xfrm_state *x); @@ -1477,6 
+1480,7 @@ extern int xfrm_input_resume(struct sk_buff *skb, int nexthdr); extern int xfrm_output_resume(struct sk_buff *skb, int err); extern int xfrm_output(struct sk_buff *skb); extern int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); +extern void xfrm_local_error(struct sk_buff *skb, int mtu); extern int xfrm4_extract_header(struct sk_buff *skb); extern int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, @@ -1497,6 +1501,7 @@ extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short fam extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler); extern int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler); +extern void xfrm4_local_error(struct sk_buff *skb, u32 mtu); extern int xfrm6_extract_header(struct sk_buff *skb); extern int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); @@ -1514,6 +1519,7 @@ extern int xfrm6_output(struct sk_buff *skb); extern int xfrm6_output_finish(struct sk_buff *skb); extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); +extern void xfrm6_local_error(struct sk_buff *skb, u32 mtu); #ifdef CONFIG_XFRM extern int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 327a617d594c..7a5491ffa4de 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -33,8 +33,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(dst); if (skb->len > mtu) { if (skb->sk) - ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, - inet_sk(skb->sk)->inet_dport, mtu); + xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -99,3 +98,12 @@ 
int xfrm4_output(struct sk_buff *skb) x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } + +void xfrm4_local_error(struct sk_buff *skb, u32 mtu) +{ + struct iphdr *hdr; + + hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); + ip_local_error(skb->sk, EMSGSIZE, hdr->daddr, + inet_sk(skb->sk)->inet_dport, mtu); +} diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 9258e751baba..0b2a0641526a 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, + .local_error = xfrm4_local_error, }; void __init xfrm4_state_init(void) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8755a3079d0f..b64fff30eb06 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -34,8 +34,10 @@ static int xfrm6_local_dontfrag(struct sk_buff *skb) struct sock *sk = skb->sk; if (sk) { - proto = sk->sk_protocol; + if (sk->sk_family != AF_INET6) + return 0; + proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) return inet6_sk(sk)->dontfrag; } @@ -54,7 +56,7 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) ipv6_local_rxpmtu(sk, &fl6, mtu); } -static void xfrm6_local_error(struct sk_buff *skb, u32 mtu) +void xfrm6_local_error(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; struct sock *sk = skb->sk; @@ -80,7 +82,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (xfrm6_local_dontfrag(skb)) xfrm6_local_rxpmtu(skb, mtu); else if (skb->sk) - xfrm6_local_error(skb, mtu); + xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; @@ -142,7 +144,7 @@ static int __xfrm6_output(struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; } else if (!skb->local_df && skb->len > mtu && skb->sk) { - xfrm6_local_error(skb, mtu); + 
xfrm_local_error(skb, mtu); return -EMSGSIZE; } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index d8c70b8efc24..3fc970135fc6 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -183,6 +183,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, + .local_error = xfrm6_local_error, }; int __init xfrm6_state_init(void) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index eb4a84288648..6f5fc612b162 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -214,5 +214,18 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) return inner_mode->afinfo->extract_output(x, skb); } +void xfrm_local_error(struct sk_buff *skb, int mtu) +{ + struct xfrm_state_afinfo *afinfo; + + afinfo = xfrm_state_get_afinfo(skb->sk->sk_family); + if (!afinfo) + return; + + afinfo->local_error(skb, mtu); + xfrm_state_put_afinfo(afinfo); +} + EXPORT_SYMBOL_GPL(xfrm_output); EXPORT_SYMBOL_GPL(xfrm_inner_extract_output); +EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78f66fa92449..54c0acd29468 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -39,9 +39,6 @@ static DEFINE_SPINLOCK(xfrm_state_lock); static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; -static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); -static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); - static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -1860,7 +1857,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); -static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) +struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct 
xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) @@ -1872,7 +1869,7 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) return afinfo; } -static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) +void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { rcu_read_unlock(); } From 0ea9d5e3e0e03a63b11392f5613378977dae7eca Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 13 Aug 2013 04:35:58 +0200 Subject: [PATCH 02/44] xfrm: introduce helper for safe determination of mtu skb->sk socket can be of AF_INET or AF_INET6 address family. Thus we always have to make sure we are referring to the correct interpretation of skb->sk. We only depend on header defines to query the mtu, so we don't introduce a new dependency to ipv6 by this change. Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/route.h | 8 ++++++++ include/net/xfrm.h | 12 ++++++++++++ net/ipv4/ip_output.c | 8 -------- net/ipv4/xfrm4_output.c | 4 +--- net/ipv6/xfrm6_output.c | 5 ++++- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 2ea40c1b5e00..afdeeb5bec25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -317,4 +317,12 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? 
+ skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +} + #endif /* _ROUTE_H */ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e823786e7c66..b41d2d10ff0e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -1723,4 +1724,15 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (sk && sk->sk_family == AF_INET6) + return ip6_skb_dst_mtu(skb); + else if (sk && sk->sk_family == AF_INET) + return ip_skb_dst_mtu(skb); + return dst_mtu(skb_dst(skb)); +} + #endif /* _NET_XFRM_H */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4bcabf3ab4ca..9ee17e3d11c3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) -{ - struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; - - return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? 
- skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); -} - static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7a5491ffa4de..80baf4a3b1b5 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -21,7 +21,6 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; - struct dst_entry *dst; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; @@ -29,8 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - dst = skb_dst(skb); - mtu = dst_mtu(dst); + mtu = xfrm_skb_dst_mtu(skb); if (skb->len > mtu) { if (skb->sk) xfrm_local_error(skb, mtu); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b64fff30eb06..3ac5ab264fed 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -138,7 +138,10 @@ static int __xfrm6_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu = ip6_skb_dst_mtu(skb); + int mtu = xfrm_skb_dst_mtu(skb); + + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); From 3d483058c8c8b87a167155ca9ddd776dd730bc39 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:46:52 +0200 Subject: [PATCH 03/44] ipv6: wire up skb->encapsulation When pushing a new header before current one call skb_reset_inner_headers to record the position of the inner headers in the various ipv6 tunnel protocols. We later need this to correctly identify the addresses needed to send back an error in the xfrm layer. This change is safe, because skb->protocol is always checked before dereferencing data from the inner protocol. 
Cc: Steffen Klassert Cc: YOSHIFUJI Hideaki Cc: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/ip6_gre.c | 5 +++++ net/ipv6/ip6_tunnel.c | 6 ++++++ net/ipv6/sit.c | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ecd60733e5e2..90747f1973fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -724,6 +724,11 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, gre_hlen); skb_reset_network_header(skb); skb_set_transport_header(skb, sizeof(*ipv6h)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1e55866cead7..46ba243605a3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1027,6 +1027,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a3437a4cd07e..fbfc5a83867f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -888,6 +888,11 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); From 5d0ff542d0264f61dc4bdb34eba39ffb4ea3bc23 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:46:57 +0200 Subject: [PATCH 04/44] ipv6: xfrm: dereference inner ipv6 header if 
encapsulated In xfrm6_local_error use inner_header if the packet was encapsulated. Cc: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 3ac5ab264fed..e092e306882d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -59,10 +59,12 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) void xfrm6_local_error(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; + const struct ipv6hdr *hdr; struct sock *sk = skb->sk; + hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb); fl6.fl6_dport = inet_sk(sk)->inet_dport; - fl6.daddr = ipv6_hdr(skb)->daddr; + fl6.daddr = hdr->daddr; ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } From 844d48746e4b281a933aedc0428048a1219b42f4 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 18 Aug 2013 13:47:01 +0200 Subject: [PATCH 05/44] xfrm: choose protocol family by skb protocol We need to choose the protocol family by skb->protocol. Otherwise we call the wrong xfrm{4,6}_local_error handler in case an ipv6 socket is used in ipv4 mode, in which case we should call down to xfrm4_local_error (ip6 sockets are a superset of ip4 ones). We are called before ip_output functions, so skb->protocol is not reset. 
Cc: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 ++-- net/xfrm/xfrm_output.c | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b41d2d10ff0e..ac5b02515355 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1728,9 +1728,9 @@ static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (sk && sk->sk_family == AF_INET6) + if (sk && skb->protocol == htons(ETH_P_IPV6)) return ip6_skb_dst_mtu(skb); - else if (sk && sk->sk_family == AF_INET) + else if (sk && skb->protocol == htons(ETH_P_IP)) return ip_skb_dst_mtu(skb); return dst_mtu(skb_dst(skb)); } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 6f5fc612b162..3bb2cdc13b46 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -216,9 +216,17 @@ int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) void xfrm_local_error(struct sk_buff *skb, int mtu) { + unsigned int proto; struct xfrm_state_afinfo *afinfo; - afinfo = xfrm_state_get_afinfo(skb->sk->sk_family); + if (skb->protocol == htons(ETH_P_IP)) + proto = AF_INET; + else if (skb->protocol == htons(ETH_P_IPV6)) + proto = AF_INET6; + else + return; + + afinfo = xfrm_state_get_afinfo(proto); if (!afinfo) return; From 2a3ba63c235fdcd37f6451bdf4a0c7865a3930cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 20 Aug 2013 11:28:50 +0200 Subject: [PATCH 06/44] mac80211: add missing channel context release IBSS needs to release the channel context when leaving but I evidently missed that. Fix it. 
Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index ea7b9c2c7e66..5e8bb3bee3c2 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1138,6 +1138,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); + ieee80211_vif_release_channel(sdata); synchronize_rcu(); kfree(presp); From 2dfca312a91631311c1cf7c090246cc8103de038 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 20 Aug 2013 19:43:54 +0200 Subject: [PATCH 07/44] mac80211: add a flag to indicate CCK support for HT clients brcm80211 cannot handle sending frames with CCK rates as part of an A-MPDU session. Other drivers may have issues too. Set the flag in all drivers that have been tested with CCK rates. This fixes a reported brcmsmac regression introduced in commit ef47a5e4f1aaf1d0e2e6875e34b2c9595897bef6 "mac80211/minstrel_ht: fix cck rate sampling" Cc: stable@vger.kernel.org # 3.10 Reported-by: Tom Gundersen Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath9k/init.c | 3 ++- drivers/net/wireless/ath/carl9170/main.c | 3 ++- drivers/net/wireless/rt2x00/rt2800lib.c | 3 ++- include/net/mac80211.h | 1 + net/mac80211/rc80211_minstrel_ht.c | 3 +++ 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 16f8b201642b..026a2a067b46 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -802,7 +802,8 @@ void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_SPECTRUM_MGMT | IEEE80211_HW_REPORTS_TX_ACK_STATUS | - IEEE80211_HW_SUPPORTS_RC_TABLE; + IEEE80211_HW_SUPPORTS_RC_TABLE | + 
IEEE80211_HW_SUPPORTS_HT_CCK_RATES; if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_HT) { hw->flags |= IEEE80211_HW_AMPDU_AGGREGATION; diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c index 4a33c6e39ca2..349fa22a921a 100644 --- a/drivers/net/wireless/ath/carl9170/main.c +++ b/drivers/net/wireless/ath/carl9170/main.c @@ -1860,7 +1860,8 @@ void *carl9170_alloc(size_t priv_size) IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC | IEEE80211_HW_SUPPORTS_RC_TABLE | - IEEE80211_HW_SIGNAL_DBM; + IEEE80211_HW_SIGNAL_DBM | + IEEE80211_HW_SUPPORTS_HT_CCK_RATES; if (!modparam_noht) { /* diff --git a/drivers/net/wireless/rt2x00/rt2800lib.c b/drivers/net/wireless/rt2x00/rt2800lib.c index 1f80ea5e29dd..1b41c8eda12d 100644 --- a/drivers/net/wireless/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/rt2x00/rt2800lib.c @@ -6133,7 +6133,8 @@ static int rt2800_probe_hw_mode(struct rt2x00_dev *rt2x00dev) IEEE80211_HW_SUPPORTS_PS | IEEE80211_HW_PS_NULLFUNC_STACK | IEEE80211_HW_AMPDU_AGGREGATION | - IEEE80211_HW_REPORTS_TX_ACK_STATUS; + IEEE80211_HW_REPORTS_TX_ACK_STATUS | + IEEE80211_HW_SUPPORTS_HT_CCK_RATES; /* * Don't set IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING for USB devices diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5b7a3dadadde..551ba6a6a073 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1499,6 +1499,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_RC_TABLE = 1<<24, IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF = 1<<25, IEEE80211_HW_TIMING_BEACON_ONLY = 1<<26, + IEEE80211_HW_SUPPORTS_HT_CCK_RATES = 1<<27, }; /** diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index f5aed963b22e..f3bbea1eb9e7 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -828,6 +828,9 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != IEEE80211_BAND_2GHZ) return; + if (!(mp->hw->flags & 
IEEE80211_HW_SUPPORTS_HT_CCK_RATES)) + return; + mi->cck_supported = 0; mi->cck_supported_short = 0; for (i = 0; i < 4; i++) { From 75a423f493ffdf741acae27bf179cd560f7813d7 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Wed, 21 Aug 2013 15:30:25 +0200 Subject: [PATCH 08/44] mac80211: ibss: fix ignored channel parameter my earlier patch "mac80211: change IBSS channel state to chandef" created a regression by ignoring the channel parameter in __ieee80211_sta_join_ibss, which breaks IBSS channel selection. This patch fixes this situation by using the right channel and adopting the selected bandwidth mode. Cc: stable@vger.kernel.org Signed-off-by: Simon Wunderlich Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 5e8bb3bee3c2..2d45643c964e 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -36,7 +36,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const int beacon_int, - struct ieee80211_channel *chan, + struct cfg80211_chan_def *req_chandef, const u32 basic_rates, const u16 capability, u64 tsf, bool creator) @@ -51,6 +51,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, u32 bss_change; u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; struct cfg80211_chan_def chandef; + struct ieee80211_channel *chan; struct beacon_data *presp; int frame_len; @@ -81,7 +82,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; - chandef = ifibss->chandef; + /* make a copy of the chandef, it could be modified below. 
*/ + chandef = *req_chandef; + chan = chandef.chan; if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef)) { chandef.width = NL80211_CHAN_WIDTH_20; chandef.center_freq1 = chan->center_freq; @@ -259,10 +262,12 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = container_of((void *)bss, struct cfg80211_bss, priv); struct ieee80211_supported_band *sband; + struct cfg80211_chan_def chandef; u32 basic_rates; int i, j; u16 beacon_int = cbss->beacon_interval; const struct cfg80211_bss_ies *ies; + enum nl80211_channel_type chan_type; u64 tsf; sdata_assert_lock(sdata); @@ -270,6 +275,26 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, if (beacon_int < 10) beacon_int = 10; + switch (sdata->u.ibss.chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef); + cfg80211_chandef_create(&chandef, cbss->channel, chan_type); + break; + case NL80211_CHAN_WIDTH_5: + case NL80211_CHAN_WIDTH_10: + cfg80211_chandef_create(&chandef, cbss->channel, + NL80211_CHAN_WIDTH_20_NOHT); + chandef.width = sdata->u.ibss.chandef.width; + break; + default: + /* fall back to 20 MHz for unsupported modes */ + cfg80211_chandef_create(&chandef, cbss->channel, + NL80211_CHAN_WIDTH_20_NOHT); + break; + } + sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; basic_rates = 0; @@ -294,7 +319,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, __ieee80211_sta_join_ibss(sdata, cbss->bssid, beacon_int, - cbss->channel, + &chandef, basic_rates, cbss->capability, tsf, false); @@ -736,7 +761,7 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) sdata->drop_unencrypted = 0; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, - ifibss->chandef.chan, ifibss->basic_rates, + &ifibss->chandef, ifibss->basic_rates, capability, 0, true); } From 
b2fcc0aee58a3435566dd6d8501a0b355552f28b Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 21 Aug 2013 10:18:19 +0200 Subject: [PATCH 09/44] iwl4965: fix rfkill set state regression My current 3.11 fix: commit 788f7a56fce1bcb2067b62b851a086fca48a0056 Author: Stanislaw Gruszka Date: Thu Aug 1 12:07:55 2013 +0200 iwl4965: reset firmware after rfkill off broke rfkill notification to user-space. I missed that bug, because I compiled without CONFIG_RFKILL, sorry about that. Cc: stable@vger.kernel.org Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/4965-mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlegacy/4965-mac.c b/drivers/net/wireless/iwlegacy/4965-mac.c index f2ed62e37340..7acf5ee23582 100644 --- a/drivers/net/wireless/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/iwlegacy/4965-mac.c @@ -4464,9 +4464,9 @@ il4965_irq_tasklet(struct il_priv *il) set_bit(S_RFKILL, &il->status); } else { clear_bit(S_RFKILL, &il->status); - wiphy_rfkill_set_hw_state(il->hw->wiphy, hw_rf_kill); il_force_reset(il, true); } + wiphy_rfkill_set_hw_state(il->hw->wiphy, hw_rf_kill); handled |= CSR_INT_BIT_RF_KILL; } From d2e9fc141e2aa21f4b35ee27072d84e9aa6e2ba0 Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Fri, 16 Aug 2013 21:39:40 +0200 Subject: [PATCH 10/44] ath9k_htc: Restore skb headroom when returning skb to mac80211 ath9k_htc adds padding between the 802.11 header and the payload during TX by moving the header. When handing the frame back to mac80211 for TX status handling the header is not moved back into its original position. This can result in a too small skb headroom when entering ath9k_htc again (due to a soft retransmission for example) causing an skb_under_panic oops. Fix this by moving the 802.11 header back into its original position before returning the frame to mac80211 as other drivers like rt2x00 or ath5k do. 
Reported-by: Marc Kleine-Budde Signed-off-by: Helmut Schaa Tested-by: Marc Kleine-Budde Signed-off-by: Marc Kleine-Budde Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/htc_drv_txrx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c index e602c9519709..c028df76b564 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_txrx.c @@ -448,6 +448,7 @@ static void ath9k_htc_tx_process(struct ath9k_htc_priv *priv, struct ieee80211_conf *cur_conf = &priv->hw->conf; bool txok; int slot; + int hdrlen, padsize; slot = strip_drv_header(priv, skb); if (slot < 0) { @@ -504,6 +505,15 @@ send_mac80211: ath9k_htc_tx_clear_slot(priv, slot); + /* Remove padding before handing frame back to mac80211 */ + hdrlen = ieee80211_get_hdrlen_from_skb(skb); + + padsize = hdrlen & 3; + if (padsize && skb->len > hdrlen + padsize) { + memmove(skb->data + padsize, skb->data, hdrlen); + skb_pull(skb, padsize); + } + /* Send status to mac80211 */ ieee80211_tx_status(priv->hw, skb); } From 19c361608ce3e73f352e323262f7e0a8264be3af Mon Sep 17 00:00:00 2001 From: Sujith Manoharan Date: Tue, 20 Aug 2013 10:05:59 +0530 Subject: [PATCH 11/44] ath9k: Enable PLL fix only for AR9340/AR9330 The PLL hang workaround is required only for AR9330 and AR9340. This issue was first observed on an AP121 and the WAR is enabled for AR9340 also (DB120 etc.), since it uses a PLL design identical to AR9330. This is not required for AR9485 and AR9550. Various bugs have been reported regarding this: https://bugzilla.redhat.com/show_bug.cgi?id=997217 https://bugzilla.redhat.com/show_bug.cgi?id=994648 Cc: stable@vger.kernel.org Signed-off-by: Sujith Manoharan Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath/ath9k/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 1737a3e33685..cb5a65553ac7 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -173,8 +173,7 @@ static void ath_restart_work(struct ath_softc *sc) { ieee80211_queue_delayed_work(sc->hw, &sc->tx_complete_work, 0); - if (AR_SREV_9340(sc->sc_ah) || AR_SREV_9485(sc->sc_ah) || - AR_SREV_9550(sc->sc_ah)) + if (AR_SREV_9340(sc->sc_ah) || AR_SREV_9330(sc->sc_ah)) ieee80211_queue_delayed_work(sc->hw, &sc->hw_pll_work, msecs_to_jiffies(ATH_PLL_WORK_INTERVAL)); From 5a25cf1e310888eb333f9e034be84a8117111d30 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 26 Aug 2013 12:31:19 +0200 Subject: [PATCH 12/44] xfrm: revert ipv4 mtu determination to dst_mtu In commit 0ea9d5e3e0e03a63b11392f5613378977dae7eca ("xfrm: introduce helper for safe determination of mtu") I switched the determination of ipv4 mtus from dst_mtu to ip_skb_dst_mtu. This was an error because in case of IP_PMTUDISC_PROBE we fall back to the interface mtu, which is never correct for ipv4 ipsec. This patch partly reverts 0ea9d5e3e0e03a63b11392f5613378977dae7eca ("xfrm: introduce helper for safe determination of mtu"). 
Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 ------------ net/ipv4/xfrm4_output.c | 2 +- net/ipv6/xfrm6_output.c | 8 +++++--- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ac5b02515355..e823786e7c66 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -1724,15 +1723,4 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } -static inline int xfrm_skb_dst_mtu(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - if (sk && skb->protocol == htons(ETH_P_IPV6)) - return ip6_skb_dst_mtu(skb); - else if (sk && skb->protocol == htons(ETH_P_IP)) - return ip_skb_dst_mtu(skb); - return dst_mtu(skb_dst(skb)); -} - #endif /* _NET_XFRM_H */ diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 80baf4a3b1b5..baa0f63731fd 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -28,7 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - mtu = xfrm_skb_dst_mtu(skb); + mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { if (skb->sk) xfrm_local_error(skb, mtu); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index e092e306882d..6cd625e37706 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -140,10 +140,12 @@ static int __xfrm6_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu = xfrm_skb_dst_mtu(skb); + int mtu; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + if (skb->protocol == htons(ETH_P_IPV6)) + mtu = ip6_skb_dst_mtu(skb); + else + mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); From 9c9c9ad5fae7e9ef56a38acb508a01919b225e9a Mon Sep 17 00:00:00 2001 From: 
Hannes Frederic Sowa Date: Mon, 26 Aug 2013 12:31:23 +0200 Subject: [PATCH 13/44] ipv6: set skb->protocol on tcp, raw and ip6_append_data generated skbs Currently we don't initialize skb->protocol when transmitting data via tcp, raw(with and without inclhdr) or udp+ufo or appending data directly to the socket transmit queue (via ip6_append_data). This needs to be done so that we can get the correct mtu in the xfrm layer. Setting of skb->protocol happens only in functions where we also have a transmitting socket and a new skb, so we don't overwrite old values. Cc: Steffen Klassert Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 3 +++ net/ipv6/raw.c | 1 + 2 files changed, 4 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6e3ddf806ec2..e7ceb6c871d1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -238,6 +238,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1057,6 +1058,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; + skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; } @@ -1359,6 +1361,7 @@ alloc_new_skb: /* * Fill in the control structures */ + skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; /* reserve for fragmentation and ipsec header */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index c45f7a5c36e9..cdaed47ba932 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -628,6 +628,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, goto error; skb_reserve(skb, hlen); + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark;
skb_dst_set(skb, &rt->dst); From dd5746bf6b48bb837e9f5af14b9b241fc4fdc1ef Mon Sep 17 00:00:00 2001 From: Sarveshwar Bandi Date: Fri, 23 Aug 2013 14:59:33 +0530 Subject: [PATCH 14/44] be2net: Check for POST state in suspend-resume sequence In suspend-resume sequence, the OS could attempt to initialize the controller before it is ready, check for POST state before going ahead. Signed-off-by: Sarveshwar Bandi Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 4559c35eea13..3d91a5ec61a4 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4373,6 +4373,10 @@ static int be_resume(struct pci_dev *pdev) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); + status = be_fw_wait_ready(adapter); + if (status) + return status; + /* tell fw we're ready to fire cmds */ status = be_cmd_fw_init(adapter); if (status) From d661684cf6820331feae71146c35da83d794467e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Aug 2013 11:39:15 -0700 Subject: [PATCH 15/44] net: Check the correct namespace when spoofing pid over SCM_RIGHTS This is a security bug. The follow-up will fix nsproxy to discourage this type of issue from happening again. Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/scm.c b/net/core/scm.c index 03795d0147f2..b4da80b1cc07 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -54,7 +54,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || - ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || From c2b1df2eb42978073ec27c99cc199d20ae48b849 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Aug 2013 11:39:16 -0700 Subject: [PATCH 16/44] Rename nsproxy.pid_ns to nsproxy.pid_ns_for_children nsproxy.pid_ns is *not* the task's pid namespace. The name should clarify that. This makes it more obvious that setns on a pid namespace is weird -- it won't change the pid namespace shown in procfs. Signed-off-by: Andy Lutomirski Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/linux/nsproxy.h | 6 +++++- kernel/fork.c | 5 +++-- kernel/nsproxy.c | 27 ++++++++++++++------------- kernel/pid_namespace.c | 4 ++-- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 10e5947491c7..b4ec59d159ac 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -14,6 +14,10 @@ struct fs_struct; * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * + * The pid namespace is an exception -- it's accessed using + * task_active_pid_ns. The pid namespace here is the + * namespace that children will use. + * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. 
@@ -27,7 +31,7 @@ struct nsproxy { struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; - struct pid_namespace *pid_ns; + struct pid_namespace *pid_ns_for_children; struct net *net_ns; }; extern struct nsproxy init_nsproxy; diff --git a/kernel/fork.c b/kernel/fork.c index e23bb19e2a3e..bf46287c91a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, * don't allow the creation of threads. */ if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && - (task_active_pid_ns(current) != current->nsproxy->pid_ns)) + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); @@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(p->nsproxy->pid_ns); + pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (!pid) goto bad_fork_cleanup_io; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0c..997cbb951a3b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -29,15 +29,15 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = { - .count = ATOMIC_INIT(1), - .uts_ns = &init_uts_ns, + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) - .ipc_ns = &init_ipc_ns, + .ipc_ns = &init_ipc_ns, #endif - .mnt_ns = NULL, - .pid_ns = &init_pid_ns, + .mnt_ns = NULL, + .pid_ns_for_children = &init_pid_ns, #ifdef CONFIG_NET - .net_ns = &init_net, + .net_ns = &init_net, #endif }; @@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) { - err = PTR_ERR(new_nsp->pid_ns); + new_nsp->pid_ns_for_children = + copy_pid_ns(flags, user_ns, 
tsk->nsproxy->pid_ns_for_children); + if (IS_ERR(new_nsp->pid_ns_for_children)) { + err = PTR_ERR(new_nsp->pid_ns_for_children); goto out_pid; } @@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: - if (new_nsp->pid_ns) - put_pid_ns(new_nsp->pid_ns); + if (new_nsp->pid_ns_for_children) + put_pid_ns(new_nsp->pid_ns_for_children); out_pid: if (new_nsp->ipc_ns) put_ipc_ns(new_nsp->ipc_ns); @@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns) put_uts_ns(ns->uts_ns); if (ns->ipc_ns) put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - put_pid_ns(ns->pid_ns); + if (ns->pid_ns_for_children) + put_pid_ns(ns->pid_ns_for_children); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..601bb361c235 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) if (ancestor != active) return -EINVAL; - put_pid_ns(nsproxy->pid_ns); - nsproxy->pid_ns = get_pid_ns(new); + put_pid_ns(nsproxy->pid_ns_for_children); + nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } From 449cfcc1cd28a1c16321c47b1e7e77138411a72a Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 23 Aug 2013 15:40:53 +0200 Subject: [PATCH 17/44] jme: lower NAPI weight Since commit 82dc3c63 ("net: introduce NAPI_POLL_WEIGHT") netif_napi_add() produces an error message if a NAPI poll weight greater than 64 is requested. jme requests a quarter of the rx ring size as the NAPI weight. jme's rx ring size is 1 << 9 = 512. Use the standard NAPI weight. v2: proper reference to the related commit Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller --- drivers/net/ethernet/jme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 7fbe6abf6054..23de82a9da82 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -3069,7 +3069,7 @@ jme_init_one(struct pci_dev *pdev, jwrite32(jme, JME_APMC, apmc); } - NETIF_NAPI_SET(netdev, &jme->napi, jme_poll, jme->rx_ring_size >> 2) + NETIF_NAPI_SET(netdev, &jme->napi, jme_poll, NAPI_POLL_WEIGHT) spin_lock_init(&jme->phy_lock); spin_lock_init(&jme->macaddr_lock); From 1e4a5282b4791fb9ba68478ecddae9642f9cfefb Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 23 Aug 2013 15:41:09 +0200 Subject: [PATCH 18/44] netxen: lower NAPI weight Since commit 82dc3c63 ("net: introduce NAPI_POLL_WEIGHT") netif_napi_add() produces an error message if a NAPI poll weight greater than 64 is requested. Use the standard NAPI weight. v2: proper reference to the related commit Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/netxen/netxen_nic.h | 1 - drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic.h b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h index 3fe09ab2d7c9..32675e16021e 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic.h +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h @@ -1171,7 +1171,6 @@ typedef struct { #define NETXEN_DB_MAPSIZE_BYTES 0x1000 -#define NETXEN_NETDEV_WEIGHT 128 #define NETXEN_ADAPTER_UP_MAGIC 777 #define NETXEN_NIC_PEG_TUNE 0 diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index c401b0b4353d..ec4cf7fd4123 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -197,7 +197,7 @@ netxen_napi_add(struct netxen_adapter *adapter, struct net_device *netdev) for (ring = 0; ring < adapter->max_sds_rings; ring++) { sds_ring = &recv_ctx->sds_rings[ring]; netif_napi_add(netdev, &sds_ring->napi, - netxen_nic_poll, NETXEN_NETDEV_WEIGHT); + netxen_nic_poll, NAPI_POLL_WEIGHT); } return 0; From ae24f261e11ec30d343c7c5ee805ff29782f726b Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 23 Aug 2013 15:41:19 +0200 Subject: [PATCH 19/44] ps3_gelic: lower NAPI weight Since commit 82dc3c63 ("net: introduce NAPI_POLL_WEIGHT") netif_napi_add() produces an error message if a NAPI poll weight greater than 64 is requested. GELIC_NET_NAPI_WEIGHT is defined to GELIC_NET_RX_DESCRIPTORS, which is 128. Use the standard NAPI weight. v2: proper reference to the related commit Signed-off-by: Michal Schmidt Acked-by: Geoff Levand Signed-off-by: David S. 
Miller --- drivers/net/ethernet/toshiba/ps3_gelic_net.c | 3 +-- drivers/net/ethernet/toshiba/ps3_gelic_net.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c index ad32af67e618..9c805e0c0cae 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c @@ -1466,8 +1466,7 @@ static void gelic_ether_setup_netdev_ops(struct net_device *netdev, { netdev->watchdog_timeo = GELIC_NET_WATCHDOG_TIMEOUT; /* NAPI */ - netif_napi_add(netdev, napi, - gelic_net_poll, GELIC_NET_NAPI_WEIGHT); + netif_napi_add(netdev, napi, gelic_net_poll, NAPI_POLL_WEIGHT); netdev->ethtool_ops = &gelic_ether_ethtool_ops; netdev->netdev_ops = &gelic_netdevice_ops; } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h index a93df6ac1909..309abb472aa2 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h @@ -37,7 +37,6 @@ #define GELIC_NET_RXBUF_ALIGN 128 #define GELIC_CARD_RX_CSUM_DEFAULT 1 /* hw chksum */ #define GELIC_NET_WATCHDOG_TIMEOUT 5*HZ -#define GELIC_NET_NAPI_WEIGHT (GELIC_NET_RX_DESCRIPTORS) #define GELIC_NET_BROADCAST_ADDR 0xffffffffffffL #define GELIC_NET_MC_COUNT_MAX 32 /* multicast address list */ From c730b170456d9139c3c1f8a9c1f91837be657d60 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Wed, 28 Aug 2013 01:13:00 +0300 Subject: [PATCH 20/44] bnx2x: vf mark stats started Solve issue where no stats were being collected for VF devices due to missing configuration in the stats' atomic synchronization mechanism. Signed-off-by: Ariel Elior Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c index d63d1327b051..fed9b1543b9f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c @@ -522,20 +522,16 @@ static void bnx2x_func_stats_init(struct bnx2x *bp) /* should be called under stats_sema */ static void __bnx2x_stats_start(struct bnx2x *bp) { - /* vfs travel through here as part of the statistics FSM, but no action - * is required - */ - if (IS_VF(bp)) - return; + if (IS_PF(bp)) { + if (bp->port.pmf) + bnx2x_port_stats_init(bp); - if (bp->port.pmf) - bnx2x_port_stats_init(bp); + else if (bp->func_stx) + bnx2x_func_stats_init(bp); - else if (bp->func_stx) - bnx2x_func_stats_init(bp); - - bnx2x_hw_stats_post(bp); - bnx2x_storm_stats_post(bp); + bnx2x_hw_stats_post(bp); + bnx2x_storm_stats_post(bp); + } bp->stats_started = true; } From 34d5626afc39c43d63ec7781b648091e92fae45a Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Aug 2013 01:13:01 +0300 Subject: [PATCH 21/44] bnx2x: Fix move FP memory deallocations If driver will fail to allocate all queues, it will shrink the number of queues and move the storage queue to its correct place (i.e., the last queue among the newly supported number). When changing the pointers of the new location of the FCoE queue, we need to pay special attention to the aggregations pointer - that memory is allocated during probe and released upon driver removal. Current implementation has 2 pointers pointing to the same chunk of allocated memory, meaning upon removal there will be two kfree() of the same chunk while the other won't be released. Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: Eilon Greenstein Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index f2d1ff10054b..26b4dfcc0087 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -53,6 +53,7 @@ static inline void bnx2x_move_fp(struct bnx2x *bp, int from, int to) struct bnx2x_fp_stats *to_fp_stats = &bp->fp_stats[to]; int old_max_eth_txqs, new_max_eth_txqs; int old_txdata_index = 0, new_txdata_index = 0; + struct bnx2x_agg_info *old_tpa_info = to_fp->tpa_info; /* Copy the NAPI object as it has been already initialized */ from_fp->napi = to_fp->napi; @@ -61,6 +62,11 @@ static inline void bnx2x_move_fp(struct bnx2x *bp, int from, int to) memcpy(to_fp, from_fp, sizeof(*to_fp)); to_fp->index = to; + /* Retain the tpa_info of the original `to' version as we don't want + * 2 FPs to contain the same tpa_info pointer. + */ + to_fp->tpa_info = old_tpa_info; + /* move sp_objs contents as well, as their indices match fp ones */ memcpy(to_sp_objs, from_sp_objs, sizeof(*to_sp_objs)); From 35a04aa35c2929f24c7f063f42b6d776ad848c24 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Wed, 28 Aug 2013 01:13:02 +0300 Subject: [PATCH 22/44] bnx2x: Fix functionality of configuring vlan list The check on return code of bnx2x_vfop_config_vlan0() would lead to error handling flow as the return value indicating an existing pending ramrod would be erroneously considered as an error. Signed-off-by: Ariel Elior Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 37 +------------------ 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index ad83f4b48777..b7efe27f845c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -522,23 +522,6 @@ static int bnx2x_vfop_set_user_req(struct bnx2x *bp, return 0; } -static int -bnx2x_vfop_config_vlan0(struct bnx2x *bp, - struct bnx2x_vlan_mac_ramrod_params *vlan_mac, - bool add) -{ - int rc; - - vlan_mac->user_req.cmd = add ? BNX2X_VLAN_MAC_ADD : - BNX2X_VLAN_MAC_DEL; - vlan_mac->user_req.u.vlan.vlan = 0; - - rc = bnx2x_config_vlan_mac(bp, vlan_mac); - if (rc == -EEXIST) - rc = 0; - return rc; -} - static int bnx2x_vfop_config_list(struct bnx2x *bp, struct bnx2x_vfop_filters *filters, struct bnx2x_vlan_mac_ramrod_params *vlan_mac) @@ -643,30 +626,14 @@ static void bnx2x_vfop_vlan_mac(struct bnx2x *bp, struct bnx2x_virtf *vf) case BNX2X_VFOP_VLAN_CONFIG_LIST: /* next state */ - vfop->state = BNX2X_VFOP_VLAN_CONFIG_LIST_0; + vfop->state = BNX2X_VFOP_VLAN_MAC_CHK_DONE; - /* remove vlan0 - could be no-op */ - vfop->rc = bnx2x_vfop_config_vlan0(bp, vlan_mac, false); - if (vfop->rc) - goto op_err; - - /* Do vlan list config. 
if this operation fails we try to - * restore vlan0 to keep the queue is working order - */ + /* do list config */ vfop->rc = bnx2x_vfop_config_list(bp, filters, vlan_mac); if (!vfop->rc) { set_bit(RAMROD_CONT, &vlan_mac->ramrod_flags); vfop->rc = bnx2x_config_vlan_mac(bp, vlan_mac); } - bnx2x_vfop_finalize(vf, vfop->rc, VFOP_CONT); /* fall-through */ - - case BNX2X_VFOP_VLAN_CONFIG_LIST_0: - /* next state */ - vfop->state = BNX2X_VFOP_VLAN_MAC_CHK_DONE; - - if (list_empty(&obj->head)) - /* add vlan0 */ - vfop->rc = bnx2x_vfop_config_vlan0(bp, vlan_mac, true); bnx2x_vfop_finalize(vf, vfop->rc, VFOP_DONE); default: From b4cddbd6dd9b3b9e08c26d8b7247e4e011092117 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Wed, 28 Aug 2013 01:13:03 +0300 Subject: [PATCH 23/44] bnx2x: Fix VF memory leak unload Due to incorrect VF/PF conditions, when unloading a VF it will not release part of the memory it has previously allocated. Signed-off-by: Ariel Elior Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 26b4dfcc0087..0cc26110868d 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2962,8 +2962,9 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link) if (IS_PF(bp)) { if (CNIC_LOADED(bp)) bnx2x_free_mem_cnic(bp); - bnx2x_free_mem(bp); } + bnx2x_free_mem(bp); + bp->state = BNX2X_STATE_CLOSED; bp->cnic_loaded = false; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 8bdc8b973007..1627a4e09c32 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -7855,12 +7855,15 @@ void bnx2x_free_mem(struct bnx2x *bp) { int i; - BNX2X_PCI_FREE(bp->def_status_blk, bp->def_status_blk_mapping, - sizeof(struct host_sp_status_block)); - BNX2X_PCI_FREE(bp->fw_stats, bp->fw_stats_mapping, bp->fw_stats_data_sz + bp->fw_stats_req_sz); + if (IS_VF(bp)) + return; + + BNX2X_PCI_FREE(bp->def_status_blk, bp->def_status_blk_mapping, + sizeof(struct host_sp_status_block)); + BNX2X_PCI_FREE(bp->slowpath, bp->slowpath_mapping, sizeof(struct bnx2x_slowpath)); From a3097bda78c7fb41fd3091ffb70bf7bd946e6997 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Wed, 28 Aug 2013 01:13:04 +0300 Subject: [PATCH 24/44] bnx2x: Fix VF stats sync Since the PF gathers statistics for the VF, when the VF is about to unload we must synchronize the release of its statistics buffer with the PF, so that no DMA operation will be made to that address after the buffer release. Signed-off-by: Ariel Elior Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 26 ++++++++++++++++++- .../net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 11 ++++++++ .../net/ethernet/broadcom/bnx2x/bnx2x_stats.h | 3 +++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index b7efe27f845c..e8706e19f96f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -2786,6 +2786,18 @@ int bnx2x_vf_init(struct bnx2x *bp, struct bnx2x_virtf *vf, dma_addr_t *sb_map) return 0; } +struct set_vf_state_cookie { + struct bnx2x_virtf *vf; + u8 state; +}; + +void bnx2x_set_vf_state(void *cookie) +{ + struct set_vf_state_cookie *p = (struct set_vf_state_cookie *)cookie; + + p->vf->state = p->state; +} + /* VFOP close (teardown the queues, delete mcasts and close HW) */ static void bnx2x_vfop_close(struct bnx2x *bp, struct bnx2x_virtf *vf) { @@ -2836,7 +2848,19 @@ static void bnx2x_vfop_close(struct bnx2x *bp, struct bnx2x_virtf *vf) op_err: BNX2X_ERR("VF[%d] CLOSE error: rc %d\n", vf->abs_vfid, vfop->rc); op_done: - vf->state = VF_ACQUIRED; + + /* need to make sure there are no outstanding stats ramrods which may + * cause the device to access the VF's stats buffer which it will free + * as soon as we return from the close flow. 
+ */ + { + struct set_vf_state_cookie cookie; + + cookie.vf = vf; + cookie.state = VF_ACQUIRED; + bnx2x_stats_safe_exec(bp, bnx2x_set_vf_state, &cookie); + } + DP(BNX2X_MSG_IOV, "set state to acquired\n"); bnx2x_vfop_end(bp, vf, vfop); } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c index fed9b1543b9f..86436c77af03 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c @@ -1993,3 +1993,14 @@ void bnx2x_afex_collect_stats(struct bnx2x *bp, void *void_afex_stats, estats->mac_discard); } } + +void bnx2x_stats_safe_exec(struct bnx2x *bp, + void (func_to_exec)(void *cookie), + void *cookie){ + if (down_timeout(&bp->stats_sema, HZ/10)) + BNX2X_ERR("Unable to acquire stats lock\n"); + bnx2x_stats_comp(bp); + func_to_exec(cookie); + __bnx2x_stats_start(bp); + up(&bp->stats_sema); +} diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h index 853824d258e8..f35845006cdd 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h @@ -539,6 +539,9 @@ struct bnx2x; void bnx2x_memset_stats(struct bnx2x *bp); void bnx2x_stats_init(struct bnx2x *bp); void bnx2x_stats_handle(struct bnx2x *bp, enum bnx2x_stats_event event); +void bnx2x_stats_safe_exec(struct bnx2x *bp, + void (func_to_exec)(void *cookie), + void *cookie); /** * bnx2x_save_statistics - save statistics when unloading. From 302a50bc941010d7a67f288fd0db31981e4d722d Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Aug 2013 08:47:14 +0200 Subject: [PATCH 25/44] xfrm: Fix potential null pointer dereference in xdst_queue_output The net_device might be not set on the skb when we try refcounting. This leads to a null pointer dereference in xdst_queue_output(). It turned out that the refcount to the net_device is not needed after all. 
The dst_entry has a refcount to the net_device before we queue the skb, so it can't go away. Therefore we can remove the refcount on queueing to fix the null pointer dereference. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e52cab3591dd..f77c371ea72b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -320,10 +320,8 @@ static void xfrm_queue_purge(struct sk_buff_head *list) { struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { - dev_put(skb->dev); + while ((skb = skb_dequeue(list)) != NULL) kfree_skb(skb); - } } /* Rule must be locked. Release descentant resources, announce @@ -1758,7 +1756,6 @@ static void xfrm_policy_queue_process(unsigned long arg) struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct net_device *dev; struct xfrm_policy *pol = (struct xfrm_policy *)arg; struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; @@ -1805,7 +1802,6 @@ static void xfrm_policy_queue_process(unsigned long arg) dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path, &fl, skb->sk, 0); if (IS_ERR(dst)) { - dev_put(skb->dev); kfree_skb(skb); continue; } @@ -1814,9 +1810,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - dev = skb->dev; err = dst_output(skb); - dev_put(dev); } return; @@ -1839,7 +1833,6 @@ static int xdst_queue_output(struct sk_buff *skb) } skb_dst_force(skb); - dev_hold(skb->dev); spin_lock_bh(&pq->hold_queue.lock); From 9b96309c5b0b9e466773c07a5bc8b7b68fcf010a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 23 Aug 2013 12:44:55 -0700 Subject: [PATCH 26/44] genl: Fix genl dumpit() locking. In case of genl-family with parallel ops off, dumpit() callback is expected to run under genl_lock, but commit def3117493eafd9df (genl: Allow concurrent genl callbacks.)
changed this behaviour where only first dumpit() op was called under genl-lock. For subsequent dump, only nlk->cb_lock was taken. Following patch fixes it by defining locked dumpit() and done() callback which takes care of genl-locking. CC: Jesse Gross CC: Johannes Berg Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 51 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 512718adb0d5..7e0f4d199ade 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -544,6 +544,30 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct genl_ops *ops = cb->data; + int rc; + + genl_lock(); + rc = ops->dumpit(skb, cb); + genl_unlock(); + return rc; +} + +static int genl_lock_done(struct netlink_callback *cb) +{ + struct genl_ops *ops = cb->data; + int rc = 0; + + if (ops->done) { + genl_lock(); + rc = ops->done(cb); + genl_unlock(); + } + return rc; +} + static int genl_family_rcv_msg(struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh) @@ -572,15 +596,32 @@ static int genl_family_rcv_msg(struct genl_family *family, return -EPERM; if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { - struct netlink_dump_control c = { - .dump = ops->dumpit, - .done = ops->done, - }; + int rc; if (ops->dumpit == NULL) return -EOPNOTSUPP; - return netlink_dump_start(net->genl_sock, skb, nlh, &c); + if (!family->parallel_ops) { + struct netlink_dump_control c = { + .data = ops, + .dump = genl_lock_dumpit, + .done = genl_lock_done, + }; + + genl_unlock(); + rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_lock(); + + } else { + struct netlink_dump_control c = { + .dump = ops->dumpit, + .done = ops->done, + }; + + rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + } + + return 
rc; } if (ops->doit == NULL) From 33c6b1f6b154894321f5734e50c66621e9134e7e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 23 Aug 2013 12:45:04 -0700 Subject: [PATCH 27/44] genl: Hold reference on correct module while netlink-dump. netlink dump operations take module as parameter to hold reference for entire netlink dump duration. Currently it holds ref only on genl module which is not correct when we use ops registered to genl from another module. Following patch adds module pointer to genl_ops so that netlink can hold ref count on it. CC: Jesse Gross CC: Johannes Berg Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/genetlink.h | 20 ++++++++++++++++++-- net/netlink/genetlink.c | 20 +++++++++++--------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 93024a47e0e2..8e0b6c856a13 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -61,6 +61,7 @@ struct genl_family { struct list_head ops_list; /* private */ struct list_head family_list; /* private */ struct list_head mcast_groups; /* private */ + struct module *module; }; /** @@ -121,9 +122,24 @@ struct genl_ops { struct list_head ops_list; }; -extern int genl_register_family(struct genl_family *family); -extern int genl_register_family_with_ops(struct genl_family *family, +extern int __genl_register_family(struct genl_family *family); + +static inline int genl_register_family(struct genl_family *family) +{ + family->module = THIS_MODULE; + return __genl_register_family(family); +} + +extern int __genl_register_family_with_ops(struct genl_family *family, struct genl_ops *ops, size_t n_ops); + +static inline int genl_register_family_with_ops(struct genl_family *family, + struct genl_ops *ops, size_t n_ops) +{ + family->module = THIS_MODULE; + return __genl_register_family_with_ops(family, ops, n_ops); +} + extern int genl_unregister_family(struct genl_family *family); extern int 
genl_register_ops(struct genl_family *, struct genl_ops *ops); extern int genl_unregister_ops(struct genl_family *, struct genl_ops *ops); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 7e0f4d199ade..0c741cec4d0d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -364,7 +364,7 @@ int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) EXPORT_SYMBOL(genl_unregister_ops); /** - * genl_register_family - register a generic netlink family + * __genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one @@ -374,7 +374,7 @@ EXPORT_SYMBOL(genl_unregister_ops); * * Return 0 on success or a negative error code. */ -int genl_register_family(struct genl_family *family) +int __genl_register_family(struct genl_family *family) { int err = -EINVAL; @@ -430,10 +430,10 @@ errout_locked: errout: return err; } -EXPORT_SYMBOL(genl_register_family); +EXPORT_SYMBOL(__genl_register_family); /** - * genl_register_family_with_ops - register a generic netlink family + * __genl_register_family_with_ops - register a generic netlink family * @family: generic netlink family * @ops: operations to be registered * @n_ops: number of elements to register @@ -457,12 +457,12 @@ EXPORT_SYMBOL(genl_register_family); * * Return 0 on success or a negative error code. 
*/ -int genl_register_family_with_ops(struct genl_family *family, +int __genl_register_family_with_ops(struct genl_family *family, struct genl_ops *ops, size_t n_ops) { int err, i; - err = genl_register_family(family); + err = __genl_register_family(family); if (err) return err; @@ -476,7 +476,7 @@ err_out: genl_unregister_family(family); return err; } -EXPORT_SYMBOL(genl_register_family_with_ops); +EXPORT_SYMBOL(__genl_register_family_with_ops); /** * genl_unregister_family - unregister generic netlink family @@ -603,22 +603,24 @@ static int genl_family_rcv_msg(struct genl_family *family, if (!family->parallel_ops) { struct netlink_dump_control c = { + .module = family->module, .data = ops, .dump = genl_lock_dumpit, .done = genl_lock_done, }; genl_unlock(); - rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_lock(); } else { struct netlink_dump_control c = { + .module = family->module, .dump = ops->dumpit, .done = ops->done, }; - rc = netlink_dump_start(net->genl_sock, skb, nlh, &c); + rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c); } return rc; From 64c3b252e9fc8256ef7b1b99ba60492163cd06e8 Mon Sep 17 00:00:00 2001 From: Byungho An Date: Sat, 24 Aug 2013 15:31:43 +0900 Subject: [PATCH 28/44] net: stmmac: fixed the pbl setting with DT This patch fixes the pbl (programmable burst length) setting when using DT. Although the default pbl is 8, if there is no pbl property in the device tree file, pbl is set to 0, which causes bandwidth degradation. Signed-off-by: Byungho An Signed-off-by: David S.
Miller --- .../ethernet/stmicro/stmmac/stmmac_platform.c | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 03de76c7a177..1c83a44c547b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -71,14 +71,18 @@ static int stmmac_probe_config_dt(struct platform_device *pdev, plat->force_sf_dma_mode = 1; } - dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); - if (!dma_cfg) - return -ENOMEM; - - plat->dma_cfg = dma_cfg; - of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); - dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst"); - dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); + if (of_find_property(np, "snps,pbl", NULL)) { + dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), + GFP_KERNEL); + if (!dma_cfg) + return -ENOMEM; + plat->dma_cfg = dma_cfg; + of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); + dma_cfg->fixed_burst = + of_property_read_bool(np, "snps,fixed-burst"); + dma_cfg->mixed_burst = + of_property_read_bool(np, "snps,mixed-burst"); + } return 0; } From 3046e2f5b79a86044ac0a29c69610d6ac6a4b882 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Sun, 25 Aug 2013 10:23:46 +0300 Subject: [PATCH 29/44] net: add cpu_relax to busy poll loop Add a cpu_relax() to sk_busy_loop. Julie Cummings reported performance issues when hyperthreading is on. Arjan van de Ven observed that we should have a cpu_relax() in the busy poll loop. Reported-by: Julie Cummings Signed-off-by: Eliezer Tamir Signed-off-by: David S.
Miller --- include/net/busy_poll.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8a358a2c97e6..829627d7b846 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -123,6 +123,7 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) /* local bh are disabled so it is ok to use _BH */ NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); + cpu_relax(); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); From 03803a59e32453ee5737c6096a295f748f03cc49 Mon Sep 17 00:00:00 2001 From: Rob Gardner Date: Sun, 25 Aug 2013 16:02:23 -0600 Subject: [PATCH 30/44] net: usb: Add HP hs2434 device to ZLP exception table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds another entry (HP hs2434 Mobile Broadband) to the list of exceptional devices that require a zero length packet in order to function properly. This list was added in commit 844e88f0. The hs2434 is manufactured by Sierra Wireless, who also produces the MC7710, which the ZLP exception list was created for in the first place. So hopefully it is just this one producer's devices that will need this workaround. Tested on a DM1-4310NR HP notebook, which does not function without this change. Signed-off-by: Rob Gardner Acked-by: Bjørn Mork Signed-off-by: David S. 
Miller --- drivers/net/usb/cdc_mbim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 872819851aef..25ba7eca9a13 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -400,6 +400,10 @@ static const struct usb_device_id mbim_devs[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x68a2, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, }, + /* HP hs2434 Mobile Broadband Module needs ZLPs */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3f0, 0x4b1d, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&cdc_mbim_info_zlp, + }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info, }, From 282a1dffc1b9976cdf1b0eea3f6f68fda23a7c7e Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Mon, 26 Aug 2013 11:30:55 +0800 Subject: [PATCH 31/44] net: xilinx: fix memleak decrease device_node refcount np1 in err case. Signed-off-by: Libo Chen Signed-off-by: David S. 
Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c b/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c index e90e1f46121e..64b4639f43b6 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c @@ -175,6 +175,7 @@ int axienet_mdio_setup(struct axienet_local *lp, struct device_node *np) printk(KERN_WARNING "Setting MDIO clock divisor to " "default %d\n", DEFAULT_CLOCK_DIVISOR); clk_div = DEFAULT_CLOCK_DIVISOR; + of_node_put(np1); goto issue; } From c7781a6e3c4a9a17e144ec2db00ebfea327bd627 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:20:40 +0400 Subject: [PATCH 32/44] tcp: initialize rcv_tstamp for restored sockets u32 rcv_tstamp; /* timestamp of last received ACK */ Its value is used in tcp_retransmit_timer, which closes the socket if the last ack was received more than TCP_RTO_MAX ago. Currently rcv_tstamp is initialized to zero and if tcp_retransmit_timer is called before receiving a first ack, the connection is closed. This patch initializes rcv_tstamp to a timestamp, when a socket was restored. Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reported-by: Cyrill Gorcunov Signed-off-by: Andrey Vagin Signed-off-by: David S.
Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa82..e2972993c671 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2814,6 +2814,8 @@ void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; From e3e12028315749b7fa2edbc37328e5847be9ede9 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:21:55 +0400 Subject: [PATCH 33/44] tcp: don't apply tsoffset if rcv_tsecr is zero The zero value means that tsecr is not valid, so it's a special case. tsoffset is used to customize tcp_time_stamp for one socket. tsoffset is usually zero, it's used when a socket was moved from one host to another host. Currently this issue affects logic of tcp_rcv_rtt_measure_ts. Due to incorrect value of rcv_tsecr, tcp_rcv_rtt_measure_ts sets rto to TCP_RTO_MAX. Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reported-by: Cyrill Gorcunov Signed-off-by: Andrey Vagin Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28af45abe062..3ca2139a130b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3535,7 +3535,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + if (*ptr) + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + else + tp->rx_opt.rcv_tsecr = 0; return true; } return false; @@ -3560,7 +3563,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, } tcp_parse_options(skb, &tp->rx_opt, 1, NULL); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; @@ -5316,7 +5319,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { From c27c9322d015dc1d9dfdf31724fca71c0476c4d1 Mon Sep 17 00:00:00 2001 From: Chris Clark Date: Tue, 27 Aug 2013 12:02:15 -0600 Subject: [PATCH 34/44] ipv4: sendto/hdrincl: don't use destination address found in header ipv4: raw_sendmsg: don't use header's destination address A sendto() regression was bisected and found to start with commit f8126f1d5136be1 (ipv4: Adjust semantics of rt->rt_gateway.) The problem is that it tries to ARP-lookup the constructed packet's destination address rather than the explicitly provided address. Fix this using FLOWI_FLAG_KNOWN_NH so that given nexthop is used. cf. commit 2ad5b9e4bd314fc685086b99e90e5de3bc59e26b Reported-by: Chris Clark Bisected-by: Chris Clark Tested-by: Chris Clark Suggested-by: Julian Anastasov Signed-off-by: Chris Clark Signed-off-by: David S. 
Miller --- net/ipv4/raw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dd44e0ab600c..61e60d67adca 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); if (!inet->hdrincl) { From 1f324e38870cc09659cf23bc626f1b8869e201f2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 28 Aug 2013 01:07:25 +0200 Subject: [PATCH 35/44] ipv6: Don't depend on per socket memory for neighbour discovery messages Allocating skbs when sending out neighbour discovery messages currently uses sock_alloc_send_skb() based on a per net namespace socket and thus shares a socket wmem buffer space. If a netdevice is temporarily unable to transmit due to carrier loss or for other reasons, the queued up ndisc messages will consume all of the wmem space and will thus prevent any more skbs from being allocated even for netdevices that are able to transmit packets. The number of neighbour discovery messages sent is very limited, simply use alloc_skb() and don't depend on any socket wmem space any longer. This patch has originally been posted by Eric Dumazet in a modified form. Signed-off-by: Thomas Graf Cc: Eric Dumazet Acked-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/ipv6/ndisc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d31c2fbef1..5cb98df966c2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -370,16 +370,12 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; - int err; - skb = sock_alloc_send_skb(sk, - hlen + sizeof(struct ipv6hdr) + len + tlen, - 1, &err); + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", - __func__, err); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", + __func__); return NULL; } From cc0fdd802859eaeb00e1c87dbb655594bed2844c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Fri, 30 Aug 2013 17:28:17 +0200 Subject: [PATCH 36/44] bridge: separate querier and query timer into IGMP/IPv4 and MLD/IPv6 ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we would still potentially suffer multicast packet loss if there is just either an IGMP or an MLD querier: For the former case, we would possibly drop IPv6 multicast packets, for the latter IPv4 ones. This is because we are currently assuming that if either an IGMP or MLD querier is present that the other one is present, too. This patch makes the behaviour and fix added in "bridge: disable snooping if there is no querier" (b00589af3b04) to also work if there is either just an IGMP or an MLD querier on the link: It refines the deactivation of the snooping to be protocol specific by using separate timers for the snooped IGMP and MLD queries as well as separate timers for our internal IGMP and MLD queriers. Signed-off-by: Linus Lüssing Signed-off-by: David S. 
Miller --- net/bridge/br_device.c | 2 +- net/bridge/br_input.c | 2 +- net/bridge/br_mdb.c | 14 +- net/bridge/br_multicast.c | 262 +++++++++++++++++++++++++++----------- net/bridge/br_private.h | 57 +++++++-- 5 files changed, 242 insertions(+), 95 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 69363bd37f64..89659d4ed1f9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -71,7 +71,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br)) + br_multicast_querier_exists(br, eth_hdr(skb))) br_multicast_deliver(mdst, skb); else br_flood_deliver(br, skb, false); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8c561c0aa636..a2fd37ec35f7 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -102,7 +102,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br)) { + br_multicast_querier_exists(br, eth_hdr(skb))) { if ((mdst && mdst->mglist) || br_multicast_is_router(br)) skb2 = skb; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 0daae3ec2355..6319c4333c39 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -414,16 +414,20 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; - if (timer_pending(&br->multicast_querier_timer)) - return -EBUSY; - ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) + if (ip.proto == htons(ETH_P_IP)) { + if (timer_pending(&br->ip4_querier.timer)) + return -EBUSY; + ip.u.ip4 = entry->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - else + } else { + if (timer_pending(&br->ip6_querier.timer)) + return -EBUSY; + ip.u.ip6 = entry->addr.u.ip6; #endif + } 
spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 08e576ada0b2..9d1d0e66c357 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -33,7 +33,8 @@ #include "br_private.h" -static void br_multicast_start_querier(struct net_bridge *br); +static void br_multicast_start_querier(struct net_bridge *br, + struct bridge_mcast_query *query); unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -755,20 +756,35 @@ static void br_multicast_local_router_expired(unsigned long data) { } -static void br_multicast_querier_expired(unsigned long data) +static void br_multicast_querier_expired(struct net_bridge *br, + struct bridge_mcast_query *query) { - struct net_bridge *br = (void *)data; - spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || br->multicast_disabled) goto out; - br_multicast_start_querier(br); + br_multicast_start_querier(br, query); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_querier_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + br_multicast_querier_expired(br, &br->ip4_query); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_querier_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + br_multicast_querier_expired(br, &br->ip6_query); +} +#endif + static void __br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *ip) @@ -789,37 +805,45 @@ static void __br_multicast_send_query(struct net_bridge *br, } static void br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, u32 sent) + struct net_bridge_port *port, + struct bridge_mcast_query *query) { unsigned long time; struct br_ip br_group; + struct bridge_mcast_querier *querier = NULL; if (!netif_running(br->dev) || br->multicast_disabled || - !br->multicast_querier || - 
timer_pending(&br->multicast_querier_timer)) + !br->multicast_querier) return; memset(&br_group.u, 0, sizeof(br_group.u)); - br_group.proto = htons(ETH_P_IP); - __br_multicast_send_query(br, port, &br_group); - + if (port ? (query == &port->ip4_query) : + (query == &br->ip4_query)) { + querier = &br->ip4_querier; + br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) - br_group.proto = htons(ETH_P_IPV6); - __br_multicast_send_query(br, port, &br_group); + } else { + querier = &br->ip6_querier; + br_group.proto = htons(ETH_P_IPV6); #endif + } + + if (!querier || timer_pending(&querier->timer)) + return; + + __br_multicast_send_query(br, port, &br_group); time = jiffies; - time += sent < br->multicast_startup_query_count ? + time += query->startup_sent < br->multicast_startup_query_count ? br->multicast_startup_query_interval : br->multicast_query_interval; - mod_timer(port ? &port->multicast_query_timer : - &br->multicast_query_timer, time); + mod_timer(&query->timer, time); } -static void br_multicast_port_query_expired(unsigned long data) +static void br_multicast_port_query_expired(struct net_bridge_port *port, + struct bridge_mcast_query *query) { - struct net_bridge_port *port = (void *)data; struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); @@ -827,25 +851,43 @@ static void br_multicast_port_query_expired(unsigned long data) port->state == BR_STATE_BLOCKING) goto out; - if (port->multicast_startup_queries_sent < - br->multicast_startup_query_count) - port->multicast_startup_queries_sent++; + if (query->startup_sent < br->multicast_startup_query_count) + query->startup_sent++; - br_multicast_send_query(port->br, port, - port->multicast_startup_queries_sent); + br_multicast_send_query(port->br, port, query); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_port_query_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + + br_multicast_port_query_expired(port, &port->ip4_query); +} + +#if 
IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_port_query_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + + br_multicast_port_query_expired(port, &port->ip6_query); +} +#endif + void br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = 1; setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); - setup_timer(&port->multicast_query_timer, - br_multicast_port_query_expired, (unsigned long)port); + setup_timer(&port->ip4_query.timer, br_ip4_multicast_port_query_expired, + (unsigned long)port); +#if IS_ENABLED(CONFIG_IPV6) + setup_timer(&port->ip6_query.timer, br_ip6_multicast_port_query_expired, + (unsigned long)port); +#endif } void br_multicast_del_port(struct net_bridge_port *port) @@ -853,13 +895,13 @@ void br_multicast_del_port(struct net_bridge_port *port) del_timer_sync(&port->multicast_router_timer); } -static void __br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable(struct bridge_mcast_query *query) { - port->multicast_startup_queries_sent = 0; + query->startup_sent = 0; - if (try_to_del_timer_sync(&port->multicast_query_timer) >= 0 || - del_timer(&port->multicast_query_timer)) - mod_timer(&port->multicast_query_timer, jiffies); + if (try_to_del_timer_sync(&query->timer) >= 0 || + del_timer(&query->timer)) + mod_timer(&query->timer, jiffies); } void br_multicast_enable_port(struct net_bridge_port *port) @@ -870,7 +912,10 @@ void br_multicast_enable_port(struct net_bridge_port *port) if (br->multicast_disabled || !netif_running(br->dev)) goto out; - __br_multicast_enable_port(port); + br_multicast_enable(&port->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_enable(&port->ip6_query); +#endif out: spin_unlock(&br->multicast_lock); @@ -889,7 +934,10 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (!hlist_unhashed(&port->rlist)) hlist_del_init_rcu(&port->rlist); 
del_timer(&port->multicast_router_timer); - del_timer(&port->multicast_query_timer); + del_timer(&port->ip4_query.timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&port->ip6_query.timer); +#endif spin_unlock(&br->multicast_lock); } @@ -1014,14 +1062,15 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, } #endif -static void br_multicast_update_querier_timer(struct net_bridge *br, - unsigned long max_delay) +static void +br_multicast_update_querier_timer(struct net_bridge *br, + struct bridge_mcast_querier *querier, + unsigned long max_delay) { - if (!timer_pending(&br->multicast_querier_timer)) - br->multicast_querier_delay_time = jiffies + max_delay; + if (!timer_pending(&querier->timer)) + querier->delay_time = jiffies + max_delay; - mod_timer(&br->multicast_querier_timer, - jiffies + br->multicast_querier_interval); + mod_timer(&querier->timer, jiffies + br->multicast_querier_interval); } /* @@ -1074,12 +1123,13 @@ timer: static void br_multicast_query_received(struct net_bridge *br, struct net_bridge_port *port, + struct bridge_mcast_querier *querier, int saddr, unsigned long max_delay) { if (saddr) - br_multicast_update_querier_timer(br, max_delay); - else if (timer_pending(&br->multicast_querier_timer)) + br_multicast_update_querier_timer(br, querier, max_delay); + else if (timer_pending(&querier->timer)) return; br_multicast_mark_router(br, port); @@ -1129,7 +1179,8 @@ static int br_ip4_multicast_query(struct net_bridge *br, IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } - br_multicast_query_received(br, port, !!iph->saddr, max_delay); + br_multicast_query_received(br, port, &br->ip4_querier, !!iph->saddr, + max_delay); if (!group) goto out; @@ -1206,8 +1257,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, max_delay = mld2q->mld2q_mrc ? 
MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; } - br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr), - max_delay); + br_multicast_query_received(br, port, &br->ip6_querier, + !ipv6_addr_any(&ip6h->saddr), max_delay); if (!group) goto out; @@ -1244,7 +1295,9 @@ out: static void br_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, + struct bridge_mcast_querier *querier, + struct bridge_mcast_query *query) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -1255,7 +1308,7 @@ static void br_multicast_leave_group(struct net_bridge *br, spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || (port && port->state == BR_STATE_DISABLED) || - timer_pending(&br->multicast_querier_timer)) + timer_pending(&querier->timer)) goto out; mdb = mlock_dereference(br->mdb, br); @@ -1263,14 +1316,13 @@ static void br_multicast_leave_group(struct net_bridge *br, if (!mp) goto out; - if (br->multicast_querier && - !timer_pending(&br->multicast_querier_timer)) { + if (br->multicast_querier) { __br_multicast_send_query(br, port, &mp->addr); time = jiffies + br->multicast_last_member_count * br->multicast_last_member_interval; - mod_timer(port ? &port->multicast_query_timer : - &br->multicast_query_timer, time); + + mod_timer(&query->timer, time); for (p = mlock_dereference(mp->ports, br); p != NULL; @@ -1323,7 +1375,6 @@ static void br_multicast_leave_group(struct net_bridge *br, mod_timer(&mp->timer, time); } } - out: spin_unlock(&br->multicast_lock); } @@ -1334,6 +1385,8 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; + struct bridge_mcast_query *query = port ? 
&port->ip4_query : + &br->ip4_query; if (ipv4_is_local_multicast(group)) return; @@ -1342,7 +1395,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group); + br_multicast_leave_group(br, port, &br_group, &br->ip4_querier, query); } #if IS_ENABLED(CONFIG_IPV6) @@ -1352,6 +1405,9 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, __u16 vid) { struct br_ip br_group; + struct bridge_mcast_query *query = port ? &port->ip6_query : + &br->ip6_query; + if (!ipv6_is_transient_multicast(group)) return; @@ -1360,7 +1416,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group); + br_multicast_leave_group(br, port, &br_group, &br->ip6_querier, query); } #endif @@ -1622,20 +1678,33 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, return 0; } -static void br_multicast_query_expired(unsigned long data) +static void br_multicast_query_expired(struct net_bridge *br, + struct bridge_mcast_query *query) +{ + spin_lock(&br->multicast_lock); + if (query->startup_sent < br->multicast_startup_query_count) + query->startup_sent++; + + br_multicast_send_query(br, NULL, query); + spin_unlock(&br->multicast_lock); +} + +static void br_ip4_multicast_query_expired(unsigned long data) { struct net_bridge *br = (void *)data; - spin_lock(&br->multicast_lock); - if (br->multicast_startup_queries_sent < - br->multicast_startup_query_count) - br->multicast_startup_queries_sent++; - - br_multicast_send_query(br, NULL, br->multicast_startup_queries_sent); - - spin_unlock(&br->multicast_lock); + br_multicast_query_expired(br, &br->ip4_query); } +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_query_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + br_multicast_query_expired(br, &br->ip6_query); +} 
+#endif + void br_multicast_init(struct net_bridge *br) { br->hash_elasticity = 4; @@ -1654,25 +1723,43 @@ void br_multicast_init(struct net_bridge *br) br->multicast_querier_interval = 255 * HZ; br->multicast_membership_interval = 260 * HZ; - br->multicast_querier_delay_time = 0; + br->ip4_querier.delay_time = 0; +#if IS_ENABLED(CONFIG_IPV6) + br->ip6_querier.delay_time = 0; +#endif spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, br_multicast_local_router_expired, 0); - setup_timer(&br->multicast_querier_timer, - br_multicast_querier_expired, (unsigned long)br); - setup_timer(&br->multicast_query_timer, br_multicast_query_expired, + setup_timer(&br->ip4_querier.timer, br_ip4_multicast_querier_expired, (unsigned long)br); + setup_timer(&br->ip4_query.timer, br_ip4_multicast_query_expired, + (unsigned long)br); +#if IS_ENABLED(CONFIG_IPV6) + setup_timer(&br->ip6_querier.timer, br_ip6_multicast_querier_expired, + (unsigned long)br); + setup_timer(&br->ip6_query.timer, br_ip6_multicast_query_expired, + (unsigned long)br); +#endif } -void br_multicast_open(struct net_bridge *br) +static void __br_multicast_open(struct net_bridge *br, + struct bridge_mcast_query *query) { - br->multicast_startup_queries_sent = 0; + query->startup_sent = 0; if (br->multicast_disabled) return; - mod_timer(&br->multicast_query_timer, jiffies); + mod_timer(&query->timer, jiffies); +} + +void br_multicast_open(struct net_bridge *br) +{ + __br_multicast_open(br, &br->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + __br_multicast_open(br, &br->ip6_query); +#endif } void br_multicast_stop(struct net_bridge *br) @@ -1684,8 +1771,12 @@ void br_multicast_stop(struct net_bridge *br) int i; del_timer_sync(&br->multicast_router_timer); - del_timer_sync(&br->multicast_querier_timer); - del_timer_sync(&br->multicast_query_timer); + del_timer_sync(&br->ip4_querier.timer); + del_timer_sync(&br->ip4_query.timer); +#if IS_ENABLED(CONFIG_IPV6) + 
del_timer_sync(&br->ip6_querier.timer); + del_timer_sync(&br->ip6_query.timer); +#endif spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -1788,18 +1879,24 @@ unlock: return err; } -static void br_multicast_start_querier(struct net_bridge *br) +static void br_multicast_start_querier(struct net_bridge *br, + struct bridge_mcast_query *query) { struct net_bridge_port *port; - br_multicast_open(br); + __br_multicast_open(br, query); list_for_each_entry(port, &br->port_list, list) { if (port->state == BR_STATE_DISABLED || port->state == BR_STATE_BLOCKING) continue; - __br_multicast_enable_port(port); + if (query == &br->ip4_query) + br_multicast_enable(&port->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + else + br_multicast_enable(&port->ip6_query); +#endif } } @@ -1834,7 +1931,10 @@ rollback: goto rollback; } - br_multicast_start_querier(br); + br_multicast_start_querier(br, &br->ip4_query); +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_start_querier(br, &br->ip6_query); +#endif unlock: spin_unlock_bh(&br->multicast_lock); @@ -1857,10 +1957,18 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val) goto unlock; max_delay = br->multicast_query_response_interval; - if (!timer_pending(&br->multicast_querier_timer)) - br->multicast_querier_delay_time = jiffies + max_delay; - br_multicast_start_querier(br); + if (!timer_pending(&br->ip4_querier.timer)) + br->ip4_querier.delay_time = jiffies + max_delay; + + br_multicast_start_querier(br, &br->ip4_query); + +#if IS_ENABLED(CONFIG_IPV6) + if (!timer_pending(&br->ip6_querier.timer)) + br->ip6_querier.delay_time = jiffies + max_delay; + + br_multicast_start_querier(br, &br->ip6_query); +#endif unlock: spin_unlock_bh(&br->multicast_lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2f7da41851bf..263ba9034468 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -66,6 +66,20 @@ struct br_ip __u16 vid; }; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +/* our 
own querier */ +struct bridge_mcast_query { + struct timer_list timer; + u32 startup_sent; +}; + +/* other querier */ +struct bridge_mcast_querier { + struct timer_list timer; + unsigned long delay_time; +}; +#endif + struct net_port_vlans { u16 port_idx; u16 pvid; @@ -162,10 +176,12 @@ struct net_bridge_port #define BR_FLOOD 0x00000040 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - u32 multicast_startup_queries_sent; + struct bridge_mcast_query ip4_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_query ip6_query; +#endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; struct timer_list multicast_router_timer; - struct timer_list multicast_query_timer; struct hlist_head mglist; struct hlist_node rlist; #endif @@ -258,7 +274,6 @@ struct net_bridge u32 hash_max; u32 multicast_last_member_count; - u32 multicast_startup_queries_sent; u32 multicast_startup_query_count; unsigned long multicast_last_member_interval; @@ -267,15 +282,18 @@ struct net_bridge unsigned long multicast_query_interval; unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - unsigned long multicast_querier_delay_time; spinlock_t multicast_lock; struct net_bridge_mdb_htable __rcu *mdb; struct hlist_head router_list; struct timer_list multicast_router_timer; - struct timer_list multicast_querier_timer; - struct timer_list multicast_query_timer; + struct bridge_mcast_querier ip4_querier; + struct bridge_mcast_query ip4_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_querier ip6_querier; + struct bridge_mcast_query ip6_query; +#endif /* IS_ENABLED(CONFIG_IPV6) */ #endif struct timer_list hello_timer; @@ -503,11 +521,27 @@ static inline bool br_multicast_is_router(struct net_bridge *br) timer_pending(&br->multicast_router_timer)); } -static inline bool br_multicast_querier_exists(struct net_bridge *br) +static inline bool +__br_multicast_querier_exists(struct net_bridge *br, + struct bridge_mcast_querier *querier) { - return 
time_is_before_jiffies(br->multicast_querier_delay_time) && - (br->multicast_querier || - timer_pending(&br->multicast_querier_timer)); + return time_is_before_jiffies(querier->delay_time) && + (br->multicast_querier || timer_pending(&querier->timer)); +} + +static inline bool br_multicast_querier_exists(struct net_bridge *br, + struct ethhdr *eth) +{ + switch (eth->h_proto) { + case (htons(ETH_P_IP)): + return __br_multicast_querier_exists(br, &br->ip4_querier); +#if IS_ENABLED(CONFIG_IPV6) + case (htons(ETH_P_IPV6)): + return __br_multicast_querier_exists(br, &br->ip6_querier); +#endif + default: + return false; + } } #else static inline int br_multicast_rcv(struct net_bridge *br, @@ -565,7 +599,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br) { return 0; } -static inline bool br_multicast_querier_exists(struct net_bridge *br) +static inline bool br_multicast_querier_exists(struct net_bridge *br, + struct ethhdr *eth) { return false; } From eb8895debe1baba41fcb62c78a16f0c63c21662a Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Tue, 27 Aug 2013 16:41:40 -0700 Subject: [PATCH 37/44] tcp: tcp_make_synack() should use sock_wmalloc In commit 90ba9b19 (tcp: tcp_make_synack() can use alloc_skb()), Eric changed the call to sock_wmalloc in tcp_make_synack to alloc_skb. In doing so, the netfilter owner match lost its ability to block the SYNACK packet on outbound listening sockets. Revert the change, restoring the owner match functionality. This closes netfilter bugzilla #847. Signed-off-by: Phil Oester Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2972993c671..170737a9d56d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2670,7 +2670,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; From 2c8d85182348021fc0a1bed193a4be4161dc8364 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Wed, 28 Aug 2013 09:29:58 +0200 Subject: [PATCH 38/44] tipc: set sk_err correctly when connection fails Should a connect fail, if the publication/server is unavailable or due to some other error, a positive value will be returned and errno is never set. If the application code checks for an explicit zero return from connect (success) or a negative return (failure), it will not catch the error and subsequent send() calls will fail as shown from the strace snippet below. socket(0x1e /* PF_??? */, SOCK_SEQPACKET, 0) = 3 connect(3, {sa_family=0x1e /* AF_??? */, sa_data="\2\1\322\4\0\0\322\4\0\0\0\0\0\0"}, 16) = 111 sendto(3, "test", 4, 0, NULL, 0) = -1 EPIPE (Broken pipe) The reason for this behaviour is that TIPC wrongly inverts error codes set in sk_err. Signed-off-by: Erik Hugne Signed-off-by: David S. 
Miller --- net/tipc/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ce8249c76827..6cc7ddd2fb7c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1257,7 +1257,7 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) /* Accept only ACK or NACK message */ if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - sk->sk_err = -ECONNREFUSED; + sk->sk_err = ECONNREFUSED; retval = TIPC_OK; break; } @@ -1268,7 +1268,7 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf) res = auto_connect(sock, msg); if (res) { sock->state = SS_DISCONNECTING; - sk->sk_err = res; + sk->sk_err = -res; retval = TIPC_OK; break; } From 737e828bdbdaf2f9d7de07f20a0308ac46ce5178 Mon Sep 17 00:00:00 2001 From: Li Hongjun Date: Wed, 28 Aug 2013 11:54:50 +0200 Subject: [PATCH 39/44] ipv4 tunnels: fix an oops when using ipip/sit with IPsec Since commit 3d7b46cd20e3 (ip_tunnel: push generic protocol handling to ip_tunnel module.), an Oops is triggered when an xfrm policy is configured on an IPv4 over IPv4 tunnel. xfrm4_policy_check() calls __xfrm_policy_check2(), which uses skb_dst(skb). But this field is NULL because iptunnel_pull_header() calls skb_dst_drop(skb). Signed-off-by: Li Hongjun Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/ipv4/ipip.c | 5 ++--- net/ipv6/sit.c | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 51fc2a1dcdd3..b3ac3c3f6219 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; const struct iphdr *iph; - if (iptunnel_pull_header(skb, 0, tpi.proto)) - goto drop; - iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fbfc5a83867f..21b25dd8466b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -645,11 +645,7 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph; struct ip_tunnel *tunnel; - if (iptunnel_pull_header(skb, 0, tpi.proto)) - goto drop; - iph = ip_hdr(skb); - tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { @@ -659,6 +655,8 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); } From 25ad6117e73656071b38fd19fa67ae325471c758 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 30 Aug 2013 17:39:33 -0400 Subject: [PATCH 40/44] Revert "ipv6: Don't depend on per socket memory for neighbour discovery messages" This reverts commit 1f324e38870cc09659cf23bc626f1b8869e201f2. It seems to cause regressions, and in particular the output path really depends upon there being a socket attached to skb->sk for checks such as sk_mc_loop(skb->sk) for example. See ip6_output_finish2(). Reported-by: Stephen Warren Reported-by: Fabio Estevam Signed-off-by: David S. 
Miller --- net/ipv6/ndisc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 5cb98df966c2..04d31c2fbef1 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -370,12 +370,16 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; + int err; - skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); + skb = sock_alloc_send_skb(sk, + hlen + sizeof(struct ipv6hdr) + len + tlen, + 1, &err); if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", + __func__, err); return NULL; } From 702821f4ea6f68db18aa1de7d8ed62c6ba586a64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Aug 2013 18:10:43 -0700 Subject: [PATCH 41/44] net: revert 8728c544a9c ("net: dev_pick_tx() fix") commit 8728c544a9cbdc ("net: dev_pick_tx() fix") and commit b6fe83e9525a ("bonding: refine IFF_XMIT_DST_RELEASE capability") are quite incompatible : Queue selection is disabled because skb dst was dropped before entering bonding device. This causes major performance regression, mainly because TCP packets for a given flow can be sent to multiple queues. This is particularly visible when using the new FQ packet scheduler with MQ + FQ setup on the slaves. We can safely revert the first commit now that 416186fbf8c5b ("net: Split core bits of netdev_pick_tx into __netdev_pick_tx") properly caps the queue_index. Reported-by: Xi Wang Diagnosed-by: Xi Wang Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Alexander Duyck Cc: Denys Fedorysychenko Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b84a1b155bc1..d12e3a9a5356 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -346,14 +346,9 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) if (new_index < 0) new_index = skb_tx_hash(dev, skb); - if (queue_index != new_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); - - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - - } + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, queue_index); queue_index = new_index; } From 0d63c27d9e879a0b54eb405636d60ab12040ca46 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Aug 2013 11:47:00 +0300 Subject: [PATCH 42/44] mISDN: return -EINVAL on error in dsp_control_req() If skb->len is too short then we should return an error. Otherwise we read beyond the end of skb->data for several bytes. Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- drivers/isdn/mISDN/dsp_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c index 22b720ec80cb..77025f5cb57d 100644 --- a/drivers/isdn/mISDN/dsp_core.c +++ b/drivers/isdn/mISDN/dsp_core.c @@ -288,8 +288,10 @@ dsp_control_req(struct dsp *dsp, struct mISDNhead *hh, struct sk_buff *skb) u8 *data; int len; - if (skb->len < sizeof(int)) + if (skb->len < sizeof(int)) { printk(KERN_ERR "%s: PH_CONTROL message too short\n", __func__); + return -EINVAL; + } cont = *((int *)skb->data); len = skb->len - sizeof(int); data = skb->data + sizeof(int); From 2d98c29b6fb3de44d9eaa73c09f9cf7209346383 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Aug 2013 23:55:05 +0200 Subject: [PATCH 43/44] net: bridge: convert MLDv2 Query MRC into msecs_to_jiffies for max_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While looking into MLDv1/v2 code, I noticed that bridging code does not convert its max delay into jiffies for MLDv2 messages as we do in core IPv6's multicast code. RFC3810, 5.1.3. Maximum Response Code says: The Maximum Response Code field specifies the maximum time allowed before sending a responding Report. The actual time allowed, called the Maximum Response Delay, is represented in units of milliseconds, and is derived from the Maximum Response Code as follows: [...] As we update timers that work with jiffies, we need to convert it. Signed-off-by: Daniel Borkmann Cc: Linus Lüssing Cc: Hannes Frederic Sowa Signed-off-by: David S. 
Miller --- net/bridge/br_multicast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9d1d0e66c357..bbcb43582496 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1254,7 +1254,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; + + max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mld2q->mld2q_mrc))), 1UL); } br_multicast_query_received(br, port, &br->ip6_querier, From 0affdf347ffc0c3a4595661c091e8cc5f1346e92 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 30 Aug 2013 20:28:10 +0200 Subject: [PATCH 44/44] net: fec: fix time stamping logic after napi conversion Commit dc975382 "net: fec: add napi support to improve proformance" converted the fec driver to the napi model. However, that commit forgot to remove the call to skb_defer_rx_timestamp which is only needed in non-napi drivers. (The function napi_gro_receive eventually calls netif_receive_skb, which in turn calls skb_defer_rx_timestamp.) This patch should also be applied to the 3.9 and 3.10 kernels. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 77ea0db0bbfc..c610a2716be4 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -971,8 +971,7 @@ fec_enet_rx(struct net_device *ndev, int budget) htons(ETH_P_8021Q), vlan_tag); - if (!skb_defer_rx_timestamp(skb)) - napi_gro_receive(&fep->napi, skb); + napi_gro_receive(&fep->napi, skb); } bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, data,