From 1c2bcc766be44467809f1798cd4ceacafe20a852 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 22 Feb 2017 17:25:42 +0100 Subject: [PATCH 01/85] batman-adv: Keep fragments equally sized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The batman-adv fragmentation packets have the design problem that they cannot be refragmented and cannot handle padding by the underlying link. The latter often leads to problems when networks are incorrectly configured and don't use a common MTU. The sender could for example fragment a 1271 byte frame (plus external ethernet header (14) and batadv unicast header (10)) to fit in a 1280 bytes large MTU of the underlying link (max. 1294 byte frames). This would create a 1294 bytes large frame (fragment 2) and a 55 bytes large frame (fragment 1). The extra 54 bytes are the fragment header (20) added to each fragment and the external ethernet header (14) for the second fragment. Let us assume that the next hop is then not able to transport 1294 bytes to its next hop. The 1294 byte large frame will be dropped but the 55 bytes large fragment will still be forwarded to its destination. Or let us assume that the underlying hardware requires that each frame has a minimum size (e.g. 60 bytes). Then it will pad the 55 bytes frame to 60 bytes. The receiver of the 60 bytes frame will no longer be able to correctly assemble the two frames together because it is not aware that 5 bytes of the 60 bytes frame are padding and don't belong to the reassembled frame. This can partly be avoided by splitting frames more equally. In this example, the 675 and 674 bytes large fragment frames could both potentially reach its destination without being too large or too small. Reported-by: Martin Weinelt Fixes: ee75ed88879a ("batman-adv: Fragment and send skbs larger than mtu") Signed-off-by: Sven Eckelmann Acked-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 11149e5be4e0..106bda56ec98 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -404,7 +404,7 @@ out: * batadv_frag_create - create a fragment from skb * @skb: skb to create fragment from * @frag_head: header to use in new fragment - * @mtu: size of new fragment + * @fragment_size: size of new fragment * * Split the passed skb into two fragments: A new one with size matching the * passed mtu and the old one with the rest. The new skb contains data from the @@ -414,11 +414,11 @@ out: */ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, struct batadv_frag_packet *frag_head, - unsigned int mtu) + unsigned int fragment_size) { struct sk_buff *skb_fragment; unsigned int header_size = sizeof(*frag_head); - unsigned int fragment_size = mtu - header_size; + unsigned int mtu = fragment_size + header_size; skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); if (!skb_fragment) @@ -456,7 +456,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, struct sk_buff *skb_fragment; unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; unsigned int header_size = sizeof(frag_header); - unsigned int max_fragment_size, max_packet_size; + unsigned int max_fragment_size, num_fragments; int ret; /* To avoid merge and refragmentation at next-hops we never send @@ -464,10 +464,15 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE); max_fragment_size = mtu - header_size; - max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; + + if (skb->len == 0 || max_fragment_size == 0) + return -EINVAL; + + num_fragments = (skb->len - 1) / max_fragment_size + 1; + max_fragment_size = (skb->len - 1) / num_fragments + 1; /* Don't even try to fragment, if we need more than 16 fragments */ - if (skb->len > max_packet_size) { + if (num_fragments > BATADV_FRAG_MAX_FRAGMENTS) { ret = -EAGAIN; goto free_skb; } @@ -507,7 +512,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto put_primary_if; } - skb_fragment = batadv_frag_create(skb, &frag_header, mtu); + skb_fragment = batadv_frag_create(skb, &frag_header, + max_fragment_size); if (!skb_fragment) { ret = -ENOMEM; goto put_primary_if; From 1a9070ec91b37234fe915849b767c61584c64a44 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 4 Mar 2017 15:48:50 +0100 Subject: [PATCH 02/85] batman-adv: Initialize gw sel_class via batadv_algo The gateway selection class variable is shared between different algorithm versions. But the interpretation of the content is algorithm specific. The initialization is therefore also algorithm specific. But this was implemented incorrectly and the initialization for BATMAN_V always overwrote the value previously written for BATMAN_IV. This could only be avoided when BATMAN_V was disabled during compile time. Using a special batadv_algo hook for this initialization avoids this problem. Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. V - implement GW selection logic") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 11 +++++++++++ net/batman-adv/bat_v.c | 14 +++++++++++--- net/batman-adv/gateway_common.c | 5 +++++ net/batman-adv/soft-interface.c | 1 - net/batman-adv/types.h | 2 ++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f00f666e2ccd..7bfd0d7ef49d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2477,6 +2477,16 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) batadv_iv_ogm_schedule(hard_iface); } +/** + * batadv_iv_init_sel_class - initialize GW selection class + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) +{ + /* set default TQ difference threshold to 20 */ + atomic_set(&bat_priv->gw.sel_class, 20); +} + static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { @@ -2823,6 +2833,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .del_if = batadv_iv_ogm_orig_del_if, }, .gw = { + .init_sel_class = batadv_iv_init_sel_class, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, #ifdef CONFIG_BATMAN_ADV_DEBUGFS diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 2ac612d7bab4..2e2471ca84e3 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -668,6 +668,16 @@ err_ifinfo1: return ret; } +/** + * batadv_v_init_sel_class - initialize GW selection class + * @bat_priv: the bat priv with all the soft interface information + */ +static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) +{ + /* set default throughput difference threshold to 5Mbps */ + atomic_set(&bat_priv->gw.sel_class, 50); +} + static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, char *buff, size_t count) { @@ -1052,6 +1062,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .dump = batadv_v_orig_dump, }, .gw = { + .init_sel_class = batadv_v_init_sel_class, .store_sel_class = batadv_v_store_sel_class, .show_sel_class = batadv_v_show_sel_class, .get_best_gw_node = batadv_v_gw_get_best_gw_node, @@ -1092,9 +1103,6 @@ int batadv_v_mesh_init(struct batadv_priv *bat_priv) if (ret < 0) return ret; - /* set default throughput difference threshold to 5Mbps */ - atomic_set(&bat_priv->gw.sel_class, 50); - return 0; } diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 21184810d89f..3e3f91ab694f 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -253,6 +253,11 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, */ void batadv_gw_init(struct batadv_priv *bat_priv) { + if (bat_priv->algo_ops->gw.init_sel_class) + bat_priv->algo_ops->gw.init_sel_class(bat_priv); + else + atomic_set(&bat_priv->gw.sel_class, 1); + batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, NULL, BATADV_TVLV_GW, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7b3494ae6ad9..2e0b3463ab4a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -820,7 +820,6 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); - atomic_set(&bat_priv->gw.sel_class, 20); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e913aee28c98..5137d859694c 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1489,6 +1489,7 @@ struct batadv_algo_orig_ops { /** * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) + * @init_sel_class: initialize GW selection class (optional) * @store_sel_class: parse and stores a new GW selection class (optional) * @show_sel_class: prints the current GW selection class (optional) * @get_best_gw_node: select the best GW from the list of available nodes @@ -1499,6 +1500,7 @@ struct batadv_algo_orig_ops { * @dump: dump gateways to a netlink socket (optional) */ struct batadv_algo_gw_ops { + void (*init_sel_class)(struct batadv_priv *bat_priv); ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, size_t count); ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); From 7b4fdf77a450ec0fdcb2f677b080ddbf2c186544 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Mar 2017 21:44:00 +0100 Subject: [PATCH 03/85] netfilter: don't track fragmented packets Andrey reports syzkaller splat caused by NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); in ipv4 nat. But this assertion (and the comment) are wrong, this function does see fragments when IP_NODEFRAG setsockopt is used. As conntrack doesn't track packets without complete l4 header, only the first fragment is tracked. Because applying nat to first packet but not the rest makes no sense this also turns off tracking of all fragments. Reported-by: Andrey Konovalov Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 ++++ net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index bc1486f2c064..2e14ed11a35c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -165,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; + + if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ + return NF_ACCEPT; + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f8aad03d674b..6f5e8d01b876 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); - /* We never see fragments: conntrack defrags on pre-routing - * and local-out, and nf_nat_out protects post-routing. - */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to From 8e05ba7f848475bdc3aa546cf88418f7e51a6671 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sat, 4 Mar 2017 18:00:02 +0800 Subject: [PATCH 04/85] netfilter: nf_nat_sctp: fix ICMP packet to be dropped accidently Regarding RFC 792, the first 64 bits of the original SCTP datagram's data could be contained in ICMP packet, such as: 0 1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Type | Code | Checksum | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | unused | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Internet Header + 64 bits of Original Data Datagram | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ However, according to RFC 4960, SCTP datagram header is as below: 0 1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Source Port Number | Destination Port Number | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Verification Tag | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Checksum | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ It means only the first three fields of SCTP header can be carried in ICMP packet except for Checksum field. At present in sctp_manip_pkt(), no matter whether the packet is ICMP or not, it always calculates SCTP packet checksum. However, not only the calculation of checksum is unnecessary for ICMP, but also it causes another fatal issue that ICMP packet is dropped. The header size of SCTP is used to identify whether the writeable length of skb is bigger than skb->len through skb_make_writable() in sctp_manip_pkt(). But when it deals with ICMP packet, skb_make_writable() directly returns false as the writeable length of skb is bigger than skb->len. Subsequently ICMP is dropped. Now we correct this misbahavior. When sctp_manip_pkt() handles ICMP packet, 8 bytes rather than the whole SCTP header size is used to check if writeable length of skb is overflowed. Meanwhile, as it's meaningless to calculate checksum when packet is ICMP, the computation of checksum is ignored as well. Signed-off-by: Ying Xue Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_proto_sctp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index 31d358691af0..804e8a0ab36e 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -33,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb, enum nf_nat_manip_type maniptype) { sctp_sctphdr_t *hdr; + int hdrsize = 8; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + /* This could be an inner header returned in imcp packet; in such + * cases we cannot update the checksum field since it is outside + * of the 8 bytes of transport layer headers we are guaranteed. + */ + if (skb->len >= hdroff + sizeof(*hdr)) + hdrsize = sizeof(*hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -47,6 +55,9 @@ sctp_manip_pkt(struct sk_buff *skb, hdr->dest = tuple->dst.u.sctp.port; } + if (hdrsize < sizeof(*hdr)) + return true; + if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; From 568af6de058cb2b0c5b98d98ffcf37cdc6bc38a7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Mar 2017 19:53:47 +0100 Subject: [PATCH 05/85] netfilter: nf_tables: set pktinfo->thoff at AH header if found Phil Sutter reports that IPv6 AH header matching is broken. From userspace, nft generates bytecode that expects to find the AH header at NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However, pktinfo->thoff is set to the inner header after the AH header in IPv6, while in IPv4 pktinfo->thoff points to the AH header indeed. This behaviour is inconsistent. This patch fixes this problem by updating ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH header. This is also inconsistent when trying to match encapsulated headers: 1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match. 2) A packet that looks like IPv6 + AH + TCP dport 22 will match. Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d150b5066201..97983d1c05e4 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { + unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); return; @@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; @@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, if (pkt_len + sizeof(*ip6h) > skb->len) return -1; - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) return -1; From fd89b23a4632d3cbdee398048497e026edadfb71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 6 Mar 2017 00:02:52 +0800 Subject: [PATCH 06/85] netfilter: nft_set_bitmap: fetch the element key based on the set->klen Currently we just assume the element key as a u32 integer, regardless of the set key length. This is incorrect, for example, the tcp port number is only 16 bits. So when we use the nft_payload expr to get the tcp dport and store it to dreg, the dport will be stored at 0~15 bits, and 16~31 bits will be padded with zero. So the reg->data[dreg] will be looked like as below: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | tcp dport | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ But for these big-endian systems, if we treate this register as a u32 integer, the element key will be larger than 65535, so the following lookup in bitmap set will cause out of bound access. Another issue is that if we add element with comments in bitmap set(although the comments will be ignored eventually), the element will vanish strangely. Because we treate the element key as a u32 integer, so the comments will become the part of the element key, then the element key will also be larger than 65535 and out of bound access will happen: # nft add element t s { 1 comment test } Since set->klen is 1 or 2, it's fine to treate the element key as a u8 or u16 integer. Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 152d226552c1..9b024e22717b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -45,9 +45,17 @@ struct nft_bitmap { u8 bitmap[]; }; -static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off) +static inline void nft_bitmap_location(const struct nft_set *set, + const void *key, + u32 *idx, u32 *off) { - u32 k = (key << 1); + u32 k; + + if (set->klen == 2) + k = *(u16 *)key; + else + k = *(u8 *)key; + k <<= 1; *idx = k / BITS_PER_BYTE; *off = k % BITS_PER_BYTE; @@ -69,7 +77,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_cur(net); u32 idx, off; - nft_bitmap_location(*key, &idx, &off); + nft_bitmap_location(set, key, &idx, &off); return nft_bitmap_active(priv->bitmap, idx, off, genmask); } @@ -83,7 +91,7 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) return -EEXIST; @@ -102,7 +110,7 @@ static void nft_bitmap_remove(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 00 state. */ priv->bitmap[idx] &= ~(genmask << off); } @@ -116,7 +124,7 @@ static void nft_bitmap_activate(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); } @@ -128,7 +136,7 @@ static bool nft_bitmap_flush(const struct net *net, u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); @@ -161,10 +169,9 @@ static void *nft_bitmap_deactivate(const struct net *net, struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); struct nft_set_ext *ext; - u32 idx, off, key = 0; + u32 idx, off; - memcpy(&key, elem->key.val.data, set->klen); - nft_bitmap_location(key, &idx, &off); + nft_bitmap_location(set, elem->key.val.data, &idx, &off); if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) return NULL; From 10596608c4d62cb8c1c2b806debcbd32fe657e71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Mar 2017 22:54:18 +0800 Subject: [PATCH 07/85] netfilter: nf_tables: fix mismatch in big-endian system Currently, there are two different methods to store an u16 integer to the u32 data register. For example: u32 *dest = ®s->data[priv->dreg]; 1. *dest = 0; *(u16 *) dest = val_u16; 2. *dest = val_u16; For method 1, the u16 value will be stored like this, either in big-endian or little-endian system: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | Value | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ For method 2, in little-endian system, the u16 value will be the same as listed above. But in big-endian system, the u16 value will be stored like this: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | 0 | Value | +-+-+-+-+-+-+-+-+-+-+-+-+ So later we use "memcmp(®s->data[priv->sreg], data, 2);" to do compare in nft_cmp, nft_lookup expr ..., method 2 will get the wrong result in big-endian system, as 0~15 bits will always be zero. For the similar reason, when loading an u16 value from the u32 data register, we should use "*(u16 *) sreg;" instead of "(u16)*sreg;", the 2nd method will get the wrong value in the big-endian system. So introduce some wrapper functions to store/load an u8 or u16 integer to/from the u32 data register, and use them in the right place. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 29 +++++++++++++++++++++ net/ipv4/netfilter/nft_masq_ipv4.c | 8 +++--- net/ipv4/netfilter/nft_redir_ipv4.c | 8 +++--- net/ipv6/netfilter/nft_masq_ipv6.c | 8 +++--- net/ipv6/netfilter/nft_redir_ipv6.c | 8 +++--- net/netfilter/nft_ct.c | 18 +++++++------ net/netfilter/nft_meta.c | 40 +++++++++++++++-------------- net/netfilter/nft_nat.c | 8 +++--- 8 files changed, 80 insertions(+), 47 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2aa8a9d80fbe..70c5ca0c60b1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -103,6 +103,35 @@ struct nft_regs { }; }; +/* Store/load an u16 or u8 integer to/from the u32 data register. + * + * Note, when using concatenations, register allocation happens at 32-bit + * level. So for store instruction, pad the rest part with zero to avoid + * garbage values. + */ + +static inline void nft_reg_store16(u32 *dreg, u16 val) +{ + *dreg = 0; + *(u16 *)dreg = val; +} + +static inline void nft_reg_store8(u32 *dreg, u8 val) +{ + *dreg = 0; + *(u8 *)dreg = val; +} + +static inline u16 nft_reg_load16(u32 *sreg) +{ + return *(u16 *)sreg; +} + +static inline u8 nft_reg_load8(u32 *sreg) +{ + return *(u8 *)sreg; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index a0ea8aad1bf1..f18677277119 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,10 +26,10 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), &range, nft_out(pkt)); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 1650ed23c15d..5120be1d3118 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -26,10 +26,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { - mr.range[0].min.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - mr.range[0].max.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + mr.range[0].min.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + mr.range[0].max.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 6c5b5b1830a7..4146536e9c15 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -27,10 +27,10 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, nft_out(pkt)); diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index f5ac080fc084..a27e424f690d 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -26,10 +26,10 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min], - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max], + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bf548a7a71ec..91585b5e5307 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -83,7 +83,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_DIRECTION: - *dest = CTINFO2DIR(ctinfo); + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; @@ -151,20 +151,22 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } case NFT_CT_L3PROTOCOL: - *dest = nf_ct_l3num(ct); + nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: - *dest = nf_ct_protonum(ct); + nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) - *dest = nf_ct_zone_id(zone, priv->dir); + zoneid = nf_ct_zone_id(zone, priv->dir); else - *dest = zone->id; + zoneid = zone->id; + nft_reg_store16(dest, zoneid); return; } #endif @@ -183,10 +185,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: - *dest = (__force __u16)tuple->src.u.all; + nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: - *dest = (__force __u16)tuple->dst.u.all; + nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; default: break; @@ -205,7 +207,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; - u16 value = regs->data[priv->sreg]; + u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e1f5ca9b423b..7b60e01f38ff 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -45,16 +45,15 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->len; break; case NFT_META_PROTOCOL: - *dest = 0; - *(__be16 *)dest = skb->protocol; + nft_reg_store16(dest, (__force u16)skb->protocol); break; case NFT_META_NFPROTO: - *dest = nft_pf(pkt); + nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: if (!pkt->tprot_set) goto err; - *dest = pkt->tprot; + nft_reg_store8(dest, pkt->tprot); break; case NFT_META_PRIORITY: *dest = skb->priority; @@ -85,14 +84,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_IIFTYPE: if (in == NULL) goto err; - *dest = 0; - *(u16 *)dest = in->type; + nft_reg_store16(dest, in->type); break; case NFT_META_OIFTYPE: if (out == NULL) goto err; - *dest = 0; - *(u16 *)dest = out->type; + nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); @@ -142,19 +139,19 @@ void nft_meta_get_eval(const struct nft_expr *expr, #endif case NFT_META_PKTTYPE: if (skb->pkt_type != PACKET_LOOPBACK) { - *dest = skb->pkt_type; + nft_reg_store8(dest, skb->pkt_type); break; } switch (nft_pf(pkt)) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest, PACKET_BROADCAST); break; case NFPROTO_IPV6: - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; case NFPROTO_NETDEV: switch (skb->protocol) { @@ -168,14 +165,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; if (ipv4_is_multicast(iph->daddr)) - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); else - *dest = PACKET_BROADCAST; + nft_reg_store8(dest, PACKET_BROADCAST); break; } case htons(ETH_P_IPV6): - *dest = PACKET_MULTICAST; + nft_reg_store8(dest, PACKET_MULTICAST); break; default: WARN_ON_ONCE(1); @@ -230,7 +227,9 @@ void nft_meta_set_eval(const struct nft_expr *expr, { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = regs->data[meta->sreg]; + u32 *sreg = ®s->data[meta->sreg]; + u32 value = *sreg; + u8 pkt_type; switch (meta->key) { case NFT_META_MARK: @@ -240,9 +239,12 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->priority = value; break; case NFT_META_PKTTYPE: - if (skb->pkt_type != value && - skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) - skb->pkt_type = value; + pkt_type = nft_reg_load8(sreg); + + if (skb->pkt_type != pkt_type && + skb_pkt_type_ok(pkt_type) && + skb_pkt_type_ok(skb->pkt_type)) + skb->pkt_type = pkt_type; break; case NFT_META_NFTRACE: skb->nf_trace = !!value; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 19a7bf3236f9..439e0bd152a0 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -65,10 +65,10 @@ static void nft_nat_eval(const struct nft_expr *expr, } if (priv->sreg_proto_min) { - range.min_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_min]; - range.max_proto.all = - *(__be16 *)®s->data[priv->sreg_proto_max]; + range.min_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_min]); + range.max_proto.all = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg_proto_max]); range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } From 4ca60d08cbe65f501baad64af50fceba79c19fbb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Mar 2017 23:22:30 +0100 Subject: [PATCH 08/85] netfilter: bridge: honor frag_max_size when refragmenting consider a bridge with mtu 9000, but end host sending smaller packets to another host with mtu < 9000. In this case, after reassembly, bridge+defrag would refragment, and then attempt to send the reassembled packet as long as it was below 9k. Instead we have to cap by the largest fragment size seen. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 95087e6e8258..3c5185021c1c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -721,18 +721,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; - unsigned int mtu_reserved; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + unsigned int mtu, mtu_reserved; mtu_reserved = nf_bridge_mtu_reduction(skb); + mtu = skb->dev->mtu; - if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { + if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) + mtu = nf_bridge->frag_max_size; + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } - nf_bridge = nf_bridge_info_get(skb); - /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ From 170a1fb9c01bc40b7e8fd57a32ac9a0e131ec5b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 11 Mar 2017 00:25:26 -0500 Subject: [PATCH 09/85] netfilter: Force fake conntrack entry to be at least 8 bytes aligned Since the nfct and nfctinfo have been combined, the nf_conn structure must be at least 8 bytes aligned, as the 3 LSB bits are used for the nfctinfo. But there's a fake nf_conn structure to denote untracked connections, which is created by a PER_CPU construct. This does not guarantee that it will be 8 bytes aligned and can break the logic in determining the correct nfctinfo. I triggered this on a 32bit machine with the following error: BUG: unable to handle kernel NULL pointer dereference at 00000af4 IP: nf_ct_deliver_cached_events+0x1b/0xfb *pdpt = 0000000031962001 *pde = 0000000000000000 Oops: 0000 [#1] SMP [Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ipv6 crc_ccitt ppdev r8169 parport_pc parport OK ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-test+ #75 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 task: c126ec00 task.stack: c1258000 EIP: nf_ct_deliver_cached_events+0x1b/0xfb EFLAGS: 00010202 CPU: 0 EAX: 0021cd01 EBX: 00000000 ECX: 27b0c767 EDX: 32bcb17a ESI: f34135c0 EDI: f34135c0 EBP: f2debd60 ESP: f2debd3c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 00000af4 CR3: 309a0440 CR4: 001406f0 Call Trace: ? ipv6_skip_exthdr+0xac/0xcb ipv6_confirm+0x10c/0x119 [nf_conntrack_ipv6] nf_hook_slow+0x22/0xc7 nf_hook+0x9a/0xad [ipv6] ? ip6t_do_table+0x356/0x379 [ip6_tables] ? ip6_fragment+0x9e9/0x9e9 [ipv6] ip6_output+0xee/0x107 [ipv6] ? ip6_fragment+0x9e9/0x9e9 [ipv6] dst_output+0x36/0x4d [ipv6] NF_HOOK.constprop.37+0xb2/0xba [ipv6] ? icmp6_dst_alloc+0x2c/0xfd [ipv6] ? local_bh_enable+0x14/0x14 [ipv6] mld_sendpack+0x1c5/0x281 [ipv6] ? mark_held_locks+0x40/0x5c mld_ifc_timer_expire+0x1f6/0x21e [ipv6] call_timer_fn+0x135/0x283 ? detach_if_pending+0x55/0x55 ? mld_dad_timer_expire+0x3e/0x3e [ipv6] __run_timers+0x111/0x14b ? mld_dad_timer_expire+0x3e/0x3e [ipv6] run_timer_softirq+0x1c/0x36 __do_softirq+0x185/0x37c ? test_ti_thread_flag.constprop.19+0xd/0xd do_softirq_own_stack+0x22/0x28 irq_exit+0x5a/0xa4 smp_apic_timer_interrupt+0x2a/0x34 apic_timer_interrupt+0x37/0x3c By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte alignment as all cache line sizes are at least 8 bytes or more. Fixes: a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area") Signed-off-by: Steven Rostedt (VMware) Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f540f9ad2af4..19605878da47 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -244,7 +244,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { return raw_cpu_ptr(&nf_conntrack_untracked); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..ffb78e5f7b70 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; seqcount_t nf_conntrack_generation __read_mostly; -DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used + * for the nfctinfo. We cheat by (ab)using the PER CPU cache line + * alignment to enforce this. + */ +DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static unsigned int nf_conntrack_hash_rnd __read_mostly; From e920dde5160887d07b738f5a7f593b1fa9b1e32e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Mar 2017 18:32:31 +0100 Subject: [PATCH 10/85] netfilter: nft_set_bitmap: keep a list of dummy elements Element comments may come without any prior set flag, so we have to keep a list of dummy struct nft_set_ext to keep this information around. This is only useful for set dumps to userspace. From the packet path, this set type relies on the bitmap representation. This patch simplifies the logic since we don't need to allocate the dummy nft_set_ext structure anymore on the fly at the cost of increasing memory consumption because of the list of dummy struct nft_set_ext. Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 138 +++++++++++++++------------------ 1 file changed, 62 insertions(+), 76 deletions(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 9b024e22717b..8ebbc2940f4c 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -15,6 +15,11 @@ #include #include +struct nft_bitmap_elem { + struct list_head head; + struct nft_set_ext ext; +}; + /* This bitmap uses two bits to represent one element. These two bits determine * the element state in the current and the future generation. * @@ -41,8 +46,9 @@ * restore its previous state. */ struct nft_bitmap { - u16 bitmap_size; - u8 bitmap[]; + struct list_head list; + u16 bitmap_size; + u8 bitmap[]; }; static inline void nft_bitmap_location(const struct nft_set *set, @@ -82,21 +88,43 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, return nft_bitmap_active(priv->bitmap, idx, off, genmask); } +static struct nft_bitmap_elem * +nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, + u8 genmask) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be; + + list_for_each_entry_rcu(be, &priv->list, head) { + if (memcmp(nft_set_ext_key(&be->ext), + nft_set_ext_key(&this->ext), set->klen) || + !nft_set_elem_active(&be->ext, genmask)) + continue; + + return be; + } + return NULL; +} + static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **_ext) + struct nft_set_ext **ext) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); - if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + be = nft_bitmap_elem_find(set, new, genmask); + if (be) { + *ext = &be->ext; return -EEXIST; + } + nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off); /* Enter 01 state. */ priv->bitmap[idx] |= (genmask << off); + list_add_tail_rcu(&new->head, &priv->list); return 0; } @@ -106,13 +134,14 @@ static void nft_bitmap_remove(const struct net *net, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 00 state. */ priv->bitmap[idx] &= ~(genmask << off); + list_del_rcu(&be->head); } static void nft_bitmap_activate(const struct net *net, @@ -120,73 +149,52 @@ static void nft_bitmap_activate(const struct net *net, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext *ext = elem->priv; + struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); + nft_set_elem_change_active(net, set, &be->ext); } static bool nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *ext) + const struct nft_set *set, void *_be) { struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); + struct nft_bitmap_elem *be = _be; u32 idx, off; - nft_bitmap_location(set, nft_set_ext_key(ext), &idx, &off); + nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); + nft_set_elem_change_active(net, set, &be->ext); return true; } -static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set, - const struct nft_set_elem *elem) -{ - struct nft_set_ext_tmpl tmpl; - struct nft_set_ext *ext; - - nft_set_ext_prepare(&tmpl); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); - - ext = kzalloc(tmpl.len, GFP_KERNEL); - if (!ext) - return NULL; - - nft_set_ext_init(ext, &tmpl); - memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen); - - return ext; -} - static void *nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); - struct nft_set_ext *ext; u32 idx, off; nft_bitmap_location(set, elem->key.val.data, &idx, &off); - if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) - return NULL; - - /* We have no real set extension since this is a bitmap, allocate this - * dummy object that is released from the commit/abort path. - */ - ext = nft_bitmap_ext_alloc(set, elem); - if (!ext) + be = nft_bitmap_elem_find(set, this, genmask); + if (!be) return NULL; /* Enter 10 state. */ priv->bitmap[idx] &= ~(genmask << off); + nft_set_elem_change_active(net, set, &be->ext); - return ext; + return be; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -194,47 +202,23 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, struct nft_set_iter *iter) { const struct nft_bitmap *priv = nft_set_priv(set); - struct nft_set_ext_tmpl tmpl; + struct nft_bitmap_elem *be; struct nft_set_elem elem; - struct nft_set_ext *ext; - int idx, off; - u16 key; - nft_set_ext_prepare(&tmpl); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + list_for_each_entry_rcu(be, &priv->list, head) { + if (iter->count < iter->skip) + goto cont; + if (!nft_set_elem_active(&be->ext, iter->genmask)) + goto cont; - for (idx = 0; idx < priv->bitmap_size; idx++) { - for (off = 0; off < BITS_PER_BYTE; off += 2) { - if (iter->count < iter->skip) - goto cont; + elem.priv = be; - if (!nft_bitmap_active(priv->bitmap, idx, off, - iter->genmask)) - goto cont; + iter->err = iter->fn(ctx, set, iter, &elem); - ext = kzalloc(tmpl.len, GFP_KERNEL); - if (!ext) { - iter->err = -ENOMEM; - return; - } - nft_set_ext_init(ext, &tmpl); - key = ((idx * BITS_PER_BYTE) + off) >> 1; - memcpy(nft_set_ext_key(ext), &key, set->klen); - - elem.priv = ext; - iter->err = iter->fn(ctx, set, iter, &elem); - - /* On set flush, this dummy extension object is released - * from the commit/abort path. - */ - if (!iter->flush) - kfree(ext); - - if (iter->err < 0) - return; + if (iter->err < 0) + return; cont: - iter->count++; - } + iter->count++; } } @@ -265,6 +249,7 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); return 0; @@ -290,6 +275,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, static struct nft_set_ops nft_bitmap_ops __read_mostly = { .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), .estimate = nft_bitmap_estimate, .init = nft_bitmap_init, .destroy = nft_bitmap_destroy, From 04166f48d9593af4513ae06c0f966c0cee300a20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Mar 2017 13:24:03 +0100 Subject: [PATCH 11/85] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7. This patch is not required anymore now that we keep a dummy list of set elements in the bitmap set implementation, so revert this before we forget this code has no clients. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 70c5ca0c60b1..0136028652bd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -232,7 +232,6 @@ struct nft_set_elem { struct nft_set; struct nft_set_iter { u8 genmask; - bool flush; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e0ccfd5bb37..434c739dfeca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3145,7 +3145,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) @@ -3399,7 +3398,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - args.iter.flush = false; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3963,7 +3961,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_set_iter iter = { .genmask = genmask, .fn = nft_flush_set, - .flush = true, }; set->ops->walk(&ctx, set, &iter); @@ -5114,7 +5111,6 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.count = 0; iter.err = 0; iter.fn = nf_tables_loop_check_setelem; - iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) From 4494dbc6dec37817f2cc2aa7604039a9e87ada18 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 15 Mar 2017 22:22:08 +0800 Subject: [PATCH 12/85] netfilter: nft_ct: do cleanup work when NFTA_CT_DIRECTION is invalid We should jump to invoke __nft_ct_set_destroy() instead of just return error. Fixes: edee4f1e9245 ("netfilter: nft_ct: add zone id set support") Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 91585b5e5307..0264258c46fe 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -544,7 +544,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err1; } } From fe8daf5fa715f7214952f06a387e4b7de818c5be Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Wed, 15 Mar 2017 13:47:50 +0900 Subject: [PATCH 13/85] fjes: Fix wrong netdevice feature flags This patch fixes netdev->features for Extended Socket network device. Currently Extended Socket network device's netdev->feature claims NETIF_F_HW_CSUM, however this is completely wrong. There's no feature of checksum offloading. That causes invalid TCP/UDP checksum and packet rejection when IP forwarding from Extended Socket network device to other network device. NETIF_F_HW_CSUM should be omitted. Signed-off-by: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index b75d9cdcfb0c..c4b3c4b77a9c 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -1316,7 +1316,7 @@ static void fjes_netdev_setup(struct net_device *netdev) netdev->min_mtu = fjes_support_mtu[0]; netdev->max_mtu = fjes_support_mtu[3]; netdev->flags |= IFF_BROADCAST; - netdev->features |= NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } static void fjes_irq_watch_task(struct work_struct *work) From 3d20f1f7bd575d147ffa75621fa560eea0aec690 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 15 Mar 2017 18:10:47 +0200 Subject: [PATCH 14/85] net/openvswitch: Set the ipv6 source tunnel key address attribute correctly When dealing with ipv6 source tunnel key address attribute (OVS_TUNNEL_KEY_ATTR_IPV6_SRC) we are wrongly setting the tunnel dst ip, fix that. Fixes: 6b26ba3a7d95 ('openvswitch: netlink attributes for IPv6 tunneling') Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Acked-by: Jiri Benc Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 6f5fa50f716d..a08ff834676b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -604,7 +604,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, nla_get_in6_addr(a), is_mask); ipv6 = true; break; From 88d339e2d3be955848a034970970931a7ed33956 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 15 Mar 2017 18:39:46 +0100 Subject: [PATCH 15/85] MAINTAINERS: remove MACVLAN and VLAN entries macvlan.c file seems to be both in VLAN and MACVLAN DRIVER, so remove the MACVLAN DRIVER since this is redundant. I propose with this patch to remove the VLAN (802.1Q) entry so this just falls into the NETWORKING [GENERAL]. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- MAINTAINERS | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c776906f67a9..33875e07482b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7774,13 +7774,6 @@ F: include/net/mac80211.h F: net/mac80211/ F: drivers/net/wireless/mac80211_hwsim.[ch] -MACVLAN DRIVER -M: Patrick McHardy -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/macvlan.c -F: include/linux/if_macvlan.h - MAILBOX API M: Jassi Brar L: linux-kernel@vger.kernel.org @@ -13383,14 +13376,6 @@ W: https://linuxtv.org S: Maintained F: drivers/media/platform/vivid/* -VLAN (802.1Q) -M: Patrick McHardy -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/macvlan.c -F: include/linux/if_*vlan.h -F: net/8021q/ - VLYNQ BUS M: Florian Fainelli L: openwrt-devel@lists.openwrt.org (subscribers-only) From 5371bbf4b295eea334ed453efa286afa2c3ccff3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 15 Mar 2017 12:57:21 -0700 Subject: [PATCH 16/85] net: bcmgenet: Do not suspend PHY if Wake-on-LAN is enabled Suspending the PHY would be putting it in a low power state where it may no longer allow us to do Wake-on-LAN. Fixes: cc013fb48898 ("net: bcmgenet: correctly suspend and resume PHY device") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 69015fa50f20..365895ed3c3e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3481,7 +3481,8 @@ static int bcmgenet_suspend(struct device *d) bcmgenet_netif_stop(dev); - phy_suspend(priv->phydev); + if (!device_may_wakeup(d)) + phy_suspend(priv->phydev); netif_device_detach(dev); @@ -3578,7 +3579,8 @@ static int bcmgenet_resume(struct device *d) netif_device_attach(dev); - phy_resume(priv->phydev); + if (!device_may_wakeup(d)) + phy_resume(priv->phydev); if (priv->eee.eee_enabled) bcmgenet_eee_enable_set(dev, true); From 622c36f143fc9566ba49d7cec994c2da1182d9e2 Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Wed, 15 Mar 2017 15:11:23 -0500 Subject: [PATCH 17/85] amd-xgbe: Fix jumbo MTU processing on newer hardware Newer hardware does not provide a cumulative payload length when multiple descriptors are needed to handle the data. Once the MTU increases beyond the size that can be handled by a single descriptor, the SKB does not get built properly by the driver. The driver will now calculate the size of the data buffers used by the hardware. The first buffer of the first descriptor is for packet headers or packet headers and data when the headers can't be split. Subsequent descriptors in a multi-descriptor chain will not use the first buffer. The second buffer is used by all the descriptors in the chain for payload data. Based on whether the driver is processing the first, intermediate, or last descriptor it can calculate the buffer usage and build the SKB properly. Tested and verified on both old and new hardware. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 6 +- drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 20 ++-- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 102 ++++++++++++-------- 3 files changed, 78 insertions(+), 50 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 8a280e7d66bd..86f1626816ff 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1148,8 +1148,8 @@ #define RX_PACKET_ATTRIBUTES_CSUM_DONE_WIDTH 1 #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_INDEX 1 #define RX_PACKET_ATTRIBUTES_VLAN_CTAG_WIDTH 1 -#define RX_PACKET_ATTRIBUTES_INCOMPLETE_INDEX 2 -#define RX_PACKET_ATTRIBUTES_INCOMPLETE_WIDTH 1 +#define RX_PACKET_ATTRIBUTES_LAST_INDEX 2 +#define RX_PACKET_ATTRIBUTES_LAST_WIDTH 1 #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_INDEX 3 #define RX_PACKET_ATTRIBUTES_CONTEXT_NEXT_WIDTH 1 #define RX_PACKET_ATTRIBUTES_CONTEXT_INDEX 4 @@ -1158,6 +1158,8 @@ #define RX_PACKET_ATTRIBUTES_RX_TSTAMP_WIDTH 1 #define RX_PACKET_ATTRIBUTES_RSS_HASH_INDEX 6 #define RX_PACKET_ATTRIBUTES_RSS_HASH_WIDTH 1 +#define RX_PACKET_ATTRIBUTES_FIRST_INDEX 7 +#define RX_PACKET_ATTRIBUTES_FIRST_WIDTH 1 #define RX_NORMAL_DESC0_OVT_INDEX 0 #define RX_NORMAL_DESC0_OVT_WIDTH 16 diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index 937f37a5dcb2..24a687ce4388 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -1896,10 +1896,15 @@ static int xgbe_dev_read(struct xgbe_channel *channel) /* Get the header length */ if (XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, FD)) { + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + FIRST, 1); rdata->rx.hdr_len = XGMAC_GET_BITS_LE(rdesc->desc2, RX_NORMAL_DESC2, HL); if (rdata->rx.hdr_len) pdata->ext_stats.rx_split_header_packets++; + } else { + XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + FIRST, 0); } /* Get the RSS hash */ @@ -1922,19 +1927,16 @@ static int xgbe_dev_read(struct xgbe_channel *channel) } } - /* Get the packet length */ - rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); - - if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) { - /* Not all the data has been transferred for this packet */ - XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, - INCOMPLETE, 1); + /* Not all the data has been transferred for this packet */ + if (!XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, LD)) return 0; - } /* This is the last of the data for this packet */ XGMAC_SET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, - INCOMPLETE, 0); + LAST, 1); + + /* Get the packet length */ + rdata->rx.len = XGMAC_GET_BITS_LE(rdesc->desc3, RX_NORMAL_DESC3, PL); /* Set checksum done indicator as appropriate */ if (netdev->features & NETIF_F_RXCSUM) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index ffea9859f5a7..a713abd9d03e 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1971,13 +1971,12 @@ static struct sk_buff *xgbe_create_skb(struct xgbe_prv_data *pdata, { struct sk_buff *skb; u8 *packet; - unsigned int copy_len; skb = napi_alloc_skb(napi, rdata->rx.hdr.dma_len); if (!skb) return NULL; - /* Start with the header buffer which may contain just the header + /* Pull in the header buffer which may contain just the header * or the header plus data */ dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.hdr.dma_base, @@ -1986,30 +1985,49 @@ static struct sk_buff *xgbe_create_skb(struct xgbe_prv_data *pdata, packet = page_address(rdata->rx.hdr.pa.pages) + rdata->rx.hdr.pa.pages_offset; - copy_len = (rdata->rx.hdr_len) ? rdata->rx.hdr_len : len; - copy_len = min(rdata->rx.hdr.dma_len, copy_len); - skb_copy_to_linear_data(skb, packet, copy_len); - skb_put(skb, copy_len); - - len -= copy_len; - if (len) { - /* Add the remaining data as a frag */ - dma_sync_single_range_for_cpu(pdata->dev, - rdata->rx.buf.dma_base, - rdata->rx.buf.dma_off, - rdata->rx.buf.dma_len, - DMA_FROM_DEVICE); - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, - rdata->rx.buf.pa.pages, - rdata->rx.buf.pa.pages_offset, - len, rdata->rx.buf.dma_len); - rdata->rx.buf.pa.pages = NULL; - } + skb_copy_to_linear_data(skb, packet, len); + skb_put(skb, len); return skb; } +static unsigned int xgbe_rx_buf1_len(struct xgbe_ring_data *rdata, + struct xgbe_packet_data *packet) +{ + /* Always zero if not the first descriptor */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, FIRST)) + return 0; + + /* First descriptor with split header, return header length */ + if (rdata->rx.hdr_len) + return rdata->rx.hdr_len; + + /* First descriptor but not the last descriptor and no split header, + * so the full buffer was used + */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) + return rdata->rx.hdr.dma_len; + + /* First descriptor and last descriptor and no split header, so + * calculate how much of the buffer was used + */ + return min_t(unsigned int, rdata->rx.hdr.dma_len, rdata->rx.len); +} + +static unsigned int xgbe_rx_buf2_len(struct xgbe_ring_data *rdata, + struct xgbe_packet_data *packet, + unsigned int len) +{ + /* Always the full buffer if not the last descriptor */ + if (!XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, LAST)) + return rdata->rx.buf.dma_len; + + /* Last descriptor so calculate how much of the buffer was used + * for the last bit of data + */ + return rdata->rx.len - len; +} + static int xgbe_tx_poll(struct xgbe_channel *channel) { struct xgbe_prv_data *pdata = channel->pdata; @@ -2092,8 +2110,8 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) struct napi_struct *napi; struct sk_buff *skb; struct skb_shared_hwtstamps *hwtstamps; - unsigned int incomplete, error, context_next, context; - unsigned int len, rdesc_len, max_len; + unsigned int last, error, context_next, context; + unsigned int len, buf1_len, buf2_len, max_len; unsigned int received = 0; int packet_count = 0; @@ -2103,7 +2121,7 @@ static int xgbe_rx_poll(struct xgbe_channel *channel, int budget) if (!ring) return 0; - incomplete = 0; + last = 0; context_next = 0; napi = (pdata->per_channel_irq) ? &channel->napi : &pdata->napi; @@ -2137,9 +2155,8 @@ read_again: received++; ring->cur++; - incomplete = XGMAC_GET_BITS(packet->attributes, - RX_PACKET_ATTRIBUTES, - INCOMPLETE); + last = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, + LAST); context_next = XGMAC_GET_BITS(packet->attributes, RX_PACKET_ATTRIBUTES, CONTEXT_NEXT); @@ -2148,7 +2165,7 @@ read_again: CONTEXT); /* Earlier error, just drain the remaining data */ - if ((incomplete || context_next) && error) + if ((!last || context_next) && error) goto read_again; if (error || packet->errors) { @@ -2160,16 +2177,22 @@ read_again: } if (!context) { - /* Length is cumulative, get this descriptor's length */ - rdesc_len = rdata->rx.len - len; - len += rdesc_len; + /* Get the data length in the descriptor buffers */ + buf1_len = xgbe_rx_buf1_len(rdata, packet); + len += buf1_len; + buf2_len = xgbe_rx_buf2_len(rdata, packet, len); + len += buf2_len; - if (rdesc_len && !skb) { + if (!skb) { skb = xgbe_create_skb(pdata, napi, rdata, - rdesc_len); - if (!skb) + buf1_len); + if (!skb) { error = 1; - } else if (rdesc_len) { + goto skip_data; + } + } + + if (buf2_len) { dma_sync_single_range_for_cpu(pdata->dev, rdata->rx.buf.dma_base, rdata->rx.buf.dma_off, @@ -2179,13 +2202,14 @@ read_again: skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rdata->rx.buf.pa.pages, rdata->rx.buf.pa.pages_offset, - rdesc_len, + buf2_len, rdata->rx.buf.dma_len); rdata->rx.buf.pa.pages = NULL; } } - if (incomplete || context_next) +skip_data: + if (!last || context_next) goto read_again; if (!skb) @@ -2243,7 +2267,7 @@ next_packet: } /* Check if we need to save state before leaving */ - if (received && (incomplete || context_next)) { + if (received && (!last || context_next)) { rdata = XGBE_GET_DESC_DATA(ring, ring->cur); rdata->state_saved = 1; rdata->state.skb = skb; From 22a0e18eac7a9e986fec76c60fa4a2926d1291e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Mar 2017 13:21:28 -0700 Subject: [PATCH 18/85] net: properly release sk_frag.page I mistakenly added the code to release sk->sk_frag in sk_common_release() instead of sk_destruct() TCP sockets using sk->sk_allocation == GFP_ATOMIC do no call sk_common_release() at close time, thus leaking one (order-3) page. iSCSI is using such sockets. Fixes: 5640f7685831 ("net: use a per task frag allocator") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index a96d5f7a5734..acb0d4137499 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1442,6 +1442,11 @@ static void __sk_destruct(struct rcu_head *head) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); @@ -2787,11 +2792,6 @@ void sk_common_release(struct sock *sk) sk_refcnt_debug_release(sk); - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page = NULL; - } - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); From 9a3fcf912ef7f5c6e18f9af6875dd13f7311f7aa Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 14 Mar 2017 09:50:35 +0200 Subject: [PATCH 19/85] iwlwifi: mvm: cleanup pending frames in DQA mode When a station is asleep, the fw will set it as "asleep". All queues that are used only by one station will be stopped by the fw. In pre-DQA mode this was relevant for aggregation queues. However, in DQA mode a queue is owned by one station only, so all queues will be stopped. As a result, we don't expect to get filtered frames back to mac80211 and don't have to maintain the entire pending_frames state logic, the same way as we do in aggregations. The correct behavior is to align DQA behavior with the aggregation queue behaviour pre-DQA: - Don't count pending frames. - Let mac80211 know we have frames in these queues so that it can properly handle trigger frames. When a trigger frame is received, mac80211 tells the driver to send frames from the queues using release_buffered_frames. The driver will tell the fw to let frames out even if the station is asleep. This is done by iwl_mvm_sta_modify_sleep_tx_count. Reported-and-tested-by: Jens Axboe Reported-by: Linus Torvalds Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 5 ++- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 11 ++--- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 41 ++++++++----------- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index d37b1695c64e..6927caecd48e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2319,7 +2319,7 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); - /* Called when we need to transmit (a) frame(s) from agg queue */ + /* Called when we need to transmit (a) frame(s) from agg or dqa queue */ iwl_mvm_sta_modify_sleep_tx_count(mvm, sta, reason, num_frames, tids, more_data, true); @@ -2338,7 +2338,8 @@ static void __iwl_mvm_mac_sta_notify(struct ieee80211_hw *hw, for (tid = 0; tid < IWL_MAX_TID_COUNT; tid++) { struct iwl_mvm_tid_data *tid_data = &mvmsta->tid_data[tid]; - if (tid_data->state != IWL_AGG_ON && + if (!iwl_mvm_is_dqa_supported(mvm) && + tid_data->state != IWL_AGG_ON && tid_data->state != IWL_EMPTYING_HW_QUEUE_DELBA) continue; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index bd1dcc863d8f..b51a2853cc80 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -3135,7 +3135,7 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, struct ieee80211_sta *sta, enum ieee80211_frame_release_type reason, u16 cnt, u16 tids, bool more_data, - bool agg) + bool single_sta_queue) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); struct iwl_mvm_add_sta_cmd cmd = { @@ -3155,14 +3155,14 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, for_each_set_bit(tid, &_tids, IWL_MAX_TID_COUNT) cmd.awake_acs |= BIT(tid_to_ucode_ac[tid]); - /* If we're releasing frames from aggregation queues then check if the - * all queues combined that we're releasing frames from have + /* If we're releasing frames from aggregation or dqa queues then check + * if all the queues that we're releasing frames from, combined, have: * - more frames than the service period, in which case more_data * needs to be set * - fewer than 'cnt' frames, in which case we need to adjust the * firmware command (but do that unconditionally) */ - if (agg) { + if (single_sta_queue) { int remaining = cnt; int sleep_tx_count; @@ -3172,7 +3172,8 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, u16 n_queued; tid_data = &mvmsta->tid_data[tid]; - if (WARN(tid_data->state != IWL_AGG_ON && + if (WARN(!iwl_mvm_is_dqa_supported(mvm) && + tid_data->state != IWL_AGG_ON && tid_data->state != IWL_EMPTYING_HW_QUEUE_DELBA, "TID %d state is %d\n", tid, tid_data->state)) { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index 4be34f902278..1927ce607798 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -547,7 +547,7 @@ void iwl_mvm_sta_modify_sleep_tx_count(struct iwl_mvm *mvm, struct ieee80211_sta *sta, enum ieee80211_frame_release_type reason, u16 cnt, u16 tids, bool more_data, - bool agg); + bool single_sta_queue); int iwl_mvm_drain_sta(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta, bool drain); void iwl_mvm_sta_modify_disable_tx(struct iwl_mvm *mvm, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index dd2b4a300819..3f37075f4cde 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -7,7 +7,7 @@ * * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH - * Copyright(c) 2016 Intel Deutschland GmbH + * Copyright(c) 2016 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -34,6 +34,7 @@ * * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH + * Copyright(c) 2016 - 2017 Intel Deutschland GmbH * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -628,8 +629,10 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) * values. * Note that we don't need to make sure it isn't agg'd, since we're * TXing non-sta + * For DQA mode - we shouldn't increase it though */ - atomic_inc(&mvm->pending_frames[sta_id]); + if (!iwl_mvm_is_dqa_supported(mvm)) + atomic_inc(&mvm->pending_frames[sta_id]); return 0; } @@ -1005,11 +1008,8 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, spin_unlock(&mvmsta->lock); - /* Increase pending frames count if this isn't AMPDU */ - if ((iwl_mvm_is_dqa_supported(mvm) && - mvmsta->tid_data[tx_cmd->tid_tspec].state != IWL_AGG_ON && - mvmsta->tid_data[tx_cmd->tid_tspec].state != IWL_AGG_STARTING) || - (!iwl_mvm_is_dqa_supported(mvm) && !is_ampdu)) + /* Increase pending frames count if this isn't AMPDU or DQA queue */ + if (!iwl_mvm_is_dqa_supported(mvm) && !is_ampdu) atomic_inc(&mvm->pending_frames[mvmsta->sta_id]); return 0; @@ -1079,12 +1079,13 @@ static void iwl_mvm_check_ratid_empty(struct iwl_mvm *mvm, lockdep_assert_held(&mvmsta->lock); if ((tid_data->state == IWL_AGG_ON || - tid_data->state == IWL_EMPTYING_HW_QUEUE_DELBA) && + tid_data->state == IWL_EMPTYING_HW_QUEUE_DELBA || + iwl_mvm_is_dqa_supported(mvm)) && iwl_mvm_tid_queued(tid_data) == 0) { /* - * Now that this aggregation queue is empty tell mac80211 so it - * knows we no longer have frames buffered for the station on - * this TID (for the TIM bitmap calculation.) + * Now that this aggregation or DQA queue is empty tell + * mac80211 so it knows we no longer have frames buffered for + * the station on this TID (for the TIM bitmap calculation.) */ ieee80211_sta_set_buffered(sta, tid, false); } @@ -1257,7 +1258,6 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, u8 skb_freed = 0; u16 next_reclaimed, seq_ctl; bool is_ndp = false; - bool txq_agg = false; /* Is this TXQ aggregated */ __skb_queue_head_init(&skbs); @@ -1283,6 +1283,10 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, info->flags |= IEEE80211_TX_STAT_ACK; break; case TX_STATUS_FAIL_DEST_PS: + /* In DQA, the FW should have stopped the queue and not + * return this status + */ + WARN_ON(iwl_mvm_is_dqa_supported(mvm)); info->flags |= IEEE80211_TX_STAT_TX_FILTERED; break; default: @@ -1387,15 +1391,6 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, bool send_eosp_ndp = false; spin_lock_bh(&mvmsta->lock); - if (iwl_mvm_is_dqa_supported(mvm)) { - enum iwl_mvm_agg_state state; - - state = mvmsta->tid_data[tid].state; - txq_agg = (state == IWL_AGG_ON || - state == IWL_EMPTYING_HW_QUEUE_DELBA); - } else { - txq_agg = txq_id >= mvm->first_agg_queue; - } if (!is_ndp) { tid_data->next_reclaimed = next_reclaimed; @@ -1452,11 +1447,11 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, * If the txq is not an AMPDU queue, there is no chance we freed * several skbs. Check that out... */ - if (txq_agg) + if (iwl_mvm_is_dqa_supported(mvm) || txq_id >= mvm->first_agg_queue) goto out; /* We can't free more than one frame at once on a shared queue */ - WARN_ON(!iwl_mvm_is_dqa_supported(mvm) && (skb_freed > 1)); + WARN_ON(skb_freed > 1); /* If we have still frames for this STA nothing to do here */ if (!atomic_sub_and_test(skb_freed, &mvm->pending_frames[sta_id])) From 4e841d3eb9294ce4137fdb5d0a88f1bceab9c212 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:21 -0800 Subject: [PATCH 20/85] mwifiex: pcie: don't leak DMA buffers when removing When PCIe FLR support was added, much of the remove/release code for PCIe was migrated to ->down_dev(), but ->down_dev() is never called for device removal. Let's refactor the cleanup to be done in both cases. Also, drop the comments above mwifiex_cleanup_pcie(), because they were clearly wrong, and it's better to have clear and obvious code than to detail the code steps in comments anyway. Fixes: 4c5dae59d2e9 ("mwifiex: add PCIe function level reset support") Cc: Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/pcie.c | 38 ++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c index a0d918094889..b8c990d10d6e 100644 --- a/drivers/net/wireless/marvell/mwifiex/pcie.c +++ b/drivers/net/wireless/marvell/mwifiex/pcie.c @@ -2739,6 +2739,21 @@ static void mwifiex_pcie_device_dump(struct mwifiex_adapter *adapter) schedule_work(&card->work); } +static void mwifiex_pcie_free_buffers(struct mwifiex_adapter *adapter) +{ + struct pcie_service_card *card = adapter->card; + const struct mwifiex_pcie_card_reg *reg = card->pcie.reg; + + if (reg->sleep_cookie) + mwifiex_pcie_delete_sleep_cookie_buf(adapter); + + mwifiex_pcie_delete_cmdrsp_buf(adapter); + mwifiex_pcie_delete_evtbd_ring(adapter); + mwifiex_pcie_delete_rxbd_ring(adapter); + mwifiex_pcie_delete_txbd_ring(adapter); + card->cmdrsp_buf = NULL; +} + /* * This function initializes the PCI-E host memory space, WCB rings, etc. * @@ -2850,13 +2865,6 @@ err_enable_dev: /* * This function cleans up the allocated card buffers. - * - * The following are freed by this function - - * - TXBD ring buffers - * - RXBD ring buffers - * - Event BD ring buffers - * - Command response ring buffer - * - Sleep cookie buffer */ static void mwifiex_cleanup_pcie(struct mwifiex_adapter *adapter) { @@ -2875,6 +2883,8 @@ static void mwifiex_cleanup_pcie(struct mwifiex_adapter *adapter) "Failed to write driver not-ready signature\n"); } + mwifiex_pcie_free_buffers(adapter); + if (pdev) { pci_iounmap(pdev, card->pci_mmap); pci_iounmap(pdev, card->pci_mmap1); @@ -3126,10 +3136,7 @@ err_cre_txbd: pci_iounmap(pdev, card->pci_mmap1); } -/* This function cleans up the PCI-E host memory space. - * Some code is extracted from mwifiex_unregister_dev() - * - */ +/* This function cleans up the PCI-E host memory space. */ static void mwifiex_pcie_down_dev(struct mwifiex_adapter *adapter) { struct pcie_service_card *card = adapter->card; @@ -3140,14 +3147,7 @@ static void mwifiex_pcie_down_dev(struct mwifiex_adapter *adapter) adapter->seq_num = 0; - if (reg->sleep_cookie) - mwifiex_pcie_delete_sleep_cookie_buf(adapter); - - mwifiex_pcie_delete_cmdrsp_buf(adapter); - mwifiex_pcie_delete_evtbd_ring(adapter); - mwifiex_pcie_delete_rxbd_ring(adapter); - mwifiex_pcie_delete_txbd_ring(adapter); - card->cmdrsp_buf = NULL; + mwifiex_pcie_free_buffers(adapter); } static struct mwifiex_if_ops pcie_ops = { From ba1c7e45ec224cc8d2df33ecaee1946d48e79231 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:22 -0800 Subject: [PATCH 21/85] mwifiex: set adapter->dev before starting to use mwifiex_dbg() The mwifiex_dbg() log handler utilizes the struct device in adapter->dev. Without it, it decides not to print anything. As of commit 2e02b5814217 ("mwifiex: Allow mwifiex early access to device structure"), we started assigning that pointer only after we finished mwifiex_register() -- this effectively neuters any mwifiex_dbg() logging done before this point. Let's move the device assignment into mwifiex_register(). Fixes: 2e02b5814217 ("mwifiex: Allow mwifiex early access to device structure") Cc: Rajat Jain Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 5ebca1d0cfc7..43d040e02e4d 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -57,8 +57,8 @@ MODULE_PARM_DESC(mfg_mode, "manufacturing mode enable:1, disable:0"); * In case of any errors during inittialization, this function also ensures * proper cleanup before exiting. */ -static int mwifiex_register(void *card, struct mwifiex_if_ops *if_ops, - void **padapter) +static int mwifiex_register(void *card, struct device *dev, + struct mwifiex_if_ops *if_ops, void **padapter) { struct mwifiex_adapter *adapter; int i; @@ -68,6 +68,7 @@ static int mwifiex_register(void *card, struct mwifiex_if_ops *if_ops, return -ENOMEM; *padapter = adapter; + adapter->dev = dev; adapter->card = card; /* Save interface specific operations in adapter */ @@ -1568,12 +1569,11 @@ mwifiex_add_card(void *card, struct completion *fw_done, { struct mwifiex_adapter *adapter; - if (mwifiex_register(card, if_ops, (void **)&adapter)) { + if (mwifiex_register(card, dev, if_ops, (void **)&adapter)) { pr_err("%s: software init failed\n", __func__); goto err_init_sw; } - adapter->dev = dev; mwifiex_probe_of(adapter); adapter->iface_type = iface_type; From 36908c4e5b1063eff3e11336fab544a76c625b69 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 10 Mar 2017 17:39:23 -0800 Subject: [PATCH 22/85] mwifiex: uninit wakeup info when removing device We manually init wakeup info, but we don't detach it on device removal. This means that if we (for example) rmmod + modprobe the driver, the device framework might return -EEXIST the second time, and we'll complain in the logs: [ 839.311881] mwifiex_pcie 0000:01:00.0: fail to init wakeup for mwifiex AFAICT, there's no other negative effect. But we can fix this by disabling wakeup on remove, similar to what a few other drivers do (e.g., the power supply framework). This code (and bug) has existed on SDIO for a while, but it got moved around and enabled for PCIe with commit 853402a00823 ("mwifiex: Enable WoWLAN for both sdio and pcie"). Signed-off-by: Brian Norris Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 43d040e02e4d..b62e03d11c2e 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1718,6 +1718,9 @@ int mwifiex_remove_card(struct mwifiex_adapter *adapter) wiphy_unregister(adapter->wiphy); wiphy_free(adapter->wiphy); + if (adapter->irq_wakeup >= 0) + device_init_wakeup(adapter->dev, false); + /* Unregister device */ mwifiex_dbg(adapter, INFO, "info: unregister device\n"); From cf8c44d42c4f4f38468a53e9ce2a0314e7ebeaa1 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Tue, 28 Feb 2017 18:54:31 +0530 Subject: [PATCH 23/85] MAINTAINERS: update for mwifiex driver maintainers Ganapathi & Xinming are starting to take a more active role in the mwifiex driver maintainership here onwards on account of organizational changes. CC: Xinming Hu CC: Ganapathi Bhat Signed-off-by: Amitkumar Karwar Signed-off-by: Nishant Sarmukadam Signed-off-by: Cathy Luo Signed-off-by: Kalle Valo --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 33875e07482b..078c38217daa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7846,6 +7846,8 @@ F: drivers/net/ethernet/marvell/mvneta.* MARVELL MWIFIEX WIRELESS DRIVER M: Amitkumar Karwar M: Nishant Sarmukadam +M: Ganapathi Bhat +M: Xinming Hu L: linux-wireless@vger.kernel.org S: Maintained F: drivers/net/wireless/marvell/mwifiex/ From ea90e0dc8cecba6359b481e24d9c37160f6f524f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Mar 2017 14:26:04 +0100 Subject: [PATCH 24/85] nl80211: fix dumpit error path RTNL deadlocks Sowmini pointed out Dmitry's RTNL deadlock report to me, and it turns out to be perfectly accurate - there are various error paths that miss unlock of the RTNL. To fix those, change the locking a bit to not be conditional in all those nl80211_prepare_*_dump() functions, but make those require the RTNL to start with, and fix the buggy error paths. This also let me use sparse (by appropriately overriding the rtnl_lock/rtnl_unlock functions) to validate the changes. Cc: stable@vger.kernel.org Reported-by: Sowmini Varadhan Reported-by: Dmitry Vyukov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 127 ++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 71 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7f8be4e321a..2312dc2ffdb9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -545,22 +545,18 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, { int err; - rtnl_lock(); - if (!cb->args[0]) { err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, genl_family_attrbuf(&nl80211_fam), nl80211_fam.maxattr, nl80211_policy); if (err) - goto out_unlock; + return err; *wdev = __cfg80211_wdev_from_attrs( sock_net(skb->sk), genl_family_attrbuf(&nl80211_fam)); - if (IS_ERR(*wdev)) { - err = PTR_ERR(*wdev); - goto out_unlock; - } + if (IS_ERR(*wdev)) + return PTR_ERR(*wdev); *rdev = wiphy_to_rdev((*wdev)->wiphy); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wiphy_idx + 1; @@ -570,10 +566,8 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); struct wireless_dev *tmp; - if (!wiphy) { - err = -ENODEV; - goto out_unlock; - } + if (!wiphy) + return -ENODEV; *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -584,21 +578,11 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, } } - if (!*wdev) { - err = -ENODEV; - goto out_unlock; - } + if (!*wdev) + return -ENODEV; } return 0; - out_unlock: - rtnl_unlock(); - return err; -} - -static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev) -{ - rtnl_unlock(); } /* IE validation */ @@ -2608,17 +2592,17 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * int filter_wiphy = -1; struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; + int ret; rtnl_lock(); if (!cb->args[2]) { struct nl80211_dump_wiphy_state state = { .filter_wiphy = -1, }; - int ret; ret = nl80211_dump_wiphy_parse(skb, cb, &state); if (ret) - return ret; + goto out_unlock; filter_wiphy = state.filter_wiphy; @@ -2663,12 +2647,14 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; } out: - rtnl_unlock(); - cb->args[0] = wp_idx; cb->args[1] = if_idx; - return skb->len; + ret = skb->len; + out_unlock: + rtnl_unlock(); + + return ret; } static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) @@ -4452,9 +4438,10 @@ static int nl80211_dump_station(struct sk_buff *skb, int sta_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out_err; if (!wdev->netdev) { err = -EINVAL; @@ -4489,7 +4476,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -5275,9 +5262,10 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int path_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out_err; if (!rdev->ops->dump_mpath) { err = -EOPNOTSUPP; @@ -5310,7 +5298,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -5470,9 +5458,10 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int path_idx = cb->args[2]; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out_err; if (!rdev->ops->dump_mpp) { err = -EOPNOTSUPP; @@ -5505,7 +5494,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return err; } @@ -7674,9 +7663,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[2], idx = 0; int err; + rtnl_lock(); err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); - if (err) + if (err) { + rtnl_unlock(); return err; + } wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); @@ -7699,7 +7691,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) wdev_unlock(wdev); cb->args[2] = idx; - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return skb->len; } @@ -7784,9 +7776,10 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) int res; bool radio_stats; + rtnl_lock(); res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); if (res) - return res; + goto out_err; /* prepare_wdev_dump parsed the attributes */ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS]; @@ -7827,7 +7820,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) cb->args[2] = survey_idx; res = skb->len; out_err: - nl80211_finish_wdev_dump(rdev); + rtnl_unlock(); return res; } @@ -11508,17 +11501,13 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, void *data = NULL; unsigned int data_len = 0; - rtnl_lock(); - if (cb->args[0]) { /* subtract the 1 again here */ struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); struct wireless_dev *tmp; - if (!wiphy) { - err = -ENODEV; - goto out_unlock; - } + if (!wiphy) + return -ENODEV; *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -11538,23 +11527,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, nl80211_fam.maxattr, nl80211_policy); if (err) - goto out_unlock; + return err; if (!attrbuf[NL80211_ATTR_VENDOR_ID] || - !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) { - err = -EINVAL; - goto out_unlock; - } + !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) + return -EINVAL; *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); if (IS_ERR(*wdev)) *wdev = NULL; *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf); - if (IS_ERR(*rdev)) { - err = PTR_ERR(*rdev); - goto out_unlock; - } + if (IS_ERR(*rdev)) + return PTR_ERR(*rdev); vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]); subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]); @@ -11567,19 +11552,15 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd) continue; - if (!vcmd->dumpit) { - err = -EOPNOTSUPP; - goto out_unlock; - } + if (!vcmd->dumpit) + return -EOPNOTSUPP; vcmd_idx = i; break; } - if (vcmd_idx < 0) { - err = -EOPNOTSUPP; - goto out_unlock; - } + if (vcmd_idx < 0) + return -EOPNOTSUPP; if (attrbuf[NL80211_ATTR_VENDOR_DATA]) { data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]); @@ -11596,9 +11577,6 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, /* keep rtnl locked in successful case */ return 0; - out_unlock: - rtnl_unlock(); - return err; } static int nl80211_vendor_cmd_dump(struct sk_buff *skb, @@ -11613,9 +11591,10 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, int err; struct nlattr *vendor_data; + rtnl_lock(); err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev); if (err) - return err; + goto out; vcmd_idx = cb->args[2]; data = (void *)cb->args[3]; @@ -11624,15 +11603,21 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV | WIPHY_VENDOR_CMD_NEED_NETDEV)) { - if (!wdev) - return -EINVAL; + if (!wdev) { + err = -EINVAL; + goto out; + } if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV && - !wdev->netdev) - return -EINVAL; + !wdev->netdev) { + err = -EINVAL; + goto out; + } if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) { - if (!wdev_running(wdev)) - return -ENETDOWN; + if (!wdev_running(wdev)) { + err = -ENETDOWN; + goto out; + } } } From 8f3dbfd79ed9ef9770305a7cc4e13dfd31ad2cd0 Mon Sep 17 00:00:00 2001 From: Kris Murphy Date: Thu, 16 Mar 2017 10:51:28 -0500 Subject: [PATCH 25/85] openvswitch: Add missing case OVS_TUNNEL_KEY_ATTR_PAD Added a case for OVS_TUNNEL_KEY_ATTR_PAD to the switch statement in ip_tun_from_nlattr in order to prevent the default case returning an error. Fixes: b46f6ded906e ("libnl: nla_put_be64(): align on a 64-bit area") Signed-off-by: Kris Murphy Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a08ff834676b..1105a838bab8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -665,6 +665,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_VXLAN_OPT; opts_type = type; break; + case OVS_TUNNEL_KEY_ATTR_PAD: + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); From 4cbe4dac82e423ecc9a0ba46af24a860853259f4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 13 Mar 2017 19:29:08 +0200 Subject: [PATCH 26/85] net/mlx4_core: Avoid delays during VF driver device shutdown Some Hypervisors detach VFs from VMs by instantly causing an FLR event to be generated for a VF. In the mlx4 case, this will cause that VF's comm channel to be disabled before the VM has an opportunity to invoke the VF device's "shutdown" method. For such Hypervisors, there is a race condition between the VF's shutdown method and its internal-error detection/reset thread. The internal-error detection/reset thread (which runs every 5 seconds) also detects a disabled comm channel. If the internal-error detection/reset flow wins the race, we still get delays (while that flow tries repeatedly to detect comm-channel recovery). The cited commit fixed the command timeout problem when the internal-error detection/reset flow loses the race. This commit avoids the unneeded delays when the internal-error detection/reset flow wins. Fixes: d585df1c5ccf ("net/mlx4_core: Avoid command timeouts during VF driver device shutdown") Signed-off-by: Jack Morgenstein Reported-by: Simon Xiao Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 11 +++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 11 +++++++++++ include/linux/mlx4/device.h | 1 + 3 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index e8c105164931..0e0fa7030565 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2305,6 +2305,17 @@ static int sync_toggles(struct mlx4_dev *dev) rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { /* PCI might be offline */ + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) { + mlx4_warn(dev, + "communication channel is offline\n"); + return -EIO; + } + msleep(100); wr_toggle = swab32(readl(&priv->mfunc.comm-> slave_write)); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 21377c315083..703205475524 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1940,6 +1940,14 @@ static int mlx4_comm_check_offline(struct mlx4_dev *dev) (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); if (!offline_bit) return 0; + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) + break; + /* There are cases as part of AER/Reset flow that PF needs * around 100 msec to load. We therefore sleep for 100 msec * to allow other tasks to make use of that CPU during this @@ -3955,6 +3963,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(priv); int active_vfs = 0; + if (mlx4_is_slave(dev)) + persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT; + mutex_lock(&persist->interface_state_mutex); persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; mutex_unlock(&persist->interface_state_mutex); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7e66e4f62858..1beb1ec2fbdf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -476,6 +476,7 @@ enum { enum { MLX4_INTERFACE_STATE_UP = 1 << 0, MLX4_INTERFACE_STATE_DELETION = 1 << 1, + MLX4_INTERFACE_STATE_NOWAIT = 1 << 2, }; #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ From bc9ab9231ec8c08352ea860480523d88a221a68f Mon Sep 17 00:00:00 2001 From: David Arcari Date: Mon, 13 Mar 2017 19:07:16 -0400 Subject: [PATCH 27/85] net: ethernet: aquantia: set net_device mtu when mtu is changed When the aquantia device mtu is changed the net_device structure is not updated. As a result the ip command does not properly reflect the mtu change. Commit 5513e16421cb incorrectly assumed that __dev_set_mtu() was making the assignment ndev->mtu = new_mtu; This is not true in the case where the driver has a ndo_change_mtu routine. Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu") Cc: Pavel Belous Signed-off-by: David Arcari Tested-by: Pavel Belous Reviewed-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_main.c b/drivers/net/ethernet/aquantia/atlantic/aq_main.c index dad63623be6a..d05fbfdce5e5 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_main.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_main.c @@ -98,6 +98,7 @@ static int aq_ndev_change_mtu(struct net_device *ndev, int new_mtu) if (err < 0) goto err_exit; + ndev->mtu = new_mtu; if (netif_running(ndev)) { aq_ndev_close(ndev); From 61733c91c454a61be0ffc93fe46a5d5f2f048c1c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Mar 2017 16:49:10 -0700 Subject: [PATCH 28/85] net: mpls: Fix nexthop alive tracking on down events Alive tracking of nexthops can account for a link twice if the carrier goes down followed by an admin down of the same link rendering multipath routes useless. This is similar to 79099aab38c8 for UNREGISTER events and DOWN events. Fix by tracking number of alive nexthops in mpls_ifdown similar to the logic in mpls_ifup. Checking the flags per nexthop once after all events have been processed is simpler than trying to maintian a running count through all event combinations. Also, WRITE_ONCE is used instead of ACCESS_ONCE to set rt_nhn_alive per a comment from checkpatch: WARNING: Prefer WRITE_ONCE(, ) over ACCESS_ONCE() = Fixes: c89359a42e2a4 ("mpls: support for dead routes") Signed-off-by: David Ahern Acked-by: Robert Shearman Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 33211f9a2656..6414079aa729 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1269,6 +1269,8 @@ static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); + unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN; + unsigned int alive; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -1278,9 +1280,11 @@ static void mpls_ifdown(struct net_device *dev, int event) if (!rt) continue; + alive = 0; change_nexthops(rt) { if (rtnl_dereference(nh->nh_dev) != dev) - continue; + goto next; + switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -1288,13 +1292,16 @@ static void mpls_ifdown(struct net_device *dev, int event) /* fall through */ case NETDEV_CHANGE: nh->nh_flags |= RTNH_F_LINKDOWN; - if (event != NETDEV_UNREGISTER) - ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; break; } if (event == NETDEV_UNREGISTER) RCU_INIT_POINTER(nh->nh_dev, NULL); +next: + if (!(nh->nh_flags & nh_flags)) + alive++; } endfor_nexthops(rt); + + WRITE_ONCE(rt->rt_nhn_alive, alive); } } From 4ee39733fbecf04cf9f346de2d64788c35028079 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 15 Mar 2017 18:14:33 -0700 Subject: [PATCH 29/85] net: ipv6: set route type for anycast routes Anycast routes have the RTF_ANYCAST flag set, but when dumping routes for userspace the route type is not set to RTN_ANYCAST. Make it so. Fixes: 58c4fb86eabcb ("[IPV6]: Flag RTF_ANYCAST for anycast routes") CC: Hideaki YOSHIFUJI Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 35c58b669ebd..9db1418993f2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3423,6 +3423,8 @@ static int rt6_fill_node(struct net *net, } else if (rt->rt6i_flags & RTF_LOCAL) rtm->rtm_type = RTN_LOCAL; + else if (rt->rt6i_flags & RTF_ANYCAST) + rtm->rtm_type = RTN_ANYCAST; else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else From 9501df3cd9204f5859f649182431616a31ee88a1 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 15 Mar 2017 23:38:07 -0400 Subject: [PATCH 30/85] ibmvnic: Free tx/rx scrq pointer array when releasing sub-crqs The pointer array for the tx/rx sub crqs should be free'ed when releasing the tx/rx sub crqs. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5f11b4dc95d2..b23d6545f835 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1257,6 +1257,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->tx_scrq[i]); } + kfree(adapter->tx_scrq); adapter->tx_scrq = NULL; } @@ -1269,6 +1270,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->rx_scrq[i]); } + kfree(adapter->rx_scrq); adapter->rx_scrq = NULL; } } From 4d4a6ac73e7466c2085c307fac41f74ce4568a45 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:10 +0000 Subject: [PATCH 31/85] rxrpc: Ignore BUSY packets on old calls If we receive a BUSY packet for a call we think we've just completed, the packet is handed off to the connection processor to deal with - but the connection processor doesn't expect a BUSY packet and so flags a protocol error. Fix this by simply ignoring the BUSY packet for the moment. The symptom of this may appear as a system call failing with EPROTO. This may be triggered by pressing ctrl-C under some circumstances. This comes about we abort calls due to interruption by a signal (which we shouldn't do, but that's going to be a large fix and mostly in fs/afs/). What happens is that we abort the call and may also abort follow up calls too (this needs offloading somehoe). So we see a transmission of something like the following sequence of packets: DATA for call N ABORT call N DATA for call N+1 ABORT call N+1 in very quick succession on the same channel. However, the peer may have deferred the processing of the ABORT from the call N to a background thread and thus sees the DATA message from the call N+1 coming in before it has cleared the channel. Thus it sends a BUSY packet[*]. [*] Note that some implementations (OpenAFS, for example) mark the BUSY packet with one plus the callNumber of the call prior to call N. Ordinarily, this would be call N, but there's no requirement for the calls on a channel to be numbered strictly sequentially (the number is required to increase). This is wrong and means that the callNumber in the BUSY packet should be ignored (it really ought to be N+1 since that's what it's in response to). Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/conn_event.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 3f9d8d7ec632..b099b64366f3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -275,6 +275,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, rxrpc_conn_retransmit_call(conn, skb); return 0; + case RXRPC_PACKET_TYPE_BUSY: + /* Just ignore BUSY packets for now. */ + return 0; + case RXRPC_PACKET_TYPE_ABORT: if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &wtmp, sizeof(wtmp)) < 0) From d12c917691b45d9dffcfe7c2362d25caa40905fd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 16 Mar 2017 10:32:42 -0700 Subject: [PATCH 32/85] bridge: resolve a false alarm of lockdep Andrei reported a false alarm of lockdep at net/bridge/br_fdb.c:109, this is because in Andrei's case, a spin_bug() was already triggered before this, therefore the debug_locks is turned off, lockdep_is_held() is no longer accurate after that. We should use lockdep_assert_held_once() instead of lockdep_is_held() to respect debug_locks. Fixes: 410b3d48f5111 ("bridge: fdb: add proper lock checks in searching functions") Reported-by: Andrei Vagin Signed-off-by: Cong Wang Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 2 +- net/bridge/br_private.h | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4f598dc2d916..6e08b7199dd7 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -106,7 +106,7 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; - WARN_ON_ONCE(!br_hash_lock_held(br)); + lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); fdb = fdb_find_rcu(head, addr, vid); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2288fca7756c..61368186edea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -531,15 +531,6 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); -static inline bool br_hash_lock_held(struct net_bridge *br) -{ -#ifdef CONFIG_LOCKDEP - return lockdep_is_held(&br->hash_lock); -#else - return true; -#endif -} - /* br_forward.c */ enum br_pkt_type { BR_PKT_UNICAST, From e14b4db7a567ff507453ecd9c64da51bbc2b6d23 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 16 Mar 2017 12:21:32 -0700 Subject: [PATCH 33/85] netvsc: fix race during initialization When device is being setup on boot, there is a small race where network device callback is registered, but the netvsc_device pointer is not set yet. This can cause a NULL ptr dereference if packet arrives during this window. Fixes: 46b4f7f5d1f7 ("netvsc: eliminate per-device outstanding send counter") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 4c1d8cca247b..8dd0b8770328 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1231,8 +1231,11 @@ void netvsc_channel_cb(void *context) return; net_device = net_device_to_netvsc_device(ndev); - if (unlikely(net_device->destroy) && - netvsc_channel_idle(net_device, q_idx)) + if (unlikely(!net_device)) + return; + + if (unlikely(net_device->destroy && + netvsc_channel_idle(net_device, q_idx))) return; /* commit_rd_index() -> hv_signal_on_read() needs this. */ From db7f00b8dba6d687b6ab1f2e9309acfd214fcb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2017 15:43:19 -0700 Subject: [PATCH 34/85] tcp: tcp_get_info() should read tcp_time_stamp later Commit b369e7fd41f7 ("tcp: make TCP_INFO more consistent") moved lock_sock_fast() earlier in tcp_get_info() This has the minor effect that jiffies value being sampled at the beginning of tcp_get_info() is more likely to be off by one, and we report big tcpi_last_data_sent values (like 0xFFFFFFFF). Since we lock the socket, fetching tcp_time_stamp right before doing the jiffies_to_msecs() calls is enough to remove these wrong values. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cf4555581282..1e319a525d51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2770,7 +2770,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp, intv; + u32 now, intv; u64 rate64; bool slow; u32 rate; @@ -2839,6 +2839,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; + now = tcp_time_stamp; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); From 6be3b6cce1e225f189b68b4e84fc711d19b4277b Mon Sep 17 00:00:00 2001 From: Ryan Hsu Date: Mon, 13 Mar 2017 15:49:03 -0700 Subject: [PATCH 35/85] ath10k: fix incorrect wlan_mac_base in qca6174_regs In the 'commit ebee76f7fa46 ("ath10k: allow setting coverage class")', it inherits the design and the address offset from ath9k, but the address is not applicable to QCA6174, which leads to a random crash while doing the resume() operation, since the set_coverage_class.ops will be called from ieee80211_reconfig() when resume() (if the wow is not configured). Fix the incorrect address offset here to avoid the random crash. Verified on QCA6174/hw3.0 with firmware WLAN.RM.4.4-00022-QCARMSWPZ-2. kvalo: this also seems to fix a regression with firmware restart. Fixes: ebee76f7fa46 ("ath10k: allow setting coverage class") Cc: # v4.10 Signed-off-by: Ryan Hsu Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c index 33fb26833cd0..d9f37ee4bfdd 100644 --- a/drivers/net/wireless/ath/ath10k/hw.c +++ b/drivers/net/wireless/ath/ath10k/hw.c @@ -51,7 +51,7 @@ const struct ath10k_hw_regs qca6174_regs = { .rtc_soc_base_address = 0x00000800, .rtc_wmac_base_address = 0x00001000, .soc_core_base_address = 0x0003a000, - .wlan_mac_base_address = 0x00020000, + .wlan_mac_base_address = 0x00010000, .ce_wrapper_base_address = 0x00034000, .ce0_base_address = 0x00034400, .ce1_base_address = 0x00034800, From 98d068ab52b4b11d403995ed14154660797e7136 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Tue, 14 Mar 2017 14:15:20 +0800 Subject: [PATCH 36/85] r8152: fix the list rx_done may be used without initialization The list rx_done would be initialized when the linking on occurs. Therefore, if a napi is scheduled without any linking on before, the following kernel panic would happen. BUG: unable to handle kernel NULL pointer dereference at 000000000000008 IP: [] r8152_poll+0xe1e/0x1210 [r8152] PGD 0 Oops: 0002 [#1] SMP Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 986243c932cc..bb3eedd07fbe 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1362,6 +1362,7 @@ static int alloc_all_mem(struct r8152 *tp) spin_lock_init(&tp->rx_lock); spin_lock_init(&tp->tx_lock); INIT_LIST_HEAD(&tp->tx_free); + INIT_LIST_HEAD(&tp->rx_done); skb_queue_head_init(&tp->tx_queue); skb_queue_head_init(&tp->rx_queue); From 8a0f5ccfb33b0b8b51de65b7b3bf342ba10b4fb6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 14 Mar 2017 18:25:57 +0800 Subject: [PATCH 37/85] crypto: deadlock between crypto_alg_sem/rtnl_mutex/genl_mutex On Tue, Mar 14, 2017 at 10:44:10AM +0100, Dmitry Vyukov wrote: > > Yes, please. > Disregarding some reports is not a good way long term. Please try this patch. ---8<--- Subject: netlink: Annotate nlk cb_mutex by protocol Currently all occurences of nlk->cb_mutex are annotated by lockdep as a single class. This causes a false lcokdep cycle involving genl and crypto_user. This patch fixes it by dividing cb_mutex into individual classes based on the netlink protocol. As genl and crypto_user do not use the same netlink protocol this breaks the false dependency loop. Reported-by: Dmitry Vyukov Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7b73c7c161a9..596eaff66649 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -96,6 +96,44 @@ EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); +static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; + +static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { + "nlk_cb_mutex-ROUTE", + "nlk_cb_mutex-1", + "nlk_cb_mutex-USERSOCK", + "nlk_cb_mutex-FIREWALL", + "nlk_cb_mutex-SOCK_DIAG", + "nlk_cb_mutex-NFLOG", + "nlk_cb_mutex-XFRM", + "nlk_cb_mutex-SELINUX", + "nlk_cb_mutex-ISCSI", + "nlk_cb_mutex-AUDIT", + "nlk_cb_mutex-FIB_LOOKUP", + "nlk_cb_mutex-CONNECTOR", + "nlk_cb_mutex-NETFILTER", + "nlk_cb_mutex-IP6_FW", + "nlk_cb_mutex-DNRTMSG", + "nlk_cb_mutex-KOBJECT_UEVENT", + "nlk_cb_mutex-GENERIC", + "nlk_cb_mutex-17", + "nlk_cb_mutex-SCSITRANSPORT", + "nlk_cb_mutex-ECRYPTFS", + "nlk_cb_mutex-RDMA", + "nlk_cb_mutex-CRYPTO", + "nlk_cb_mutex-SMC", + "nlk_cb_mutex-23", + "nlk_cb_mutex-24", + "nlk_cb_mutex-25", + "nlk_cb_mutex-26", + "nlk_cb_mutex-27", + "nlk_cb_mutex-28", + "nlk_cb_mutex-29", + "nlk_cb_mutex-30", + "nlk_cb_mutex-31", + "nlk_cb_mutex-MAX_LINKS" +}; + static int netlink_dump(struct sock *sk); static void netlink_skb_destructor(struct sk_buff *skb); @@ -585,6 +623,9 @@ static int __netlink_create(struct net *net, struct socket *sock, } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); + lockdep_set_class_and_name(nlk->cb_mutex, + nlk_cb_mutex_keys + protocol, + nlk_cb_mutex_key_strings[protocol]); } init_waitqueue_head(&nlk->wait); From 36d277bac8080202684e67162ebb157f16631581 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:14 +0800 Subject: [PATCH 38/85] vsock: track pkt owner vsock So that we can cancel a queued pkt later if necessary. Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 3 +++ net/vmw_vsock/virtio_transport_common.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 9638bfeb0d1f..584f9a647ad4 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -48,6 +48,8 @@ struct virtio_vsock_pkt { struct virtio_vsock_hdr hdr; struct work_struct work; struct list_head list; + /* socket refcnt not held, only use for cancellation */ + struct vsock_sock *vsk; void *buf; u32 len; u32 off; @@ -56,6 +58,7 @@ struct virtio_vsock_pkt { struct virtio_vsock_pkt_info { u32 remote_cid, remote_port; + struct vsock_sock *vsk; struct msghdr *msg; u32 pkt_len; u16 type; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 8d592a45b597..af087b44ceea 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -58,6 +58,7 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, pkt->len = len; pkt->hdr.len = cpu_to_le32(len); pkt->reply = info->reply; + pkt->vsk = info->vsk; if (info->msg && len > 0) { pkt->buf = kmalloc(len, GFP_KERNEL); @@ -180,6 +181,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .type = type, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -519,6 +521,7 @@ int virtio_transport_connect(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .type = VIRTIO_VSOCK_TYPE_STREAM, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -534,6 +537,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -560,6 +564,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); @@ -581,6 +586,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk, .op = VIRTIO_VSOCK_OP_RST, .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, + .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ @@ -826,6 +832,7 @@ virtio_transport_send_response(struct vsock_sock *vsk, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, + .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); From 16320f363ae128d9b9c70e60f00f2a572f57c23d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:15 +0800 Subject: [PATCH 39/85] vhost-vsock: add pkt cancel capability To allow canceling all packets of a connection. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/net/af_vsock.h | 3 +++ 2 files changed, 44 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ce5e63d2c66a..44eed8eb0725 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -223,6 +223,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) return len; } +static int +vhost_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct vhost_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + LIST_HEAD(freeme); + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); + if (!vsock) + return -ENODEV; + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) + vhost_poll_queue(&tx_vq->poll); + } + + return 0; +} + static struct virtio_vsock_pkt * vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, unsigned int out, unsigned int in) @@ -675,6 +715,7 @@ static struct virtio_transport vhost_transport = { .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, + .cancel_pkt = vhost_transport_cancel_pkt, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_dequeue = virtio_transport_dgram_dequeue, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f2758964ce6f..f32ed9ac181a 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,9 @@ struct vsock_transport { void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); + /* Cancel all pending packets sent on vsock. */ + int (*cancel_pkt)(struct vsock_sock *vsk); + /* Connections. */ int (*connect)(struct vsock_sock *); From 073b4f2c50fe67c7c66a059a4d6db52bb1465490 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:16 +0800 Subject: [PATCH 40/85] vsock: add pkt cancel capability Reviewed-by: Stefan Hajnoczi Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 9d24c0e958b1..68675a151f22 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -213,6 +213,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) return len; } +static int +virtio_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct virtio_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + LIST_HEAD(freeme); + + vsock = virtio_vsock_get(); + if (!vsock) { + return -ENODEV; + } + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) && + new_cnt < virtqueue_get_vring_size(rx_vq)) + queue_work(virtio_vsock_workqueue, &vsock->rx_work); + } + + return 0; +} + static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; @@ -462,6 +503,7 @@ static struct virtio_transport virtio_transport = { .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, + .cancel_pkt = virtio_transport_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, From 380feae0def7e6a115124a3219c3ec9b654dca32 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:17 +0800 Subject: [PATCH 41/85] vsock: cancel packets when failing to connect Otherwise we'll leave the packets queued until releasing vsock device. E.g., if guest is slow to start up, resulting ETIMEDOUT on connect, guest will get the connect requests from failed host sockets. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9f770f33c100..6f7f6757ceef 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1102,10 +1102,19 @@ static const struct proto_ops vsock_dgram_ops = { .sendpage = sock_no_sendpage, }; +static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) +{ + if (!transport->cancel_pkt) + return -EOPNOTSUPP; + + return transport->cancel_pkt(vsk); +} + static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; + int cancel = 0; vsk = container_of(work, struct vsock_sock, dwork.work); sk = sk_vsock(vsk); @@ -1116,8 +1125,11 @@ static void vsock_connect_timeout(struct work_struct *work) sk->sk_state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); + cancel = 1; } release_sock(sk); + if (cancel) + vsock_transport_cancel_pkt(vsk); sock_put(sk); } @@ -1224,11 +1236,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = sock_intr_errno(timeout); sk->sk_state = SS_UNCONNECTED; sock->state = SS_UNCONNECTED; + vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; sk->sk_state = SS_UNCONNECTED; sock->state = SS_UNCONNECTED; + vsock_transport_cancel_pkt(vsk); goto out_wait; } From 7df9c24625b9981779afb8fcdbe2bb4765e61147 Mon Sep 17 00:00:00 2001 From: Andrey Ulanov Date: Tue, 14 Mar 2017 20:16:42 -0700 Subject: [PATCH 42/85] net: unix: properly re-increment inflight counter of GC discarded candidates Dmitry has reported that a BUG_ON() condition in unix_notinflight() may be triggered by a simple code that forwards unix socket in an SCM_RIGHTS message. That is caused by incorrect unix socket GC implementation in unix_gc(). The GC first collects list of candidates, then (a) decrements their "children's" inflight counter, (b) checks which inflight counters are now 0, and then (c) increments all inflight counters back. (a) and (c) are done by calling scan_children() with inc_inflight or dec_inflight as the second argument. Commit 6209344f5a37 ("net: unix: fix inflight counting bug in garbage collector") changed scan_children() such that it no longer considers sockets that do not have UNIX_GC_CANDIDATE flag. It also added a block of code that that unsets this flag _before_ invoking scan_children(, dec_iflight, ). This may lead to incorrect inflight counters for some sockets. This change fixes this bug by changing order of operations: UNIX_GC_CANDIDATE is now unset only after all inflight counters are restored to the original state. kernel BUG at net/unix/garbage.c:149! RIP: 0010:[] [] unix_notinflight+0x3b4/0x490 net/unix/garbage.c:149 Call Trace: [] unix_detach_fds.isra.19+0xff/0x170 net/unix/af_unix.c:1487 [] unix_destruct_scm+0xf9/0x210 net/unix/af_unix.c:1496 [] skb_release_head_state+0x101/0x200 net/core/skbuff.c:655 [] skb_release_all+0x1a/0x60 net/core/skbuff.c:668 [] __kfree_skb+0x1a/0x30 net/core/skbuff.c:684 [] kfree_skb+0x184/0x570 net/core/skbuff.c:705 [] unix_release_sock+0x5b5/0xbd0 net/unix/af_unix.c:559 [] unix_release+0x49/0x90 net/unix/af_unix.c:836 [] sock_release+0x92/0x1f0 net/socket.c:570 [] sock_close+0x1b/0x20 net/socket.c:1017 [] __fput+0x34e/0x910 fs/file_table.c:208 [] ____fput+0x1a/0x20 fs/file_table.c:244 [] task_work_run+0x1a0/0x280 kernel/task_work.c:116 [< inline >] exit_task_work include/linux/task_work.h:21 [] do_exit+0x183a/0x2640 kernel/exit.c:828 [] do_group_exit+0x14e/0x420 kernel/exit.c:931 [] get_signal+0x663/0x1880 kernel/signal.c:2307 [] do_signal+0xc5/0x2190 arch/x86/kernel/signal.c:807 [] exit_to_usermode_loop+0x1ea/0x2d0 arch/x86/entry/common.c:156 [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [] syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259 [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Link: https://lkml.org/lkml/2017/3/6/252 Signed-off-by: Andrey Ulanov Reported-by: Dmitry Vyukov Fixes: 6209344 ("net: unix: fix inflight counting bug in garbage collector") Signed-off-by: David S. Miller --- net/unix/garbage.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6a0d48525fcf..c36757e72844 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -146,6 +146,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); + BUG_ON(!atomic_long_read(&u->inflight)); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) @@ -341,6 +342,14 @@ void unix_gc(void) } list_del(&cursor); + /* Now gc_candidates contains only garbage. Restore original + * inflight counters for these as well, and remove the skbuffs + * which are creating the cycle(s). + */ + skb_queue_head_init(&hitlist); + list_for_each_entry(u, &gc_candidates, link) + scan_children(&u->sk, inc_inflight, &hitlist); + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ @@ -350,14 +359,6 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, inc_inflight, &hitlist); - spin_unlock(&unix_gc_lock); /* Here we are. Hitlist is filled. Die. */ From 09050957fae896e001498af1aa35c446a11cb47d Mon Sep 17 00:00:00 2001 From: Yaroslav Isakov Date: Thu, 16 Mar 2017 22:44:10 +0300 Subject: [PATCH 43/85] tun: fix inability to set offloads after disabling them via ethtool Added missing logic in tun driver, which prevents apps to set offloads using tun ioctl, if offloads were previously disabled via ethtool Signed-off-by: Yaroslav Isakov Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 34cc3c590aa5..cc88cd7856f5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1931,6 +1931,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) return -EINVAL; tun->set_features = features; + tun->dev->wanted_features &= ~TUN_USER_FEATURES; + tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; From aea92fb2e09e29653b023d4254ac9fbf94221538 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2017 08:05:28 -0700 Subject: [PATCH 44/85] sch_dsmark: fix invalid skb_cow() usage skb_cow(skb, sizeof(ip header)) is not very helpful in this context. First we need to use pskb_may_pull() to make sure the ip header is in skb linear part, then use skb_try_make_writable() to address clones issues. Fixes: 4c30719f4f55 ("[PKT_SCHED] dsmark: handle cloned and non-linear skb's") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 802ac7c2e5e8..5334e309f17f 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -201,9 +201,13 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { + int wlen = skb_network_offset(skb); + switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): - if (skb_cow_head(skb, sizeof(struct iphdr))) + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) goto drop; skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) @@ -211,7 +215,9 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case htons(ETH_P_IPV6): - if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) goto drop; skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) From 6bd845d1cf98b45c634baacb8381436dad3c2dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 17 Mar 2017 17:20:48 +0100 Subject: [PATCH 45/85] qmi_wwan: add Dell DW5811e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a Dell branded Sierra Wireless EM7455. It is operating in MBIM mode by default, but can be configured to provide two QMI/RMNET functions. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 805674550683..f8d55aa058ec 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -925,6 +925,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ + {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)}, /* SIMCom 7230E */ From 13e2d5187f6b965ba3556caedb914baf81b98ed2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Mar 2017 23:52:35 +0300 Subject: [PATCH 46/85] bna: integer overflow bug in debugfs We could allocate less memory than intended because we do: bnad->regdata = kzalloc(len << 2, GFP_KERNEL); The shift can overflow leading to a crash. This is debugfs code so the impact is very small. Fixes: 7afc5dbde091 ("bna: Add debugfs interface.") Signed-off-by: Dan Carpenter Acked-by: Rasesh Mody Signed-off-by: David S. Miller --- drivers/net/ethernet/brocade/bna/bnad_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c index 05c1c1dd7751..cebfe3bd086e 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c +++ b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c @@ -325,7 +325,7 @@ bnad_debugfs_write_regrd(struct file *file, const char __user *buf, return PTR_ERR(kern_buf); rc = sscanf(kern_buf, "%x:%x", &addr, &len); - if (rc < 2) { + if (rc < 2 || len > UINT_MAX >> 2) { netdev_warn(bnad->netdev, "failed to read user buffer\n"); kfree(kern_buf); return -EINVAL; From 3dc857f0e8fc22610a59cbb346ba62c6e921863f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 17 Mar 2017 16:07:11 -0700 Subject: [PATCH 47/85] net: vrf: Reset rt6i_idev in local dst after put The VRF driver takes a reference to the inet6_dev on the VRF device for its rt6_local dst when handling local traffic through the VRF device as a loopback. When the device is deleted the driver does a put on the idev but does not reset rt6i_idev in the rt6_info struct. When the dst is destroyed, dst_destroy calls ip6_dst_destroy which does a second put for what is essentially the same reference causing it to be prematurely freed. Reset rt6i_idev after the put in the vrf driver. Fixes: b4869aa2f881e ("net: vrf: ipv6 support for local traffic to local addresses") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index fea687f35b5a..d6988db1930d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -462,8 +462,10 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) } if (rt6_local) { - if (rt6_local->rt6i_idev) + if (rt6_local->rt6i_idev) { in6_dev_put(rt6_local->rt6i_idev); + rt6_local->rt6i_idev = NULL; + } dst = &rt6_local->dst; dev_put(dst->dev); From 486a43db2e26b87125b5629e1ade516f90833934 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:12:22 +0800 Subject: [PATCH 48/85] sctp: remove temporary variable confirm from sctp_packet_transmit Commit c86a773c7802 ("sctp: add dst_pending_confirm flag") introduced a temporary variable "confirm" in sctp_packet_transmit. But it broke the rule that longer lines should be above shorter ones. Besides, this variable is not necessary, so this patch is to just remove it and use tp->dst_pending_confirm directly. Fixes: c86a773c7802 ("sctp: add dst_pending_confirm flag") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sctp/output.c b/net/sctp/output.c index 71ce6b945dcb..1224421036b3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -546,7 +546,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; - int confirm; struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; @@ -625,13 +624,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; - confirm = tp->dst_pending_confirm; - if (confirm) + if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ - if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm) + if (tp->af_specific->sctp_xmit(head, tp) >= 0 && + tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: From 1f904495b79003cd3d881de8731377d48fcbc7e3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:27:23 +0800 Subject: [PATCH 49/85] sctp: define dst_pending_confirm as a bit in sctp_transport As tp->dst_pending_confirm's value can only be set 0 or 1, this patch is to change to define it as a bit instead of __u32. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 07a0b128625a..4f645198e9bd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -753,6 +753,8 @@ struct sctp_transport { /* Is the Path MTU update pending on this tranport */ pmtu_pending:1, + dst_pending_confirm:1, /* need to confirm neighbour */ + /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; @@ -806,8 +808,6 @@ struct sctp_transport { __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ - __u32 dst_pending_confirm; /* need to confirm neighbour */ - /* Destination */ struct dst_entry *dst; /* Source address. */ From 23bb09cfbe04076ef647da3889a5a5ab6cbe6f15 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 20:03:59 +0800 Subject: [PATCH 50/85] sctp: out_qlen should be updated when pruning unsent queue This patch is to fix the issue that sctp_prsctp_prune_sent forgot to update q->out_qlen when removing a chunk from unsent queue. Fixes: 8dbdf1f5b09c ("sctp: implement prsctp PRIO policy") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index db352e5d61f8..025ccff67072 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -382,17 +382,18 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, } static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, - struct sctp_sndrcvinfo *sinfo, - struct list_head *queue, int msg_len) + struct sctp_sndrcvinfo *sinfo, int msg_len) { + struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; - list_for_each_entry_safe(chk, temp, queue, list) { + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->list); + q->out_qlen -= chk->skb->len; asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; @@ -431,9 +432,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc, return; } - sctp_prsctp_prune_unsent(asoc, sinfo, - &asoc->outqueue.out_chunk_list, - msg_len); + sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ From 8605330aac5a5785630aec8f64378a54891937cc Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:02:59 -0400 Subject: [PATCH 51/85] tcp: fix SCM_TIMESTAMPING_OPT_STATS for normal skbs __sock_recv_timestamp can be called for both normal skbs (for receive timestamps) and for skbs on the error queue (for transmit timestamps). Commit 1c885808e456 (tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING) assumes any skb passed to __sock_recv_timestamp are from the error queue, containing OPT_STATS in the content of the skb. This results in accessing invalid memory or generating junk data. To fix this, set skb->pkt_type to PACKET_OUTGOING for packets on the error queue. This is safe because on the receive path on local sockets skb->pkt_type is never set to PACKET_OUTGOING. With that, copy OPT_STATS from a packet, only if its pkt_type is PACKET_OUTGOING. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++++++++ net/socket.c | 13 ++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cd4ba8c6b609..b1fbd1958eb6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3694,6 +3694,15 @@ static void sock_rmem_free(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue. + */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ @@ -3707,6 +3716,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); diff --git a/net/socket.c b/net/socket.c index e034fe4164be..692d6989d2c2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -652,6 +652,16 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg); +static bool skb_is_err_queue(const struct sk_buff *skb) +{ + /* pkt_type of skbs enqueued on the error queue are set to + * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do + * in recvmsg, since skbs received on a local socket will never + * have a pkt_type of PACKET_OUTGOING. + */ + return skb->pkt_type == PACKET_OUTGOING; +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -695,7 +705,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); - if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + if (skb_is_err_queue(skb) && skb->len && + (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } From 4ef1b2869447411ad3ef91ad7d4891a83c1a509a Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:03:00 -0400 Subject: [PATCH 52/85] tcp: mark skbs with SCM_TIMESTAMPING_OPT_STATS SOF_TIMESTAMPING_OPT_STATS can be enabled and disabled while packets are collected on the error queue. So, checking SOF_TIMESTAMPING_OPT_STATS in sk->sk_tsflags is not enough to safely assume that the skb contains OPT_STATS data. Add a bit in sock_exterr_skb to indicate whether the skb contains opt_stats data. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/errqueue.h | 2 ++ net/core/skbuff.c | 17 +++++++++++------ net/socket.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index 9ca23fcfb5d7..6fdfc884fdeb 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -20,6 +20,8 @@ struct sock_exterr_skb { struct sock_extended_err ee; u16 addr_offset; __be16 port; + u8 opt_stats:1, + unused:7; }; #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b1fbd1958eb6..9f781092fda9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3793,16 +3793,20 @@ EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, - int tstype) + int tstype, + bool opt_stats) { struct sock_exterr_skb *serr; int err; + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && @@ -3843,7 +3847,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, */ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); } } @@ -3854,7 +3858,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly; + bool tsonly, opt_stats = false; if (!sk) return; @@ -3867,9 +3871,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) + sk->sk_type == SOCK_STREAM) { skb = tcp_get_timestamping_opt_stats(sk); - else + opt_stats = true; + } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { @@ -3888,7 +3893,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, else skb->tstamp = ktime_get_real(); - __skb_complete_tx_timestamp(skb, sk, tstype); + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/socket.c b/net/socket.c index 692d6989d2c2..985ef06792d6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -706,7 +706,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, SCM_TIMESTAMPING, sizeof(tss), &tss); if (skb_is_err_queue(skb) && skb->len && - (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } From e8f1f34a344d060eaf1918089369c4c1172a153b Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Sun, 19 Mar 2017 23:03:14 -0700 Subject: [PATCH 53/85] selftests/bpf: fix broken build, take 2 Merge of 'linux-kselftest-4.11-rc1': 1. Partially removed use of 'test_objs' target, breaking force rebuild of BPFOBJ, introduced in commit d498f8719a09 ("bpf: Rebuild bpf.o for any dependency update"). Update target so dependency on BPFOBJ is restored. 2. Introduced commit 2047f1d8ba28 ("selftests: Fix the .c linking rule") which fixes order of LDLIBS. Commit d02d8986a768 ("bpf: Always test unprivileged programs") added libcap dependency into CFLAGS. Use LDLIBS instead to fix linking of test_verifier. 3. Introduced commit d83c3ba0b926 ("selftests: Fix selftests build to just build, not run tests"). Reordering the Makefile allows us to remove the 'all' target. Tested both: selftests/bpf$ make and selftests$ make TARGETS=bpf on Ubuntu 16.04.2. Signed-off-by: Zi Shen Lim Acked-by: Daniel Borkmann Tested-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Acked-by: Shuah Khan Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 67531f47781b..6a1ad58cb66f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,22 +1,23 @@ LIBDIR := ../../../lib -BPFOBJ := $(LIBDIR)/bpf/bpf.o +BPFDIR := $(LIBDIR)/bpf -CFLAGS += -Wall -O2 -lcap -I../../../include/uapi -I$(LIBDIR) $(BPFOBJ) +CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) +LDLIBS += -lcap TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map TEST_PROGS := test_kmod.sh -all: $(TEST_GEN_PROGS) +include ../lib.mk -.PHONY: all clean force +BPFOBJ := $(OUTPUT)/bpf.o + +$(TEST_GEN_PROGS): $(BPFOBJ) + +.PHONY: force # force a rebuild of BPFOBJ when its dependencies are updated force: $(BPFOBJ): force - $(MAKE) -C $(dir $(BPFOBJ)) - -$(test_objs): $(BPFOBJ) - -include ../lib.mk + $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ From 4071898bf0f4d79ff353db327af2a15123272548 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Sun, 19 Mar 2017 09:19:57 -0700 Subject: [PATCH 54/85] net: qmi_wwan: Add USB IDs for MDM6600 modem on Motorola Droid 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gets qmicli working with the MDM6600 modem. Cc: Bjørn Mork Reviewed-by: Sebastian Reichel Tested-by: Sebastian Reichel Signed-off-by: Tony Lindgren Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index f8d55aa058ec..156f7f85e486 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -580,6 +580,10 @@ static const struct usb_device_id products[] = { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, USB_CLASS_VENDOR_SPEC, 0x01, 0x69), .driver_info = (unsigned long)&qmi_wwan_info, }, + { /* Motorola Mapphone devices with MDM6600 */ + USB_VENDOR_AND_INTERFACE_INFO(0x22b8, USB_CLASS_VENDOR_SPEC, 0xfb, 0xff), + .driver_info = (unsigned long)&qmi_wwan_info, + }, /* 2. Combined interface devices matching on class+protocol */ { /* Huawei E367 and possibly others in "Windows mode" */ From a05d4fd9176003e0c1f9c3d083f4dac19fd346ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Mar 2017 19:25:56 -0400 Subject: [PATCH 55/85] cgroup, net_cls: iterate the fds of only the tasks which are being migrated The net_cls controller controls the classid field of each socket which is associated with the cgroup. Because the classid is per-socket attribute, when a task migrates to another cgroup or the configured classid of the cgroup changes, the controller needs to walk all sockets and update the classid value, which was implemented by 3b13758f51de ("cgroups: Allow dynamically changing net_classid"). While the approach is not scalable, migrating tasks which have a lot of fds attached to them is rare and the cost is born by the ones initiating the operations. However, for simplicity, both the migration and classid config change paths call update_classid() which scans all fds of all tasks in the target css. This is an overkill for the migration path which only needs to cover a much smaller subset of tasks which are actually getting migrated in. On cgroup v1, this can lead to unexpected scalability issues when one tries to migrate a task or process into a net_cls cgroup which already contains a lot of fds. Even if the migration traget doesn't have many to get scanned, update_classid() ends up scanning all fds in the target cgroup which can be extremely numerous. Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only for subsystems which are actually affected by migration"), cgroup core would call the ->css_attach callback even for controllers which don't see actual migration to a different css. As net_cls is always disabled but still mounted on cgroup v2, whenever a process is migrated on the cgroup v2 hierarchy, net_cls sees identity migration from root to root and cgroup core used to call ->css_attach callback for those. The net_cls ->css_attach ends up calling update_classid() on the root net_cls css to which all processes on the system belong to as the controller isn't used. This makes any cgroup v2 migration O(total_number_of_fds_on_the_system) which is horrible and easily leads to noticeable stalls triggering RCU stall warnings and so on. The worst symptom is already fixed in upstream by bfc2cf6f61fc ("cgroup: call subsys->*attach() only for subsystems which are actually affected by migration"); however, backporting that commit is too invasive and we want to avoid other cases too. This patch updates net_cls's cgrp_attach() to iterate fds of only the processes which are actually getting migrated. This removes the surprising migration cost which is dependent on the total number of fds in the target cgroup. As this leaves write_classid() the only user of update_classid(), open-code the helper into write_classid(). Reported-by: David Goode Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid") Cc: stable@vger.kernel.org # v4.4+ Cc: Nina Schiff Cc: David S. Miller Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- net/core/netclassid_cgroup.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6ae56037bb13..029a61ac6cdd 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -71,27 +71,17 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) return 0; } -static void update_classid(struct cgroup_subsys_state *css, void *v) -{ - struct css_task_iter it; - struct task_struct *p; - - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, v); - task_unlock(p); - } - css_task_iter_end(&it); -} - static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; + struct task_struct *p; - cgroup_taskset_first(tset, &css); - update_classid(css, - (void *)(unsigned long)css_cls_state(css)->classid); + cgroup_taskset_for_each(p, css, tset) { + task_lock(p); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)css_cls_state(css)->classid); + task_unlock(p); + } } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -103,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); + struct css_task_iter it; + struct task_struct *p; cgroup_sk_alloc_disable(); cs->classid = (u32)value; - update_classid(css, (void *)(unsigned long)cs->classid); + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { + task_lock(p); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)cs->classid); + task_unlock(p); + } + css_task_iter_end(&it); + return 0; } From 210c4f70b4c630b27f0840c8043c138c955edc9e Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 20 Mar 2017 16:13:44 +0800 Subject: [PATCH 56/85] r8152: set the RMS of RTL8153 according to the mtu Set the received maximum size (RMS) according to the mtu size. It is unnecessary to receive a packet which is more than the size we could transmit. Besides, this could let the rx buffer be used effectively. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index bb3eedd07fbe..525c25817013 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2899,7 +2899,8 @@ static void r8153_first_init(struct r8152 *tp) rtl_rx_vlan_en(tp, tp->netdev->features & NETIF_F_HW_VLAN_CTAG_RX); - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, RTL8153_RMS); + ocp_data = tp->netdev->mtu + VLAN_ETH_HLEN + CRC_SIZE; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, ocp_data); ocp_write_byte(tp, MCU_TYPE_PLA, PLA_MTPS, MTPS_JUMBO); ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_TCR0); @@ -2951,7 +2952,8 @@ static void r8153_enter_oob(struct r8152 *tp) usleep_range(1000, 2000); } - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, RTL8153_RMS); + ocp_data = tp->netdev->mtu + VLAN_ETH_HLEN + CRC_SIZE; + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, ocp_data); ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_TEREDO_CFG); ocp_data &= ~TEREDO_WAKE_MASK; @@ -4201,8 +4203,14 @@ static int rtl8152_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; - if (netif_running(dev) && netif_carrier_ok(dev)) - r8153_set_rx_early_size(tp); + if (netif_running(dev)) { + u32 rms = new_mtu + VLAN_ETH_HLEN + CRC_SIZE; + + ocp_write_word(tp, MCU_TYPE_PLA, PLA_RMS, rms); + + if (netif_carrier_ok(dev)) + r8153_set_rx_early_size(tp); + } mutex_unlock(&tp->control); From b20cb60e2b865638459e6ec82ad3536d3734e555 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 20 Mar 2017 16:13:45 +0800 Subject: [PATCH 57/85] r8152: fix the rx early size of RTL8153 revert commit a59e6d815226 ("r8152: correct the rx early size") and fix the rx early size as (rx buffer size - rx packet size - rx desc size - alignment) / 4 Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 525c25817013..0b1b9188625d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -32,7 +32,7 @@ #define NETNEXT_VERSION "08" /* Information for net */ -#define NET_VERSION "8" +#define NET_VERSION "9" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " @@ -501,6 +501,8 @@ enum rtl_register_content { #define RTL8153_RMS RTL8153_MAX_PACKET #define RTL8152_TX_TIMEOUT (5 * HZ) #define RTL8152_NAPI_WEIGHT 64 +#define rx_reserved_size(x) ((x) + VLAN_ETH_HLEN + CRC_SIZE + \ + sizeof(struct rx_desc) + RX_ALIGN) /* rtl8152 flags */ enum rtl8152_flags { @@ -2253,8 +2255,7 @@ static void r8153_set_rx_early_timeout(struct r8152 *tp) static void r8153_set_rx_early_size(struct r8152 *tp) { - u32 mtu = tp->netdev->mtu; - u32 ocp_data = (agg_buf_sz - mtu - VLAN_ETH_HLEN - VLAN_HLEN) / 8; + u32 ocp_data = (agg_buf_sz - rx_reserved_size(tp->netdev->mtu)) / 4; ocp_write_word(tp, MCU_TYPE_USB, USB_RX_EARLY_SIZE, ocp_data); } From be9ca0d33c850192198c22518eeb1f41401268e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 09:52:50 +0100 Subject: [PATCH 58/85] cpsw/netcp: work around reverse cpts dependency The dependency is reversed: cpsw and netcp call into cpts, but cpts depends on the other two in Kconfig. This can lead to cpts being a loadable module and its callers built-in: drivers/net/ethernet/ti/cpsw.o: In function `cpsw_remove': cpsw.c:(.text.cpsw_remove+0xd0): undefined reference to `cpts_release' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_rx_handler': cpsw.c:(.text.cpsw_rx_handler+0x2dc): undefined reference to `cpts_rx_timestamp' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_tx_handler': cpsw.c:(.text.cpsw_tx_handler+0x7c): undefined reference to `cpts_tx_timestamp' drivers/net/ethernet/ti/cpsw.o: In function `cpsw_ndo_stop': As a workaround, I'm introducing another Kconfig symbol to control the compilation of cpts, while making the actual module controlled by a silent symbol that is =y when necessary. Fixes: 6246168b4a38 ("net: ethernet: ti: netcp: add support of cpts") Signed-off-by: Arnd Bergmann Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/Kconfig | 8 +++++++- drivers/net/ethernet/ti/Makefile | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 296c8efd0038..d923890a9fda 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -74,7 +74,7 @@ config TI_CPSW will be called cpsw. config TI_CPTS - tristate "TI Common Platform Time Sync (CPTS) Support" + bool "TI Common Platform Time Sync (CPTS) Support" depends on TI_CPSW || TI_KEYSTONE_NETCP imply PTP_1588_CLOCK ---help--- @@ -83,6 +83,12 @@ config TI_CPTS The unit can time stamp PTP UDP/IPv4 and Layer 2 packets, and the driver offers a PTP Hardware Clock. +config TI_CPTS_MOD + tristate + depends on TI_CPTS + default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y + default m + config TI_KEYSTONE_NETCP tristate "TI Keystone NETCP Core Support" select TI_CPSW_ALE diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile index 1e7c10bf8713..10e6b0ce51ba 100644 --- a/drivers/net/ethernet/ti/Makefile +++ b/drivers/net/ethernet/ti/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TI_DAVINCI_MDIO) += davinci_mdio.o obj-$(CONFIG_TI_DAVINCI_CPDMA) += davinci_cpdma.o obj-$(CONFIG_TI_CPSW_PHY_SEL) += cpsw-phy-sel.o obj-$(CONFIG_TI_CPSW_ALE) += cpsw_ale.o -obj-$(CONFIG_TI_CPTS) += cpts.o +obj-$(CONFIG_TI_CPTS_MOD) += cpts.o obj-$(CONFIG_TI_CPSW) += ti_cpsw.o ti_cpsw-y := cpsw.o From 07fef3623407444e51c12ea57cd91df38c1069e0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 09:58:33 +0100 Subject: [PATCH 59/85] cpsw/netcp: cpts depends on posix_timers With posix timers having become optional, we get a build error with the cpts time sync option of the CPSW driver: drivers/net/ethernet/ti/cpts.c: In function 'cpts_find_ts': drivers/net/ethernet/ti/cpts.c:291:23: error: implicit declaration of function 'ptp_classify_raw';did you mean 'ptp_classifier_init'? [-Werror=implicit-function-declaration] This adds a hard dependency on PTP_CLOCK to avoid the problem, as building it without PTP support makes no sense anyway. Fixes: baa73d9e478f ("posix-timers: Make them configurable") Cc: Nicolas Pitre Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Acked-by: Nicolas Pitre Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index d923890a9fda..9e631952b86f 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -76,7 +76,7 @@ config TI_CPSW config TI_CPTS bool "TI Common Platform Time Sync (CPTS) Support" depends on TI_CPSW || TI_KEYSTONE_NETCP - imply PTP_1588_CLOCK + depends on PTP_1588_CLOCK ---help--- This driver supports the Common Platform Time Sync unit of the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem. From 1511949c61ec63e4b646c34d602ac6990b38ce30 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 17:46:27 +0800 Subject: [PATCH 60/85] sctp: declare struct sctp_stream before using it sctp_stream_free uses struct sctp_stream as a param, but struct sctp_stream is defined after it's declaration. This patch is to declare struct sctp_stream before sctp_stream_free. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4f645198e9bd..592decebac75 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -83,6 +83,7 @@ struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; +struct sctp_stream; #include From 581947787eaf1ad801959d00b42b9d0131aacb6a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 18:00:28 +0800 Subject: [PATCH 61/85] sctp: remove useless err from sctp_association_init This patch is to remove the unnecessary temporary variable 'err' from sctp_association_init. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/associola.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 2a6835b4562b..0439a1a68367 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -71,9 +71,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a { struct net *net = sock_net(sk); struct sctp_sock *sp; - int i; sctp_paramhdr_t *p; - int err; + int i; /* Retrieve the SCTP per socket area. */ sp = sctp_sk((struct sock *)sk); @@ -264,8 +263,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* AUTH related initializations */ INIT_LIST_HEAD(&asoc->endpoint_shared_keys); - err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp); - if (err) + if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp)) goto fail_init; asoc->active_key_id = ep->active_key_id; From 557d054c01da0337ca81de9e9d9206d57245b57e Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 21 Mar 2017 10:47:49 +0100 Subject: [PATCH 62/85] tipc: fix nametbl deadlock at tipc_nametbl_unsubscribe Until now, tipc_nametbl_unsubscribe() is called at subscriptions reference count cleanup. Usually the subscriptions cleanup is called at subscription timeout or at subscription cancel or at subscriber delete. We have ignored the possibility of this being called from other locations, which causes deadlock as we try to grab the tn->nametbl_lock while holding it already. CPU1: CPU2: ---------- ---------------- tipc_nametbl_publish spin_lock_bh(&tn->nametbl_lock) tipc_nametbl_insert_publ tipc_nameseq_insert_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <> CPU1: CPU2: ---------- ---------------- tipc_nametbl_stop spin_lock_bh(&tn->nametbl_lock) tipc_purge_publications tipc_nameseq_remove_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <> In this commit, we advance the calling of tipc_nametbl_unsubscribe() from the refcount cleanup to the intended callers. Fixes: d094c4d5f5c7 ("tipc: add subscription refcount to avoid invalid delete") Reported-by: John Thompson Acked-by: Jon Maloy Signed-off-by: Ying Xue Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/subscr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 9d94e65d0894..271cd66e4b3b 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -141,6 +141,11 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; + struct tipc_subscriber *subscriber = sub->subscriber; + + spin_lock_bh(&subscriber->lock); + tipc_nametbl_unsubscribe(sub); + spin_unlock_bh(&subscriber->lock); /* Notify subscriber of timeout */ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, @@ -173,7 +178,6 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_subscriber *subscriber = sub->subscriber; spin_lock_bh(&subscriber->lock); - tipc_nametbl_unsubscribe(sub); list_del(&sub->subscrp_list); atomic_dec(&tn->subscription_count); spin_unlock_bh(&subscriber->lock); @@ -205,6 +209,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) continue; + tipc_nametbl_unsubscribe(sub); tipc_subscrp_get(sub); spin_unlock_bh(&subscriber->lock); tipc_subscrp_delete(sub); From 1f30a86c58093046dc3e49c23d2618894e098f7a Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:12 +0200 Subject: [PATCH 63/85] net/mlx5: Add missing entries for set/query rate limit commands The switch cases for the rate limit set and query commands were missing, which could get us wrong under fw error or driver reset flow, fix that. Fixes: 1466cc5b23d1 ('net/mlx5: Rate limit tables support') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index caa837e5e2b9..a380353a78c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -361,6 +361,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_QUERY_VPORT_COUNTER: case MLX5_CMD_OP_ALLOC_Q_COUNTER: case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_SET_RATE_LIMIT: + case MLX5_CMD_OP_QUERY_RATE_LIMIT: case MLX5_CMD_OP_ALLOC_PD: case MLX5_CMD_OP_ALLOC_UAR: case MLX5_CMD_OP_CONFIG_INT_MODERATION: @@ -497,6 +499,8 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); + MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); MLX5_COMMAND_STR_CASE(ALLOC_PD); MLX5_COMMAND_STR_CASE(DEALLOC_PD); MLX5_COMMAND_STR_CASE(ALLOC_UAR); From d85cdccbb3fe9a632ec9d0f4e4526c8c84fc3523 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:13 +0200 Subject: [PATCH 64/85] net/mlx5e: Change the TC offload rule add/del code path to be per NIC or E-Switch Refactor the code to deal with add/del TC rules to have handler per NIC/E-switch offloading use case, and push the latter into the e-switch code. This provides better separation and is to be used in down-stream patch for applying a fix. Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 59 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/eswitch.h | 5 ++ .../mellanox/mlx5/core/eswitch_offloads.c | 14 +++++ 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 79481f4cf264..2825b5665456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -133,6 +133,23 @@ err_create_ft: return rule; } +static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_fc *counter = NULL; + + if (!IS_ERR(flow->rule)) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_del_flow_rules(flow->rule); + mlx5_fc_destroy(priv->mdev, counter); + } + + if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { + mlx5_destroy_flow_table(priv->fs.tc.t); + priv->fs.tc.t = NULL; + } +} + static struct mlx5_flow_handle * mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, @@ -149,7 +166,24 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, } static void mlx5e_detach_encap(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow) { + struct mlx5e_tc_flow *flow); + +static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->attr); + + mlx5_eswitch_del_vlan_action(esw, flow->attr); + + if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) + mlx5e_detach_encap(priv, flow); +} + +static void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ struct list_head *next = flow->encap.next; list_del(&flow->encap); @@ -173,25 +207,10 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - struct mlx5_fc *counter = NULL; - - if (!IS_ERR(flow->rule)) { - counter = mlx5_flow_rule_counter(flow->rule); - mlx5_del_flow_rules(flow->rule); - mlx5_fc_destroy(priv->mdev, counter); - } - - if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { - mlx5_eswitch_del_vlan_action(esw, flow->attr); - if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) - mlx5e_detach_encap(priv, flow); - } - - if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { - mlx5_destroy_flow_table(priv->fs.tc.t); - priv->fs.tc.t = NULL; - } + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + mlx5e_tc_del_fdb_flow(priv, flow); + else + mlx5e_tc_del_nic_flow(priv, flow); } static void parse_vxlan_attr(struct mlx5_flow_spec *spec, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 5b78883d5654..9227a83a97e3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -271,6 +271,11 @@ struct mlx5_flow_handle * mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, struct mlx5_flow_spec *spec, struct mlx5_esw_flow_attr *attr); +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr); + struct mlx5_flow_handle * mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 4f5b0d47d5f3..bfabefe20ac0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -97,6 +97,20 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, return rule; } +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_fc *counter = NULL; + + if (!IS_ERR(rule)) { + counter = mlx5_flow_rule_counter(rule); + mlx5_del_flow_rules(rule); + mlx5_fc_destroy(esw->dev, counter); + } +} + static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) { struct mlx5_eswitch_rep *rep; From 375f51e2b5b7b9a42b3139aea519cbb1bfc5d6ef Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 21 Mar 2017 15:59:14 +0200 Subject: [PATCH 65/85] net/mlx5: E-Switch, Don't allow changing inline mode when flows are configured Changing the eswitch inline mode can potentially cause already configured flows not to match the policy. E.g. set policy L4, add some L4 rules, set policy to L2 --> bad! Hence we disallow it. Keep track of how many offloaded rules are now set and refuse inline mode changes if this isn't zero. Fixes: bffaa916588e ("net/mlx5: E-Switch, Add control for inline mode") Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 + .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 9227a83a97e3..ad329b1680b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -209,6 +209,7 @@ struct mlx5_esw_offload { struct mlx5_eswitch_rep *vport_reps; DECLARE_HASHTABLE(encap_tbl, 8); u8 inline_mode; + u64 num_flows; }; struct mlx5_eswitch { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index bfabefe20ac0..307ec6c5fd3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -93,6 +93,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, spec, &flow_act, dest, i); if (IS_ERR(rule)) mlx5_fc_destroy(esw->dev, counter); + else + esw->offloads.num_flows++; return rule; } @@ -108,6 +110,7 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, counter = mlx5_flow_rule_counter(rule); mlx5_del_flow_rules(rule); mlx5_fc_destroy(esw->dev, counter); + esw->offloads.num_flows--; } } @@ -922,6 +925,11 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode) MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) return -EOPNOTSUPP; + if (esw->offloads.num_flows > 0) { + esw_warn(dev, "Can't set inline mode when flows are configured\n"); + return -EOPNOTSUPP; + } + err = esw_inline_mode_from_devlink(mode, &mlx5_mode); if (err) goto out; From 09c91ddf2cd33489c2c14edfef43ae38d412888e Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 21 Mar 2017 15:59:15 +0200 Subject: [PATCH 66/85] net/mlx5e: Use the proper UAPI values when offloading TC vlan actions Currently we use the non UAPI values and we miss erring on the modify action which is not supported, fix that. Fixes: 8b32580df1cb ('net/mlx5e: Add TC vlan action for SRIOV offloads') Signed-off-by: Or Gerlitz Reported-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 2825b5665456..9c13abaf3885 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1131,14 +1131,16 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, } if (is_tcf_vlan(a)) { - if (tcf_vlan_action(a) == VLAN_F_POP) { + if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) { attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; - } else if (tcf_vlan_action(a) == VLAN_F_PUSH) { + } else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) { if (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q)) return -EOPNOTSUPP; attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; attr->vlan = tcf_vlan_push_vid(a); + } else { /* action is TCA_VLAN_ACT_MODIFY */ + return -EOPNOTSUPP; } continue; } From 1ad9a00ae0efc2e9337148d6c382fad3d27bf99a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 21 Mar 2017 15:59:16 +0200 Subject: [PATCH 67/85] net/mlx5e: Avoid supporting udp tunnel port ndo for VF reps This was added to allow the TC offloading code to identify offloading encap/decap vxlan rules. The VF reps are effectively related to the same mlx5 PCI device as the PF. Since the kernel invokes the (say) delete ndo for each netdev, the FW erred on multiple vxlan dst port deletes when the port was deleted from the system. We fix that by keeping the registration to be carried out only by the PF. Since the PF serves as the uplink device, the VF reps will look up a port there and realize if they are ok to offload that. Tested: ip link add vxlan1 type vxlan id 44 dev ens5f0 dstport 9999 ip link set vxlan1 up ip link del dev vxlan1 Fixes: 4a25730eb202 ('net/mlx5e: Add ndo_udp_tunnel_add to VF representors') Signed-off-by: Paul Blakey Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 ---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 -- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++++++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index f6a6ded204f6..dc52053128bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -928,10 +928,6 @@ void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv); int mlx5e_attach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); void mlx5e_detach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev); u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); -void mlx5e_add_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti); -void mlx5e_del_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti); int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, void *sp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 8ef64c4db2c2..66c133757a5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3100,8 +3100,8 @@ static int mlx5e_get_vf_stats(struct net_device *dev, vf_stats); } -void mlx5e_add_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti) +static void mlx5e_add_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -3114,8 +3114,8 @@ void mlx5e_add_vxlan_port(struct net_device *netdev, mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1); } -void mlx5e_del_vxlan_port(struct net_device *netdev, - struct udp_tunnel_info *ti) +static void mlx5e_del_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 2c864574a9d5..f621373bd7a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -393,8 +393,6 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = { .ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, .ndo_setup_tc = mlx5e_rep_ndo_setup_tc, .ndo_get_stats64 = mlx5e_rep_get_stats, - .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, - .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, .ndo_has_offload_stats = mlx5e_has_offload_stats, .ndo_get_offload_stats = mlx5e_get_offload_stats, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9c13abaf3885..fade7233dac5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -267,12 +267,15 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, f->mask); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw); + struct mlx5e_priv *up_priv = netdev_priv(up_dev); /* Full udp dst port must be given */ if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) goto vxlan_match_offload_err; - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->dst)) && + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); else { @@ -995,6 +998,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr *attr) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw); + struct mlx5e_priv *up_priv = netdev_priv(up_dev); unsigned short family = ip_tunnel_info_af(tun_info); struct ip_tunnel_key *key = &tun_info->key; struct mlx5_encap_entry *e; @@ -1015,7 +1020,7 @@ vxlan_encap_offload_err: return -EOPNOTSUPP; } - if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { From 5f40b4ed975c26016cf41953b7510fe90718e21c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 21 Mar 2017 15:59:17 +0200 Subject: [PATCH 68/85] net/mlx5: Increase number of max QPs in default profile With ConnectX-4 sharing SRQs from the same space as QPs, we hit a limit preventing some applications to allocate needed QPs amount. Double the size to 256K. Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e2bd600d19de..60154a175bd3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -87,7 +87,7 @@ static struct mlx5_profile profile[] = { [2] = { .mask = MLX5_PROF_MASK_QP_SIZE | MLX5_PROF_MASK_MR_CACHE, - .log_max_qp = 17, + .log_max_qp = 18, .mr_cache[0] = { .size = 500, .limit = 250 From d3a4e4da54c7adb420d5f48e89be913b14bdeff1 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 21 Mar 2017 15:59:18 +0200 Subject: [PATCH 69/85] net/mlx5e: Count GSO packets correctly TX packets statistics ('tx_packets' counter) used to count GSO packets as one, even though it contains multiple segments. This patch will increment the counter by the number of segments, and align the driver with the behavior of other drivers in the stack. Note that no information is lost in this patch due to 'tx_tso_packets' counter existence. Before, ethtool showed: $ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" tx_packets: 61340 tx_tso_packets: 60954 tx_packets_phy: 2451115 Now, we will see the more logical statistics: $ ethtool -S ens6 | egrep "tx_packets|tx_tso_packets" tx_packets: 2451115 tx_tso_packets: 60954 tx_packets_phy: 2451115 Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index f193128bac4b..57f5e2d7ebd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -274,15 +274,18 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) sq->stats.tso_bytes += skb->len - ihs; } + sq->stats.packets += skb_shinfo(skb)->gso_segs; num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; } else { bf = sq->bf_budget && !skb->xmit_more && !skb_shinfo(skb)->nr_frags; ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); + sq->stats.packets++; num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } + sq->stats.bytes += num_bytes; wi->num_bytes = num_bytes; ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; @@ -381,8 +384,6 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) if (bf) sq->bf_budget--; - sq->stats.packets++; - sq->stats.bytes += num_bytes; return NETDEV_TX_OK; dma_unmap_wqe_err: From 8ab7e2ae15d84ba758b2c8c6f4075722e9bd2a08 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 21 Mar 2017 15:59:19 +0200 Subject: [PATCH 70/85] net/mlx5e: Count LRO packets correctly RX packets statistics ('rx_packets' counter) used to count LRO packets as one, even though it contains multiple segments. This patch will increment the counter by the number of segments, and align the driver with the behavior of other drivers in the stack. Note that no information is lost in this patch due to 'rx_lro_packets' counter existence. Before, ethtool showed: $ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" rx_packets: 435277 rx_lro_packets: 35847 rx_packets_phy: 1935066 Now, we will see the more logical statistics: $ ethtool -S ens6 | egrep "rx_packets|rx_lro_packets" rx_packets: 1935066 rx_lro_packets: 35847 rx_packets_phy: 1935066 Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3d371688fbbb..bafcb349a50c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -601,6 +601,10 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, if (lro_num_seg > 1) { mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); + /* Subtract one since we already counted this as one + * "regular" packet in mlx5e_complete_rx_cqe() + */ + rq->stats.packets += lro_num_seg - 1; rq->stats.lro_packets++; rq->stats.lro_bytes += cqe_bcnt; } From ac23d3cac1f339febd95c403a245ae072dfd0e84 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Mar 2017 11:44:25 -0400 Subject: [PATCH 71/85] fjes: Do not load fjes driver if system does not have extended socket device. The fjes driver is used only by FUJITSU servers and almost of all servers in the world never use it. But currently if ACPI PNP0C02 is defined in the ACPI table, the following message is always shown: "FUJITSU Extended Socket Network Device Driver - version 1.2 - Copyright (c) 2015 FUJITSU LIMITED" The message makes users confused because there is no reason that the message is shown in other vendor servers. To avoid the confusion, the patch adds a check that the server has a extended socket device or not. Signed-off-by: Yasuaki Ishimatsu CC: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_main.c | 52 ++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index c4b3c4b77a9c..7b589649ab46 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -45,6 +45,8 @@ MODULE_DESCRIPTION("FUJITSU Extended Socket Network Device Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); +#define ACPI_MOTHERBOARD_RESOURCE_HID "PNP0C02" + static int fjes_request_irq(struct fjes_adapter *); static void fjes_free_irq(struct fjes_adapter *); @@ -78,7 +80,7 @@ static void fjes_rx_irq(struct fjes_adapter *, int); static int fjes_poll(struct napi_struct *, int); static const struct acpi_device_id fjes_acpi_ids[] = { - {"PNP0C02", 0}, + {ACPI_MOTHERBOARD_RESOURCE_HID, 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, fjes_acpi_ids); @@ -115,18 +117,17 @@ static struct resource fjes_resource[] = { }, }; -static int fjes_acpi_add(struct acpi_device *device) +static bool is_extended_socket_device(struct acpi_device *device) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL}; char str_buf[sizeof(FJES_ACPI_SYMBOL) + 1]; - struct platform_device *plat_dev; union acpi_object *str; acpi_status status; int result; status = acpi_evaluate_object(device->handle, "_STR", NULL, &buffer); if (ACPI_FAILURE(status)) - return -ENODEV; + return false; str = buffer.pointer; result = utf16s_to_utf8s((wchar_t *)str->string.pointer, @@ -136,10 +137,21 @@ static int fjes_acpi_add(struct acpi_device *device) if (strncmp(FJES_ACPI_SYMBOL, str_buf, strlen(FJES_ACPI_SYMBOL)) != 0) { kfree(buffer.pointer); - return -ENODEV; + return false; } kfree(buffer.pointer); + return true; +} + +static int fjes_acpi_add(struct acpi_device *device) +{ + struct platform_device *plat_dev; + acpi_status status; + + if (!is_extended_socket_device(device)) + return -ENODEV; + status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, fjes_get_acpi_resource, fjes_resource); if (ACPI_FAILURE(status)) @@ -1473,11 +1485,41 @@ static void fjes_watch_unshare_task(struct work_struct *work) } } +static acpi_status +acpi_find_extended_socket_device(acpi_handle obj_handle, u32 level, + void *context, void **return_value) +{ + struct acpi_device *device; + bool *found = context; + int result; + + result = acpi_bus_get_device(obj_handle, &device); + if (result) + return AE_OK; + + if (strcmp(acpi_device_hid(device), ACPI_MOTHERBOARD_RESOURCE_HID)) + return AE_OK; + + if (!is_extended_socket_device(device)) + return AE_OK; + + *found = true; + return AE_CTRL_TERMINATE; +} + /* fjes_init_module - Driver Registration Routine */ static int __init fjes_init_module(void) { + bool found = false; int result; + acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, + acpi_find_extended_socket_device, NULL, &found, + NULL); + + if (!found) + return -ENODEV; + pr_info("%s - version %s - %s\n", fjes_driver_string, fjes_driver_version, fjes_copyright); From 2b396d302650f1ebb770ed758ddcf5a64328ffd5 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Mar 2017 11:46:35 -0400 Subject: [PATCH 72/85] fjes: Do not load fjes driver if extended socket device is not power on. The extended device socket cannot turn on/off while system is running. So when system boots up and the device is not power on, the fjes driver does not need be loaded. To check the status of the device, the patch adds ACPI _STA method check. Signed-off-by: Yasuaki Ishimatsu CC: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index 7b589649ab46..ae48c809bac9 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -144,6 +144,24 @@ static bool is_extended_socket_device(struct acpi_device *device) return true; } +static int acpi_check_extended_socket_status(struct acpi_device *device) +{ + unsigned long long sta; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_STA", NULL, &sta); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!((sta & ACPI_STA_DEVICE_PRESENT) && + (sta & ACPI_STA_DEVICE_ENABLED) && + (sta & ACPI_STA_DEVICE_UI) && + (sta & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + static int fjes_acpi_add(struct acpi_device *device) { struct platform_device *plat_dev; @@ -152,6 +170,9 @@ static int fjes_acpi_add(struct acpi_device *device) if (!is_extended_socket_device(device)) return -ENODEV; + if (acpi_check_extended_socket_status(device)) + return -ENODEV; + status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, fjes_get_acpi_resource, fjes_resource); if (ACPI_FAILURE(status)) @@ -1503,6 +1524,9 @@ acpi_find_extended_socket_device(acpi_handle obj_handle, u32 level, if (!is_extended_socket_device(device)) return AE_OK; + if (acpi_check_extended_socket_status(device)) + return AE_OK; + *found = true; return AE_CTRL_TERMINATE; } From d515684d78148884d5fc425ba904c50f03844020 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 21 Mar 2017 17:14:27 +0100 Subject: [PATCH 73/85] ipv6: make sure to initialize sockc.tsflags before first use In the case udp_sk(sk)->pending is AF_INET6, udpv6_sendmsg() would jump to do_append_data, skipping the initialization of sockc.tsflags. Fix the problem by moving sockc.tsflags initialization earlier. The bug was detected with KMSAN. Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Signed-off-by: Alexander Potapenko Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4e4c401e3bc6..e28082f0a307 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1035,6 +1035,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.hlimit = -1; ipc6.tclass = -1; ipc6.dontfrag = -1; + sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1159,7 +1160,6 @@ do_udp_sendmsg: fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; From 31739eae738ccbe8b9d627c3f2251017ca03f4d2 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Tue, 21 Mar 2017 14:01:06 -0700 Subject: [PATCH 74/85] net: bcmgenet: remove bcmgenet_internal_phy_setup() Commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") removed the bcmgenet_mii_reset() function from bcmgenet_power_up() and bcmgenet_internal_phy_setup() functions. In so doing it broke the reset of the internal PHY devices used by the GENETv1-GENETv3 which required this reset before the UniMAC was enabled. It also broke the internal GPHY devices used by the GENETv4 because the config_init that installed the AFE workaround was no longer occurring after the reset of the GPHY performed by bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). In addition the code in bcmgenet_internal_phy_setup() related to the "enable APD" comment goes with the bcmgenet_mii_reset() so it should have also been removed. Commit bd4060a6108b ("net: bcmgenet: Power on integrated GPHY in bcmgenet_power_up()") moved the bcmgenet_phy_power_set() call to the bcmgenet_power_up() function, but failed to remove it from the bcmgenet_internal_phy_setup() function. Had it done so, the bcmgenet_internal_phy_setup() function would have been empty and could have been removed at that time. Commit 5dbebbb44a6a ("net: bcmgenet: Software reset EPHY after power on") was submitted to correct the functional problems introduced by commit 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset"). It was included in v4.4 and made available on 4.3-stable. Unfortunately, it didn't fully revert the commit because this bcmgenet_mii_reset() doesn't apply the soft reset to the internal GPHY used by GENETv4 like the previous one did. This prevents the restoration of the AFE work- arounds for internal GPHY devices after the bcmgenet_phy_power_set() in bcmgenet_internal_phy_setup(). This commit takes the alternate approach of removing the unnecessary bcmgenet_internal_phy_setup() function which shouldn't have been in v4.3 so that when bcmgenet_mii_reset() was restored it should have only gone into bcmgenet_power_up(). This will avoid the problems while also removing the redundancy (and hopefully some of the confusion). Fixes: 6ac3ce8295e6 ("net: bcmgenet: Remove excessive PHY reset") Signed-off-by: Doug Berger Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index e87607621e62..2f9281936f0e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -220,20 +220,6 @@ void bcmgenet_phy_power_set(struct net_device *dev, bool enable) udelay(60); } -static void bcmgenet_internal_phy_setup(struct net_device *dev) -{ - struct bcmgenet_priv *priv = netdev_priv(dev); - u32 reg; - - /* Power up PHY */ - bcmgenet_phy_power_set(dev, true); - /* enable APD */ - reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT); - reg |= EXT_PWR_DN_EN_LD; - bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT); - bcmgenet_mii_reset(dev); -} - static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv) { u32 reg; @@ -281,7 +267,6 @@ int bcmgenet_mii_config(struct net_device *dev) if (priv->internal_phy) { phy_name = "internal PHY"; - bcmgenet_internal_phy_setup(dev); } else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) { phy_name = "MoCA"; bcmgenet_moca_phy_setup(priv); From dd1ef79120e1600cb48320cf80a612ee6510110c Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Tue, 21 Mar 2017 15:07:48 -0700 Subject: [PATCH 75/85] enic: update enic maintainers update enic maintainers Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 078c38217daa..c45c02bc6082 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3216,7 +3216,6 @@ F: drivers/platform/chrome/ CISCO VIC ETHERNET NIC DRIVER M: Christian Benvenuti -M: Sujith Sankar M: Govindarajulu Varadarajan <_govind@gmx.com> M: Neel Patel S: Supported From 8c290e60fa2a51806159522331c9ed41252a8fb3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 Mar 2017 19:05:04 -0700 Subject: [PATCH 76/85] bpf: fix hashmap extra_elems logic In both kmalloc and prealloc mode the bpf_map_update_elem() is using per-cpu extra_elems to do atomic update when the map is full. There are two issues with it. The logic can be misused, since it allows max_entries+num_cpus elements to be present in the map. And alloc_extra_elems() at map creation time can fail percpu alloc for large map values with a warn: WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60 illegal size (32824) or align (8) for percpu allocation The fixes for both of these issues are different for kmalloc and prealloc modes. For prealloc mode allocate extra num_possible_cpus elements and store their pointers into extra_elems array instead of actual elements. Hence we can use these hidden(spare) elements not only when the map is full but during bpf_map_update_elem() that replaces existing element too. That also improves performance, since pcpu_freelist_pop/push is avoided. Unfortunately this approach cannot be used for kmalloc mode which needs to kfree elements after rcu grace period. Therefore switch it back to normal kmalloc even when full and old element exists like it was prior to commit 6c9059817432 ("bpf: pre-allocate hash map elements"). Add tests to check for over max_entries and large map values. Reported-by: Dave Jones Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 146 ++++++++++++------------ tools/testing/selftests/bpf/test_maps.c | 29 ++++- 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index afe5bab376c9..361a69dfe543 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -30,18 +30,12 @@ struct bpf_htab { struct pcpu_freelist freelist; struct bpf_lru lru; }; - void __percpu *extra_elems; + struct htab_elem *__percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; -enum extra_elem_state { - HTAB_NOT_AN_EXTRA_ELEM = 0, - HTAB_EXTRA_ELEM_FREE, - HTAB_EXTRA_ELEM_USED -}; - /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -56,7 +50,6 @@ struct htab_elem { }; union { struct rcu_head rcu; - enum extra_elem_state state; struct bpf_lru_node lru_node; }; u32 hash; @@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { + u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - htab->elems = bpf_map_area_alloc(htab->elem_size * - htab->map.max_entries); + if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + num_entries += num_possible_cpus(); + + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; - for (i = 0; i < htab->map.max_entries; i++) { + for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; @@ -166,11 +167,11 @@ skip_percpu_elems: if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); return 0; @@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab) { - void __percpu *pptr; + struct htab_elem *__percpu *pptr, *l_new; + struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { - ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = - HTAB_EXTRA_ELEM_FREE; + l = pcpu_freelist_pop(&htab->freelist); + /* pop will succeed, since prealloc_init() + * preallocated extra num_possible_cpus elements + */ + l_new = container_of(l, struct htab_elem, fnode); + *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; @@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ - err = alloc_extra_elems(htab); - if (err) - goto free_buckets; - } - if (prealloc) { err = prealloc_init(htab); if (err) - goto free_extra_elems; + goto free_buckets; + + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ + err = alloc_extra_elems(htab); + if (err) + goto free_prealloc; + } } return &htab->map; -free_extra_elems: - free_percpu(htab->extra_elems); +free_prealloc: + prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); free_htab: @@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (l->state == HTAB_EXTRA_ELEM_USED) { - l->state = HTAB_EXTRA_ELEM_FREE; - return; - } - - if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - bool old_elem_exists) + struct htab_elem *old_elem) { u32 size = htab->map.value_size; - bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); - struct htab_elem *l_new; + bool prealloc = htab_is_prealloc(htab); + struct htab_elem *l_new, **pl_new; void __percpu *pptr; - int err = 0; if (prealloc) { - struct pcpu_freelist_node *l; - - l = pcpu_freelist_pop(&htab->freelist); - if (!l) - err = -E2BIG; - else - l_new = container_of(l, struct htab_elem, fnode); - } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - atomic_dec(&htab->count); - err = -E2BIG; + if (old_elem) { + /* if we're updating the existing element, + * use per-cpu extra elems to avoid freelist_pop/push + */ + pl_new = this_cpu_ptr(htab->extra_elems); + l_new = *pl_new; + *pl_new = old_elem; } else { - l_new = kmalloc(htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); + struct pcpu_freelist_node *l; + + l = pcpu_freelist_pop(&htab->freelist); + if (!l) + return ERR_PTR(-E2BIG); + l_new = container_of(l, struct htab_elem, fnode); } - } - - if (err) { - if (!old_elem_exists) - return ERR_PTR(err); - - /* if we're updating the existing element and the hash table - * is full, use per-cpu extra elems - */ - l_new = this_cpu_ptr(htab->extra_elems); - if (l_new->state != HTAB_EXTRA_ELEM_FREE) - return ERR_PTR(-E2BIG); - l_new->state = HTAB_EXTRA_ELEM_USED; } else { - l_new->state = HTAB_NOT_AN_EXTRA_ELEM; + if (atomic_inc_return(&htab->count) > htab->map.max_entries) + if (!old_elem) { + /* when map is full and update() is replacing + * old element, it's ok to allocate, since + * old element will be freed immediately. + * Otherwise return an error + */ + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } memcpy(l_new->key, key, key_size); @@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, goto err; l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - !!l_old); + l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - free_htab_elem(htab, l_old); + if (!htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); } ret = 0; err: @@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, false); + hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { hlist_nulls_del_rcu(&l->hash_node); - if (l->state != HTAB_EXTRA_ELEM_USED) - htab_elem_free(htab, l); + htab_elem_free(htab, l); } } } @@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) + if (!htab_is_prealloc(htab)) delete_all_elements(htab); else prealloc_destroy(htab); diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index cada17ac00b8..a0aa2009b0e0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -80,8 +80,9 @@ static void test_hashmap(int task, void *data) assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0); key = 2; assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - key = 1; - assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); + key = 3; + assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && + errno == E2BIG); /* Check that key = 0 doesn't exist. */ key = 0; @@ -110,6 +111,24 @@ static void test_hashmap(int task, void *data) close(fd); } +static void test_hashmap_sizes(int task, void *data) +{ + int fd, i, j; + + for (i = 1; i <= 512; i <<= 1) + for (j = 1; j <= 1 << 18; j <<= 1) { + fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j, + 2, map_flags); + if (fd < 0) { + printf("Failed to create hashmap key=%d value=%d '%s'\n", + i, j, strerror(errno)); + exit(1); + } + close(fd); + usleep(10); /* give kernel time to destroy */ + } +} + static void test_hashmap_percpu(int task, void *data) { unsigned int nr_cpus = bpf_num_possible_cpus(); @@ -317,7 +336,10 @@ static void test_arraymap_percpu(int task, void *data) static void test_arraymap_percpu_many_keys(void) { unsigned int nr_cpus = bpf_num_possible_cpus(); - unsigned int nr_keys = 20000; + /* nr_keys is not too large otherwise the test stresses percpu + * allocator more than anything else + */ + unsigned int nr_keys = 2000; long values[nr_cpus]; int key, fd, i; @@ -419,6 +441,7 @@ static void test_map_stress(void) { run_parallel(100, test_hashmap, NULL); run_parallel(100, test_hashmap_percpu, NULL); + run_parallel(100, test_hashmap_sizes, NULL); run_parallel(100, test_arraymap, NULL); run_parallel(100, test_arraymap_percpu, NULL); From c64c0b3cac4c5b8cb093727d2c19743ea3965c0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Mar 2017 19:22:28 -0700 Subject: [PATCH 77/85] ipv4: provide stronger user input validation in nl_fib_input() Alexander reported a KMSAN splat caused by reads of uninitialized field (tb_id_in) from user provided struct fib_result_nl It turns out nl_fib_input() sanity tests on user input is a bit wrong : User can pretend nlh->nlmsg_len is big enough, but provide at sendmsg() time a too small buffer. Reported-by: Alexander Potapenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 42bfd08109dd..8f2133ffc2ff 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1083,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb) net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); - if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || + if (skb->len < nlmsg_total_size(sizeof(*frn)) || + skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; From a97e50cc4cb67e1e7bff56f6b41cda62ca832336 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 22 Mar 2017 13:08:08 +0100 Subject: [PATCH 78/85] socket, bpf: fix sk_filter use after free in sk_clone_lock In sk_clone_lock(), we create a new socket and inherit most of the parent's members via sock_copy() which memcpy()'s various sections. Now, in case the parent socket had a BPF socket filter attached, then newsk->sk_filter points to the same instance as the original sk->sk_filter. sk_filter_charge() is then called on the newsk->sk_filter to take a reference and should that fail due to hitting max optmem, we bail out and release the newsk instance. The issue is that commit 278571baca2a ("net: filter: simplify socket charging") wrongly combined the dismantle path with the failure path of xfrm_sk_clone_policy(). This means, even when charging failed, we call sk_free_unlock_clone() on the newsk, which then still points to the same sk_filter as the original sk. Thus, sk_free_unlock_clone() calls into __sk_destruct() eventually where it tests for present sk_filter and calls sk_filter_uncharge() on it, which potentially lets sk_omem_alloc wrap around and releases the eBPF prog and sk_filter structure from the (still intact) parent. Fix it by making sure that when sk_filter_charge() failed, we reset newsk->sk_filter back to NULL before passing to sk_free_unlock_clone(), so that we don't mess with the parents sk_filter. Only if xfrm_sk_clone_policy() fails, we did reach the point where either the parent's filter was NULL and as a result newsk's as well or where we previously had a successful sk_filter_charge(), thus for that case, we do need sk_filter_uncharge() to release the prior taken reference on sk_filter. Fixes: 278571baca2a ("net: filter: simplify socket charging") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index acb0d4137499..2c4f574168fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1544,6 +1544,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; From 1d2a6a5e4bf2921531071fcff8538623dce74efa Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 22 Mar 2017 16:08:33 +0100 Subject: [PATCH 79/85] genetlink: fix counting regression on ctrl_dumpfamily() Commit 2ae0f17df1cd ("genetlink: use idr to track families") replaced if (++n < fams_to_skip) continue; into: if (n++ < fams_to_skip) continue; This subtle change cause that on retry ctrl_dumpfamily() call we omit one family that failed to do ctrl_fill_info() on previous call, because cb->args[0] = n number counts also family that failed to do ctrl_fill_info(). Patch fixes the problem and avoid confusion in the future just decrease n counter when ctrl_fill_info() fail. User visible problem caused by this bug is failure to get access to some genetlink family i.e. nl80211. However problem is reproducible only if number of registered genetlink families is big enough to cause second call of ctrl_dumpfamily(). Cc: Xose Vazquez Perez Cc: Larry Finger Cc: Johannes Berg Fixes: 2ae0f17df1cd ("genetlink: use idr to track families") Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index fb6e10fdb217..92e0981f7404 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -783,8 +783,10 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - skb, CTRL_CMD_NEWFAMILY) < 0) + skb, CTRL_CMD_NEWFAMILY) < 0) { + n--; break; + } } cb->args[0] = n; From 15bb7745e94a665caf42bfaabf0ce062845b533b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Mar 2017 08:10:21 -0700 Subject: [PATCH 80/85] tcp: initialize icsk_ack.lrcvtime at session start time icsk_ack.lrcvtime has a 0 value at socket creation time. tcpi_last_data_recv can have bogus value if no payload is ever received. This patch initializes icsk_ack.lrcvtime for active sessions in tcp_finish_connect(), and for passive sessions in tcp_create_openreq_child() Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 39c393cc0fd3..c43119726a62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5541,6 +5541,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct inet_connection_sock *icsk = inet_csk(sk); tcp_set_state(sk, TCP_ESTABLISHED); + icsk->icsk_ack.lrcvtime = tcp_time_stamp; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -5759,7 +5760,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); - icsk->icsk_ack.lrcvtime = tcp_time_stamp; tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e16243cdb58..65c0f3d13eca 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -460,6 +460,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_time_stamp; newtp->packets_out = 0; newtp->retrans_out = 0; From ec4fbd64751de18729eaa816ec69e4b504b5a7a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Mar 2017 08:57:15 -0700 Subject: [PATCH 81/85] inet: frag: release spinlock before calling icmp_send() Dmitry reported a lockdep splat [1] (false positive) that we can fix by releasing the spinlock before calling icmp_send() from ip_expire() This is a false positive because sending an ICMP message can not possibly re-enter the IP frag engine. [1] [ INFO: possible circular locking dependency detected ] 4.10.0+ #29 Not tainted ------------------------------------------------------- modprobe/12392 is trying to acquire lock: (_xmit_ETHER#2){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (_xmit_ETHER#2){+.-...}, at: [] __netif_tx_lock include/linux/netdevice.h:3486 [inline] (_xmit_ETHER#2){+.-...}, at: [] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 but task is already holding lock: (&(&q->lock)->rlock){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] (&(&q->lock)->rlock){+.-...}, at: [] ip_expire+0x51/0x6c0 net/ipv4/ip_fragment.c:201 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&(&q->lock)->rlock){+.-...}: validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] ip_defrag+0x3a2/0x4130 net/ipv4/ip_fragment.c:669 ip_check_defrag+0x4e3/0x8b0 net/ipv4/ip_fragment.c:713 packet_rcv_fanout+0x282/0x800 net/packet/af_packet.c:1459 deliver_skb net/core/dev.c:1834 [inline] dev_queue_xmit_nit+0x294/0xa90 net/core/dev.c:1890 xmit_one net/core/dev.c:2903 [inline] dev_hard_start_xmit+0x16b/0xab0 net/core/dev.c:2923 sch_direct_xmit+0x31f/0x6d0 net/sched/sch_generic.c:182 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_resolve_output+0x6b9/0xb10 net/core/neighbour.c:1308 neigh_output include/net/neighbour.h:478 [inline] ip_finish_output2+0x8b8/0x15a0 net/ipv4/ip_output.c:228 ip_do_fragment+0x1d93/0x2720 net/ipv4/ip_output.c:672 ip_fragment.constprop.54+0x145/0x200 net/ipv4/ip_output.c:545 ip_finish_output+0x82d/0xe10 net/ipv4/ip_output.c:314 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 raw_sendmsg+0x26de/0x3a00 net/ipv4/raw.c:655 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:761 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x4a3/0x9f0 net/socket.c:1985 __sys_sendmmsg+0x25c/0x750 net/socket.c:2075 SYSC_sendmmsg net/socket.c:2106 [inline] SyS_sendmmsg+0x35/0x60 net/socket.c:2101 do_syscall_64+0x2e8/0x930 arch/x86/entry/common.c:281 return_from_SYSCALL_64+0x0/0x7a -> #0 (_xmit_ETHER#2){+.-...}: check_prev_add kernel/locking/lockdep.c:1830 [inline] check_prevs_add+0xa8f/0x19f0 kernel/locking/lockdep.c:1940 validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] __netif_tx_lock include/linux/netdevice.h:3486 [inline] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_hh_output include/net/neighbour.h:468 [inline] neigh_output include/net/neighbour.h:476 [inline] ip_finish_output2+0xf6c/0x15a0 net/ipv4/ip_output.c:228 ip_finish_output+0xa29/0xe10 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 icmp_push_reply+0x372/0x4d0 net/ipv4/icmp.c:394 icmp_send+0x156c/0x1c80 net/ipv4/icmp.c:754 ip_expire+0x40e/0x6c0 net/ipv4/ip_fragment.c:239 call_timer_fn+0x241/0x820 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x960/0xcf0 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x31f/0xbe7 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:657 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:962 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:707 __read_once_size include/linux/compiler.h:254 [inline] atomic_read arch/x86/include/asm/atomic.h:26 [inline] rcu_dynticks_curr_cpu_in_eqs kernel/rcu/tree.c:350 [inline] __rcu_is_watching kernel/rcu/tree.c:1133 [inline] rcu_is_watching+0x83/0x110 kernel/rcu/tree.c:1147 rcu_read_lock_held+0x87/0xc0 kernel/rcu/update.c:293 radix_tree_deref_slot include/linux/radix-tree.h:238 [inline] filemap_map_pages+0x6d4/0x1570 mm/filemap.c:2335 do_fault_around mm/memory.c:3231 [inline] do_read_fault mm/memory.c:3265 [inline] do_fault+0xbd5/0x2080 mm/memory.c:3370 handle_pte_fault mm/memory.c:3600 [inline] __handle_mm_fault+0x1062/0x2cb0 mm/memory.c:3714 handle_mm_fault+0x1e2/0x480 mm/memory.c:3751 __do_page_fault+0x4f6/0xb60 arch/x86/mm/fault.c:1397 do_page_fault+0x54/0x70 arch/x86/mm/fault.c:1460 page_fault+0x28/0x30 arch/x86/entry/entry_64.S:1011 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&q->lock)->rlock); lock(_xmit_ETHER#2); lock(&(&q->lock)->rlock); lock(_xmit_ETHER#2); *** DEADLOCK *** 10 locks held by modprobe/12392: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x2b8/0xb60 arch/x86/mm/fault.c:1336 #1: (rcu_read_lock){......}, at: [] filemap_map_pages+0x1e6/0x1570 mm/filemap.c:2324 #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] pte_alloc_one_map mm/memory.c:2944 [inline] #2: (&(ptlock_ptr(page))->rlock#2){+.+...}, at: [] alloc_set_pte+0x13b8/0x1b90 mm/memory.c:3072 #3: (((&q->timer))){+.-...}, at: [] lockdep_copy_map include/linux/lockdep.h:175 [inline] #3: (((&q->timer))){+.-...}, at: [] call_timer_fn+0x1c2/0x820 kernel/time/timer.c:1258 #4: (&(&q->lock)->rlock){+.-...}, at: [] spin_lock include/linux/spinlock.h:299 [inline] #4: (&(&q->lock)->rlock){+.-...}, at: [] ip_expire+0x51/0x6c0 net/ipv4/ip_fragment.c:201 #5: (rcu_read_lock){......}, at: [] ip_expire+0x1b3/0x6c0 net/ipv4/ip_fragment.c:216 #6: (slock-AF_INET){+.-...}, at: [] spin_trylock include/linux/spinlock.h:309 [inline] #6: (slock-AF_INET){+.-...}, at: [] icmp_xmit_lock net/ipv4/icmp.c:219 [inline] #6: (slock-AF_INET){+.-...}, at: [] icmp_send+0x803/0x1c80 net/ipv4/icmp.c:681 #7: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x2c1/0x15a0 net/ipv4/ip_output.c:198 #8: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x23e/0x1e60 net/core/dev.c:3324 #9: (dev->qdisc_running_key ?: &qdisc_running_key){+.....}, at: [] dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 stack backtrace: CPU: 0 PID: 12392 Comm: modprobe Not tainted 4.10.0+ #29 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x2ee/0x3ef lib/dump_stack.c:52 print_circular_bug+0x307/0x3b0 kernel/locking/lockdep.c:1204 check_prev_add kernel/locking/lockdep.c:1830 [inline] check_prevs_add+0xa8f/0x19f0 kernel/locking/lockdep.c:1940 validate_chain kernel/locking/lockdep.c:2267 [inline] __lock_acquire+0x2149/0x3430 kernel/locking/lockdep.c:3340 lock_acquire+0x2a1/0x630 kernel/locking/lockdep.c:3755 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:299 [inline] __netif_tx_lock include/linux/netdevice.h:3486 [inline] sch_direct_xmit+0x282/0x6d0 net/sched/sch_generic.c:180 __dev_xmit_skb net/core/dev.c:3092 [inline] __dev_queue_xmit+0x13e5/0x1e60 net/core/dev.c:3358 dev_queue_xmit+0x17/0x20 net/core/dev.c:3423 neigh_hh_output include/net/neighbour.h:468 [inline] neigh_output include/net/neighbour.h:476 [inline] ip_finish_output2+0xf6c/0x15a0 net/ipv4/ip_output.c:228 ip_finish_output+0xa29/0xe10 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:246 [inline] ip_output+0x1f0/0x7a0 net/ipv4/ip_output.c:404 dst_output include/net/dst.h:486 [inline] ip_local_out+0x95/0x170 net/ipv4/ip_output.c:124 ip_send_skb+0x3c/0xc0 net/ipv4/ip_output.c:1492 ip_push_pending_frames+0x64/0x80 net/ipv4/ip_output.c:1512 icmp_push_reply+0x372/0x4d0 net/ipv4/icmp.c:394 icmp_send+0x156c/0x1c80 net/ipv4/icmp.c:754 ip_expire+0x40e/0x6c0 net/ipv4/ip_fragment.c:239 call_timer_fn+0x241/0x820 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x960/0xcf0 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x31f/0xbe7 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:657 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:962 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:707 RIP: 0010:__read_once_size include/linux/compiler.h:254 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:rcu_dynticks_curr_cpu_in_eqs kernel/rcu/tree.c:350 [inline] RIP: 0010:__rcu_is_watching kernel/rcu/tree.c:1133 [inline] RIP: 0010:rcu_is_watching+0x83/0x110 kernel/rcu/tree.c:1147 RSP: 0000:ffff8801c391f120 EFLAGS: 00000a03 ORIG_RAX: ffffffffffffff10 RAX: dffffc0000000000 RBX: ffff8801c391f148 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000055edd4374000 RDI: ffff8801dbe1ae0c RBP: ffff8801c391f1a0 R08: 0000000000000002 R09: 0000000000000000 R10: dffffc0000000000 R11: 0000000000000002 R12: 1ffff10038723e25 R13: ffff8801dbe1ae00 R14: ffff8801c391f680 R15: dffffc0000000000 rcu_read_lock_held+0x87/0xc0 kernel/rcu/update.c:293 radix_tree_deref_slot include/linux/radix-tree.h:238 [inline] filemap_map_pages+0x6d4/0x1570 mm/filemap.c:2335 do_fault_around mm/memory.c:3231 [inline] do_read_fault mm/memory.c:3265 [inline] do_fault+0xbd5/0x2080 mm/memory.c:3370 handle_pte_fault mm/memory.c:3600 [inline] __handle_mm_fault+0x1062/0x2cb0 mm/memory.c:3714 handle_mm_fault+0x1e2/0x480 mm/memory.c:3751 __do_page_fault+0x4f6/0xb60 arch/x86/mm/fault.c:1397 do_page_fault+0x54/0x70 arch/x86/mm/fault.c:1460 page_fault+0x28/0x30 arch/x86/entry/entry_64.S:1011 RIP: 0033:0x7f83172f2786 RSP: 002b:00007fffe859ae80 EFLAGS: 00010293 RAX: 000055edd4373040 RBX: 00007f83175111c8 RCX: 000055edd4373238 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007f8317510970 RBP: 00007fffe859afd0 R08: 0000000000000009 R09: 0000000000000000 R10: 0000000000000064 R11: 0000000000000000 R12: 000055edd4373040 R13: 0000000000000000 R14: 00007fffe859afe8 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbe7f72db9c1..b3cdeec85f1f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg) qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); + rcu_read_lock(); spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg) __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *head = qp->q.fragments; + struct sk_buff *clone, *head = qp->q.fragments; const struct iphdr *iph; int err; @@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg) if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) goto out; - rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) - goto out_rcu_unlock; + goto out; + /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); if (err) - goto out_rcu_unlock; + goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (frag_expire_skip_icmp(qp->user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out_rcu_unlock; + goto out; + + clone = skb_clone(head, GFP_ATOMIC); /* Send an ICMP "Fragment Reassembly Timeout" message. */ - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); + if (clone) { + spin_unlock(&qp->q.lock); + icmp_send(clone, ICMP_TIME_EXCEEDED, + ICMP_EXC_FRAGTIME, 0); + consume_skb(clone); + goto out_rcu_unlock; + } } out: spin_unlock(&qp->q.lock); +out_rcu_unlock: + rcu_read_unlock(); ipq_put(qp); } From 6e9e6cc8f4e4f2cd67931510c9f39abf3d9e0d3b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Mar 2017 15:31:10 -0700 Subject: [PATCH 82/85] Bluetooth: btqcomsmd: fix compile-test dependency compile-testing fails when QCOM_SMD is a loadable module: drivers/bluetooth/built-in.o: In function `btqcomsmd_send': btqca.c:(.text+0xa8): undefined reference to `qcom_smd_send' drivers/bluetooth/built-in.o: In function `btqcomsmd_probe': btqca.c:(.text+0x3ec): undefined reference to `qcom_wcnss_open_channel' btqca.c:(.text+0x46c): undefined reference to `qcom_smd_set_drvdata' This clarifies the dependency to allow compile-testing only when SMD is completely disabled, otherwise the dependency on QCOM_SMD will make sure we can link against it. Fixes: e27ee2b16bad ("Bluetooth: btqcomsmd: Allow driver to build if COMPILE_TEST is enabled") Signed-off-by: Arnd Bergmann [bjorn: Restructure and clarify dependency to QCOM_WCNSS_CTRL] Signed-off-by: Bjorn Andersson Acked-by: Marcel Holtmann Signed-off-by: David S. Miller --- drivers/bluetooth/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index c2c14a12713b..08e054507d0b 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -344,7 +344,8 @@ config BT_WILINK config BT_QCOMSMD tristate "Qualcomm SMD based HCI support" - depends on (QCOM_SMD && QCOM_WCNSS_CTRL) || COMPILE_TEST + depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on QCOM_WCNSS_CTRL || (COMPILE_TEST && QCOM_WCNSS_CTRL=n) select BT_QCA help Qualcomm SMD based HCI driver. From c04ca616eed02b9abe7afd311382c3ed5eef5c40 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 Mar 2017 12:10:02 +0300 Subject: [PATCH 83/85] sfc: cleanup a condition in efx_udp_tunnel_del() Presumably if there is an "add" function, there is also a "del" function. But it causes a static checker warning because it looks like a common cut and paste bug. Signed-off-by: Dan Carpenter Acked-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 334bcc6df6b2..50d28261b6b9 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2404,7 +2404,7 @@ static void efx_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *t tnl.type = (u16)efx_tunnel_type; tnl.port = ti->port; - if (efx->type->udp_tnl_add_port) + if (efx->type->udp_tnl_del_port) (void)efx->type->udp_tnl_del_port(efx, tnl); } From f43feef4e6acde10857fcbfdede790d6b3f2c71d Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Wed, 22 Mar 2017 17:25:27 -0500 Subject: [PATCH 84/85] amd-xgbe: Fix the ECC-related bit position definitions The ECC bit positions that describe whether the ECC interrupt is for Tx, Rx or descriptor memory and whether the it is a single correctable or double detected error were defined in incorrectly (reversed order). Fix the bit position definitions for these settings so that the proper ECC handling is performed. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 24 ++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 86f1626816ff..127adbeefb10 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -984,29 +984,29 @@ #define XP_ECC_CNT1_DESC_DED_WIDTH 8 #define XP_ECC_CNT1_DESC_SEC_INDEX 0 #define XP_ECC_CNT1_DESC_SEC_WIDTH 8 -#define XP_ECC_IER_DESC_DED_INDEX 0 +#define XP_ECC_IER_DESC_DED_INDEX 5 #define XP_ECC_IER_DESC_DED_WIDTH 1 -#define XP_ECC_IER_DESC_SEC_INDEX 1 +#define XP_ECC_IER_DESC_SEC_INDEX 4 #define XP_ECC_IER_DESC_SEC_WIDTH 1 -#define XP_ECC_IER_RX_DED_INDEX 2 +#define XP_ECC_IER_RX_DED_INDEX 3 #define XP_ECC_IER_RX_DED_WIDTH 1 -#define XP_ECC_IER_RX_SEC_INDEX 3 +#define XP_ECC_IER_RX_SEC_INDEX 2 #define XP_ECC_IER_RX_SEC_WIDTH 1 -#define XP_ECC_IER_TX_DED_INDEX 4 +#define XP_ECC_IER_TX_DED_INDEX 1 #define XP_ECC_IER_TX_DED_WIDTH 1 -#define XP_ECC_IER_TX_SEC_INDEX 5 +#define XP_ECC_IER_TX_SEC_INDEX 0 #define XP_ECC_IER_TX_SEC_WIDTH 1 -#define XP_ECC_ISR_DESC_DED_INDEX 0 +#define XP_ECC_ISR_DESC_DED_INDEX 5 #define XP_ECC_ISR_DESC_DED_WIDTH 1 -#define XP_ECC_ISR_DESC_SEC_INDEX 1 +#define XP_ECC_ISR_DESC_SEC_INDEX 4 #define XP_ECC_ISR_DESC_SEC_WIDTH 1 -#define XP_ECC_ISR_RX_DED_INDEX 2 +#define XP_ECC_ISR_RX_DED_INDEX 3 #define XP_ECC_ISR_RX_DED_WIDTH 1 -#define XP_ECC_ISR_RX_SEC_INDEX 3 +#define XP_ECC_ISR_RX_SEC_INDEX 2 #define XP_ECC_ISR_RX_SEC_WIDTH 1 -#define XP_ECC_ISR_TX_DED_INDEX 4 +#define XP_ECC_ISR_TX_DED_INDEX 1 #define XP_ECC_ISR_TX_DED_WIDTH 1 -#define XP_ECC_ISR_TX_SEC_INDEX 5 +#define XP_ECC_ISR_TX_SEC_INDEX 0 #define XP_ECC_ISR_TX_SEC_WIDTH 1 #define XP_I2C_MUTEX_BUSY_INDEX 31 #define XP_I2C_MUTEX_BUSY_WIDTH 1 From 68c386590375b2aea5a3154f17882a30170707bf Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Thu, 23 Mar 2017 02:20:39 +0300 Subject: [PATCH 85/85] net:ethernet:aquantia: Fix for RX checksum offload. Since AQC-100/107/108 chips supports hardware checksums for RX we should indicate this via NETIF_F_RXCSUM flag. v1->v2: 'Signed-off-by' tag added. Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h | 1 + .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h index 1093ea18823a..0592a0330cf0 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0_internal.h @@ -137,6 +137,7 @@ static struct aq_hw_caps_s hw_atl_a0_hw_caps_ = { .tx_rings = HW_ATL_A0_TX_RINGS, .rx_rings = HW_ATL_A0_RX_RINGS, .hw_features = NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | NETIF_F_SG | NETIF_F_TSO, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h index 8bdee3ddd5a0..f3957e930340 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h @@ -188,6 +188,7 @@ static struct aq_hw_caps_s hw_atl_b0_hw_caps_ = { .tx_rings = HW_ATL_B0_TX_RINGS, .rx_rings = HW_ATL_B0_RX_RINGS, .hw_features = NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | NETIF_F_SG | NETIF_F_TSO |