From 1492623e837fe7ca6296f7f5411328307e242771 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 29 Dec 2018 17:42:22 +0100 Subject: [PATCH 01/87] octeontx2-af: Fix a resource leak in an error handling path in 'cgx_probe()' If an error occurs after the call to 'pci_alloc_irq_vectors()', we must call 'pci_free_irq_vectors()' in order to avoid a resource leak. The same sequence is already in place in the corresponding 'cgx_remove()' function. Fixes: 1463f382f58d ("octeontx2-af: Add support for CGX link management") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 742f0c1f60df..6d55e3d0b7ea 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -825,7 +825,7 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!cgx->cgx_cmd_workq) { dev_err(dev, "alloc workqueue failed for cgx cmd"); err = -ENOMEM; - goto err_release_regions; + goto err_free_irq_vectors; } list_add(&cgx->cgx_list, &cgx_list); @@ -841,6 +841,8 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_release_lmac: cgx_lmac_exit(cgx); list_del(&cgx->cgx_list); +err_free_irq_vectors: + pci_free_irq_vectors(pdev); err_release_regions: pci_release_regions(pdev); err_disable_device: From 7c1e8a3817c55d73b27cc29b84075999c8894179 Mon Sep 17 00:00:00 2001 From: Arthur Gautier Date: Mon, 31 Dec 2018 02:10:58 +0000 Subject: [PATCH 02/87] netlink: fixup regression in RTM_GETADDR This commit fixes a regression in AF_INET/RTM_GETADDR and AF_INET6/RTM_GETADDR. Before this commit, the kernel would stop dumping addresses once the first skb was full and end the stream with NLMSG_DONE(-EMSGSIZE). The error shouldn't be sent back to netlink_dump so the callback is kept alive. The userspace is expected to call back with a new empty skb. Changes from V1: - The error is not handled in netlink_dump anymore but rather in inet_dump_ifaddr and inet6_dump_addr directly as suggested by David Ahern. Fixes: d7e38611b81e ("net/ipv4: Put target net when address dump fails due to bad attributes") Fixes: 242afaa6968c ("net/ipv6: Put target net when address dump fails due to bad attributes") Cc: David Ahern Cc: "David S . Miller" Cc: netdev@vger.kernel.org Signed-off-by: Arthur Gautier Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04ba321ae5ce..e258a00b4a3d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1826,7 +1826,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return err < 0 ? err : skb->len; + return skb->len ? : err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8eeec6eb2bd3..93d5ad2b1a69 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5154,7 +5154,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return err < 0 ? err : skb->len; + return skb->len ? : err; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) From 36352991835ce99e46b4441dd0eb6980f9a83e8f Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 2 Jan 2019 14:45:07 +0800 Subject: [PATCH 03/87] r8169: Add support for new Realtek Ethernet There are two new Realtek Ethernet devices which are re-branded r8168h. Add the IDs to to support them. Signed-off-by: Kai-Heng Feng Reviewed-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 298930d39b79..7fe9180261b8 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -205,6 +205,8 @@ enum cfg_version { }; static const struct pci_device_id rtl8169_pci_tbl[] = { + { PCI_VDEVICE(REALTEK, 0x2502), RTL_CFG_1 }, + { PCI_VDEVICE(REALTEK, 0x2600), RTL_CFG_1 }, { PCI_VDEVICE(REALTEK, 0x8129), RTL_CFG_0 }, { PCI_VDEVICE(REALTEK, 0x8136), RTL_CFG_2 }, { PCI_VDEVICE(REALTEK, 0x8161), RTL_CFG_1 }, From 8d68100ab4ad92560a16a68b72e068613ac4d573 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Thu, 3 Jan 2019 01:09:53 +0800 Subject: [PATCH 04/87] soc/fsl/qe: fix err handling of ucc_of_parse_tdm Currently there are some issues with the ucc_of_parse_tdm function: 1, a possible null pointer dereference in ucc_of_parse_tdm, detected by the semantic patch deref_null.cocci, with the following warning: drivers/soc/fsl/qe/qe_tdm.c:177:21-24: ERROR: pdev is NULL but dereferenced. 2, dev gets modified, so in any case that devm_iounmap() will fail even when the new pdev is valid, because the iomap was done with a different pdev. 3, there is no driver bind with the "fsl,t1040-qe-si" or "fsl,t1040-qe-siram" device. So allocating resources using devm_*() with these devices won't provide a cleanup path for these resources when the caller fails. This patch fixes them. Suggested-by: Li Yang Suggested-by: Christophe LEROY Signed-off-by: Wen Yang Reviewed-by: Peng Hao CC: Julia Lawall CC: Zhao Qiang CC: David S. Miller CC: netdev@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/wan/fsl_ucc_hdlc.c | 62 +++++++++++++++++++++++++++++++++- drivers/soc/fsl/qe/qe_tdm.c | 55 ------------------------------ 2 files changed, 61 insertions(+), 56 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 839fa7715709..f30a040efd2c 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1057,6 +1057,54 @@ static const struct net_device_ops uhdlc_ops = { .ndo_tx_timeout = uhdlc_tx_timeout, }; +static int hdlc_map_iomem(char *name, int init_flag, void __iomem **ptr) +{ + struct device_node *np; + struct platform_device *pdev; + struct resource *res; + static int siram_init_flag; + int ret = 0; + + np = of_find_compatible_node(NULL, NULL, name); + if (!np) + return -EINVAL; + + pdev = of_find_device_by_node(np); + if (!pdev) { + pr_err("%pOFn: failed to lookup pdev\n", np); + of_node_put(np); + return -EINVAL; + } + + of_node_put(np); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + ret = -EINVAL; + goto error_put_device; + } + *ptr = ioremap(res->start, resource_size(res)); + if (!*ptr) { + ret = -ENOMEM; + goto error_put_device; + } + + /* We've remapped the addresses, and we don't need the device any + * more, so we should release it. + */ + put_device(&pdev->dev); + + if (init_flag && siram_init_flag == 0) { + memset_io(*ptr, 0, resource_size(res)); + siram_init_flag = 1; + } + return 0; + +error_put_device: + put_device(&pdev->dev); + + return ret; +} + static int ucc_hdlc_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -1151,6 +1199,15 @@ static int ucc_hdlc_probe(struct platform_device *pdev) ret = ucc_of_parse_tdm(np, utdm, ut_info); if (ret) goto free_utdm; + + ret = hdlc_map_iomem("fsl,t1040-qe-si", 0, + (void __iomem **)&utdm->si_regs); + if (ret) + goto free_utdm; + ret = hdlc_map_iomem("fsl,t1040-qe-siram", 1, + (void __iomem **)&utdm->siram); + if (ret) + goto unmap_si_regs; } if (of_property_read_u16(np, "fsl,hmask", &uhdlc_priv->hmask)) @@ -1159,7 +1216,7 @@ static int ucc_hdlc_probe(struct platform_device *pdev) ret = uhdlc_init(uhdlc_priv); if (ret) { dev_err(&pdev->dev, "Failed to init uhdlc\n"); - goto free_utdm; + goto undo_uhdlc_init; } dev = alloc_hdlcdev(uhdlc_priv); @@ -1188,6 +1245,9 @@ static int ucc_hdlc_probe(struct platform_device *pdev) free_dev: free_netdev(dev); undo_uhdlc_init: + iounmap(utdm->siram); +unmap_si_regs: + iounmap(utdm->si_regs); free_utdm: if (uhdlc_priv->tsa) kfree(utdm); diff --git a/drivers/soc/fsl/qe/qe_tdm.c b/drivers/soc/fsl/qe/qe_tdm.c index f78c34647ca2..76480df195a8 100644 --- a/drivers/soc/fsl/qe/qe_tdm.c +++ b/drivers/soc/fsl/qe/qe_tdm.c @@ -44,10 +44,6 @@ int ucc_of_parse_tdm(struct device_node *np, struct ucc_tdm *utdm, const char *sprop; int ret = 0; u32 val; - struct resource *res; - struct device_node *np2; - static int siram_init_flag; - struct platform_device *pdev; sprop = of_get_property(np, "fsl,rx-sync-clock", NULL); if (sprop) { @@ -124,57 +120,6 @@ int ucc_of_parse_tdm(struct device_node *np, struct ucc_tdm *utdm, utdm->siram_entry_id = val; set_si_param(utdm, ut_info); - - np2 = of_find_compatible_node(NULL, NULL, "fsl,t1040-qe-si"); - if (!np2) - return -EINVAL; - - pdev = of_find_device_by_node(np2); - if (!pdev) { - pr_err("%pOFn: failed to lookup pdev\n", np2); - of_node_put(np2); - return -EINVAL; - } - - of_node_put(np2); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - utdm->si_regs = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(utdm->si_regs)) { - ret = PTR_ERR(utdm->si_regs); - goto err_miss_siram_property; - } - - np2 = of_find_compatible_node(NULL, NULL, "fsl,t1040-qe-siram"); - if (!np2) { - ret = -EINVAL; - goto err_miss_siram_property; - } - - pdev = of_find_device_by_node(np2); - if (!pdev) { - ret = -EINVAL; - pr_err("%pOFn: failed to lookup pdev\n", np2); - of_node_put(np2); - goto err_miss_siram_property; - } - - of_node_put(np2); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - utdm->siram = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(utdm->siram)) { - ret = PTR_ERR(utdm->siram); - goto err_miss_siram_property; - } - - if (siram_init_flag == 0) { - memset_io(utdm->siram, 0, resource_size(res)); - siram_init_flag = 1; - } - - return ret; - -err_miss_siram_property: - devm_iounmap(&pdev->dev, utdm->si_regs); return ret; } EXPORT_SYMBOL(ucc_of_parse_tdm); From f8c468e8537925e0c4607263f498a1b7c0c8982e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Jan 2019 13:01:43 -0800 Subject: [PATCH 05/87] net, skbuff: do not prefer skb allocation fails early Commit dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") replaced __GFP_REPEAT in alloc_skb_with_frags() with __GFP_RETRY_MAYFAIL when the allocation may directly reclaim. The previous behavior would require reclaim up to 1 << order pages for skb aligned header_len of order > PAGE_ALLOC_COSTLY_ORDER before failing, otherwise the allocations in alloc_skb() would loop in the page allocator looking for memory. __GFP_RETRY_MAYFAIL makes both allocations failable under memory pressure, including for the HEAD allocation. This can cause, among many other things, write() to fail with ENOTCONN during RPC when under memory pressure. These allocations should succeed as they did previous to dcda9b04713c even if it requires calling the oom killer and additional looping in the page allocator to find memory. There is no way to specify the previous behavior of __GFP_REPEAT, but it's unlikely to be necessary since the previous behavior only guaranteed that 1 << order pages would be reclaimed before failing for order > PAGE_ALLOC_COSTLY_ORDER. That reclaim is not guaranteed to be contiguous memory, so repeating for such large orders is usually not beneficial. Removing the setting of __GFP_RETRY_MAYFAIL to restore the previous behavior, specifically not allowing alloc_skb() to fail for small orders and oom kill if necessary rather than allowing RPCs to fail. Fixes: dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") Signed-off-by: David Rientjes Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 37317ffec146..26d848484912 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5270,7 +5270,6 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long chunk; struct sk_buff *skb; struct page *page; - gfp_t gfp_head; int i; *errcode = -EMSGSIZE; @@ -5280,12 +5279,8 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (npages > MAX_SKB_FRAGS) return NULL; - gfp_head = gfp_mask; - if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_RETRY_MAYFAIL; - *errcode = -ENOBUFS; - skb = alloc_skb(header_len, gfp_head); + skb = alloc_skb(header_len, gfp_mask); if (!skb) return NULL; From 2d533a9287f2011632977e87ce2783f4c689c984 Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Thu, 3 Jan 2019 12:02:39 +0200 Subject: [PATCH 06/87] qed: Fix qed_chain_set_prod() for PBL chains with non power of 2 page count In PBL chains with non power of 2 page count, the producer is not at the beginning of the chain when index is 0 after a wrap. Therefore, after the producer index wrap around, page index should be calculated more carefully. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 59ddf9af909e..2dd0a9ed5b36 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -663,6 +663,37 @@ out: static inline void qed_chain_set_prod(struct qed_chain *p_chain, u32 prod_idx, void *p_prod_elem) { + if (p_chain->mode == QED_CHAIN_MODE_PBL) { + u32 cur_prod, page_mask, page_cnt, page_diff; + + cur_prod = is_chain_u16(p_chain) ? p_chain->u.chain16.prod_idx : + p_chain->u.chain32.prod_idx; + + /* Assume that number of elements in a page is power of 2 */ + page_mask = ~p_chain->elem_per_page_mask; + + /* Use "cur_prod - 1" and "prod_idx - 1" since producer index + * reaches the first element of next page before the page index + * is incremented. See qed_chain_produce(). + * Index wrap around is not a problem because the difference + * between current and given producer indices is always + * positive and lower than the chain's capacity. + */ + page_diff = (((cur_prod - 1) & page_mask) - + ((prod_idx - 1) & page_mask)) / + p_chain->elem_per_page; + + page_cnt = qed_chain_get_page_cnt(p_chain); + if (is_chain_u16(p_chain)) + p_chain->pbl.c.u16.prod_page_idx = + (p_chain->pbl.c.u16.prod_page_idx - + page_diff + page_cnt) % page_cnt; + else + p_chain->pbl.c.u32.prod_page_idx = + (p_chain->pbl.c.u32.prod_page_idx - + page_diff + page_cnt) % page_cnt; + } + if (is_chain_u16(p_chain)) p_chain->u.chain16.prod_idx = (u16) prod_idx; else From 46721c3d9e273aea880e9ff835b0e1271e1cd2fb Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Thu, 3 Jan 2019 12:02:40 +0200 Subject: [PATCH 07/87] qed: Fix qed_ll2_post_rx_buffer_notify_fw() by adding a write memory barrier Make sure chain element is updated before ringing the doorbell. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 90afd514ffe1..d9237c65a838 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -1619,6 +1619,10 @@ static void qed_ll2_post_rx_buffer_notify_fw(struct qed_hwfn *p_hwfn, cq_prod = qed_chain_get_prod_idx(&p_rx->rcq_chain); rx_prod.bd_prod = cpu_to_le16(bd_prod); rx_prod.cqe_prod = cpu_to_le16(cq_prod); + + /* Make sure chain element is updated before ringing the doorbell */ + dma_wmb(); + DIRECT_REG_WR(p_rx->set_prod_addr, *((u32 *)&rx_prod)); } From a09b42ba1a5e1cbeb934fd94cb7b5b9018bf15c7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 3 Jan 2019 13:36:43 +0100 Subject: [PATCH 08/87] net: dsa: microchip: Drop unused GPIO includes This driver does not use the old GPIO includes so drop them. Signed-off-by: Linus Walleij Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz_common.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 3b12e2dcff31..8a5111f9414c 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -15,7 +14,6 @@ #include #include #include -#include #include #include #include From ba3e1847d6471f30241f11069d8f153ed8cb052b Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 3 Jan 2019 14:59:35 +0000 Subject: [PATCH 09/87] net: macb: remove unnecessary code Commit 653e92a9175e ("net: macb: add support for padding and fcs computation") introduced a bug fixed by commit 899ecaedd155 ("net: ethernet: cadence: fix socket buffer corruption problem"). Code removed in this patch is not reachable at all so remove it. Fixes: 653e92a9175e ("net: macb: add support for padding and fcs computation") Cc: Tristram Ha Signed-off-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index b126926ef7f5..66cc7927061a 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1738,12 +1738,8 @@ static int macb_pad_and_fcs(struct sk_buff **skb, struct net_device *ndev) *skb = nskb; } - if (padlen) { - if (padlen >= ETH_FCS_LEN) - skb_put_zero(*skb, padlen - ETH_FCS_LEN); - else - skb_trim(*skb, ETH_FCS_LEN - padlen); - } + if (padlen > ETH_FCS_LEN) + skb_put_zero(*skb, padlen - ETH_FCS_LEN); add_fcs: /* set FCS to packet */ From 41e4e2cd75346667b0c531c07dab05cce5b06d15 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Thu, 3 Jan 2019 09:51:57 -0800 Subject: [PATCH 10/87] openvswitch: Fix IPv6 later frags parsing The previous commit fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags") introduces IP protocol number parsing for IPv6 later frags that can mess up the network header length calculation logic, i.e. nh_len < 0. However, the network header length calculation is mainly for deriving the transport layer header in the key extraction process which the later fragment does not apply. Therefore, this commit skips the network header length calculation to fix the issue. Reported-by: Chris Mi Reported-by: Greg Rose Fixes: fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags") Signed-off-by: Yi-Hung Wei Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 57e07768c9d1..f54cf17ef7a8 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -276,10 +276,12 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); if (flags & IP6_FH_F_FRAG) { - if (frag_off) + if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - else - key->ip.frag = OVS_FRAG_TYPE_FIRST; + key->ip.proto = nexthdr; + return 0; + } + key->ip.frag = OVS_FRAG_TYPE_FIRST; } else { key->ip.frag = OVS_FRAG_TYPE_NONE; } From bc6e019b6ee65ff4ebf3ca272f774cf6c67db669 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 3 Jan 2019 21:43:34 +0100 Subject: [PATCH 11/87] fou: Prevent unbounded recursion in GUE error handler also with UDP-Lite In commit 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler"), I didn't take care of the case where UDP-Lite is encapsulated into UDP or UDP-Lite with GUE. From a syzbot report about a possibly similar issue with GUE on IPv6, I just realised the same thing might happen with a UDP-Lite inner payload. Also skip exception handling for inner UDP-Lite protocol. Fixes: 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/fou.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 0c9f171fb085..632863541082 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1065,7 +1065,8 @@ static int gue_err(struct sk_buff *skb, u32 info) * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ - if (guehdr->proto_ctype == IPPROTO_UDP) + if (guehdr->proto_ctype == IPPROTO_UDP || + guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); From 44039e00171b0fe930c07ff7b43e6023eaf1ed31 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 3 Jan 2019 21:43:35 +0100 Subject: [PATCH 12/87] fou6: Prevent unbounded recursion in GUE error handler I forgot to deal with IPv6 in commit 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler"). Now syzbot reported what might be the same type of issue, caused by gue6_err(), that is, handling exceptions for direct UDP encapsulation in GUE (UDP-in-UDP) leads to unbounded recursion in the GUE exception handler. As it probably doesn't make sense to set up GUE this way, and it's currently not even possible to configure this, skip exception handling for UDP (or UDP-Lite) packets encapsulated in UDP (or UDP-Lite) packets with GUE on IPv6. Reported-by: syzbot+4ad25edc7a33e4ab91e0@syzkaller.appspotmail.com Reported-by: Willem de Bruijn Reported-by: Eric Dumazet Fixes: b8a51b38e4d4 ("fou, fou6: ICMP error handlers for FoU and GUE") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/fou6.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index bd675c61deb1..7da7bf3b7fe3 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -131,6 +131,14 @@ static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (validate_gue_flags(guehdr, optlen)) return -EINVAL; + /* Handling exceptions for direct UDP encapsulation in GUE would lead to + * recursion. Besides, this kind of encapsulation can't even be + * configured currently. Discard this. + */ + if (guehdr->proto_ctype == IPPROTO_UDP || + guehdr->proto_ctype == IPPROTO_UDPLITE) + return -EOPNOTSUPP; + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, opt, type, code, offset, info); From cff1e01f16f84083e5b4e3a98331ba405fb3fbb5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 3 Jan 2019 22:31:32 +0100 Subject: [PATCH 13/87] net: dsa: mt7530: Drop unused GPIO include This driver uses GPIO descriptors only, is not used so drop the include. Signed-off-by: Linus Walleij Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 74547f43b938..a8a2c728afba 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include From c77804be53369dd4c15bfc376cf9b45948194cab Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 4 Jan 2019 20:18:10 +0800 Subject: [PATCH 14/87] net: hns: Fix WARNING when hns modules installed Commit 308c6cafde01 ("net: hns: All ports can not work when insmod hns ko after rmmod.") add phy_stop in hns_nic_init_phy(), In the branch of "net", this method is effective, but in the branch of "net-next", it will cause a WARNING when hns modules loaded, reference to commit 2b3e88ea6528 ("net: phy: improve phy state checking"): [10.092168] ------------[ cut here ]------------ [10.092171] called from state READY [10.092189] WARNING: CPU: 4 PID: 1 at ../drivers/net/phy/phy.c:854 phy_stop+0x90/0xb0 [10.092192] Modules linked in: [10.092197] CPU: 4 PID:1 Comm:swapper/0 Not tainted 4.20.0-rc7-next-20181220 #1 [10.092200] Hardware name: Huawei TaiShan 2280 /D05, BIOS Hisilicon D05 UEFI 16.12 Release 05/15/2017 [10.092202] pstate: 60000005 (nZCv daif -PAN -UAO) [10.092205] pc : phy_stop+0x90/0xb0 [10.092208] lr : phy_stop+0x90/0xb0 [10.092209] sp : ffff00001159ba90 [10.092212] x29: ffff00001159ba90 x28: 0000000000000007 [10.092215] x27: ffff000011180068 x26: ffff0000110a5620 [10.092218] x25: ffff0000113b6000 x24: ffff842f96dac000 [10.092221] x23: 0000000000000000 x22: 0000000000000000 [10.092223] x21: ffff841fb8425e18 x20: ffff801fb3a56438 [10.092226] x19: ffff801fb3a56000 x18: ffffffffffffffff [10.092228] x17: 0000000000000000 x16: 0000000000000000 [10.092231] x15: ffff00001122d6c8 x14: ffff00009159b7b7 [10.092234] x13: ffff00001159b7c5 x12: ffff000011245000 [10.092236] x11: 0000000005f5e0ff x10: ffff00001159b750 [10.092239] x9 : 00000000ffffffd0 x8 : 0000000000000465 [10.092242] x7 : ffff0000112457f8 x6 : ffff0000113bd7ce [10.092245] x5 : 0000000000000000 x4 : 0000000000000000 [10.092247] x3 : 00000000ffffffff x2 : ffff000011245828 [10.092250] x1 : 4b5860bd05871300 x0 : 0000000000000000 [10.092253] Call trace: [10.092255] phy_stop+0x90/0xb0 [10.092260] hns_nic_init_phy+0xf8/0x110 [10.092262] hns_nic_try_get_ae+0x4c/0x3b0 [10.092264] hns_nic_dev_probe+0x1fc/0x480 [10.092268] platform_drv_probe+0x50/0xa0 [10.092271] really_probe+0x1f4/0x298 [10.092273] driver_probe_device+0x58/0x108 [10.092275] __driver_attach+0xdc/0xe0 [10.092278] bus_for_each_dev+0x74/0xc8 [10.092280] driver_attach+0x20/0x28 [10.092283] bus_add_driver+0x1b8/0x228 [10.092285] driver_register+0x60/0x110 [10.092288] __platform_driver_register+0x40/0x48 [10.092292] hns_nic_dev_driver_init+0x18/0x20 [10.092296] do_one_initcall+0x5c/0x180 [10.092299] kernel_init_freeable+0x198/0x240 [10.092303] kernel_init+0x10/0x108 [10.092306] ret_from_fork+0x10/0x18 [10.092308] ---[ end trace 1396dd0278e397eb ]--- This WARNING occurred because of calling phy_stop before phy_start. The root cause of the problem in commit '308c6cafde01' is: Reference to hns_nic_init_phy, the flag phydev->supported is changed after phy_connect_direct. The flag phydev->supported is 0x6ff when hns modules is loaded, so will not change Fiber Port power(Reference to marvell.c), which is power on at default. Then the flag phydev->supported is changed to 0x6f, so Fiber Port power is off when removing hns modules. When hns modules installed again, the flag phydev->supported is default value 0x6ff, so will not change Fiber Port power(now is off), causing mac link not up problem. So the solution is change phy flags before phy_connect_direct. Fixes: 308c6cafde01 ("net: hns: All ports can not work when insmod hns ko after rmmod.") Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 5748d3f722f6..5b33238c6680 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1170,6 +1170,13 @@ int hns_nic_init_phy(struct net_device *ndev, struct hnae_handle *h) if (!h->phy_dev) return 0; + ethtool_convert_legacy_u32_to_link_mode(supported, h->if_support); + linkmode_and(phy_dev->supported, phy_dev->supported, supported); + linkmode_copy(phy_dev->advertising, phy_dev->supported); + + if (h->phy_if == PHY_INTERFACE_MODE_XGMII) + phy_dev->autoneg = false; + if (h->phy_if != PHY_INTERFACE_MODE_XGMII) { phy_dev->dev_flags = 0; @@ -1181,16 +1188,6 @@ int hns_nic_init_phy(struct net_device *ndev, struct hnae_handle *h) if (unlikely(ret)) return -ENODEV; - ethtool_convert_legacy_u32_to_link_mode(supported, h->if_support); - linkmode_and(phy_dev->supported, phy_dev->supported, supported); - linkmode_copy(phy_dev->advertising, phy_dev->supported); - - if (h->phy_if == PHY_INTERFACE_MODE_XGMII) - phy_dev->autoneg = false; - - if (h->phy_if == PHY_INTERFACE_MODE_SGMII) - phy_stop(phy_dev); - return 0; } From bb989501abcafa0de5f18b0ec0ec459b5b817908 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 4 Jan 2019 20:18:11 +0800 Subject: [PATCH 15/87] net: hns: Fix use after free identified by SLUB debug When enable SLUB debug, than remove hns_enet_drv module, SLUB debug will identify a use after free bug: [134.189505] Unable to handle kernel paging request at virtual address 006b6b6b6b6b6b6b [134.197553] Mem abort info: [134.200381] ESR = 0x96000004 [134.203487] Exception class = DABT (current EL), IL = 32 bits [134.209497] SET = 0, FnV = 0 [134.212596] EA = 0, S1PTW = 0 [134.215777] Data abort info: [134.218701] ISV = 0, ISS = 0x00000004 [134.222596] CM = 0, WnR = 0 [134.225606] [006b6b6b6b6b6b6b] address between user and kernel address ranges [134.232851] Internal error: Oops: 96000004 [#1] SMP [134.237798] CPU: 21 PID: 27834 Comm: rmmod Kdump: loaded Tainted: G OE 4.19.5-1.2.34.aarch64 #1 [134.247856] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.58 10/24/2018 [134.255181] pstate: 20000005 (nzCv daif -PAN -UAO) [134.260044] pc : hns_ae_put_handle+0x38/0x60 [134.264372] lr : hns_ae_put_handle+0x24/0x60 [134.268700] sp : ffff00001be93c50 [134.272054] x29: ffff00001be93c50 x28: ffff802faaec8040 [134.277442] x27: 0000000000000000 x26: 0000000000000000 [134.282830] x25: 0000000056000000 x24: 0000000000000015 [134.288284] x23: ffff0000096fe098 x22: ffff000001050070 [134.293671] x21: ffff801fb3c044a0 x20: ffff80afb75ec098 [134.303287] x19: ffff80afb75ec098 x18: 0000000000000000 [134.312945] x17: 0000000000000000 x16: 0000000000000000 [134.322517] x15: 0000000000000002 x14: 0000000000000000 [134.332030] x13: dead000000000100 x12: ffff7e02bea3c988 [134.341487] x11: ffff80affbee9e68 x10: 0000000000000000 [134.351033] x9 : 6fffff8000008101 x8 : 0000000000000000 [134.360569] x7 : dead000000000100 x6 : ffff000009579748 [134.370059] x5 : 0000000000210d00 x4 : 0000000000000000 [134.379550] x3 : 0000000000000001 x2 : 0000000000000000 [134.388813] x1 : 6b6b6b6b6b6b6b6b x0 : 0000000000000000 [134.397993] Process rmmod (pid: 27834, stack limit = 0x00000000d474b7fd) [134.408498] Call trace: [134.414611] hns_ae_put_handle+0x38/0x60 [134.422208] hnae_put_handle+0xd4/0x108 [134.429563] hns_nic_dev_remove+0x60/0xc0 [hns_enet_drv] [134.438342] platform_drv_remove+0x2c/0x70 [134.445958] device_release_driver_internal+0x174/0x208 [134.454810] driver_detach+0x70/0xd8 [134.461913] bus_remove_driver+0x64/0xe8 [134.469396] driver_unregister+0x34/0x60 [134.476822] platform_driver_unregister+0x20/0x30 [134.485130] hns_nic_dev_driver_exit+0x14/0x6e4 [hns_enet_drv] [134.494634] __arm64_sys_delete_module+0x238/0x290 struct hnae_handle is a member of struct hnae_vf_cb, so when vf_cb is freed, than use hnae_handle will cause use after free panic. This patch frees vf_cb after hnae_handle used. Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c index ad1779fc410e..a78bfafd212c 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c @@ -147,12 +147,10 @@ static void hns_ae_put_handle(struct hnae_handle *handle) struct hnae_vf_cb *vf_cb = hns_ae_get_vf_cb(handle); int i; - vf_cb->mac_cb = NULL; - - kfree(vf_cb); - for (i = 0; i < handle->q_num; i++) hns_ae_get_ring_pair(handle->qs[i])->used_by_vf = 0; + + kfree(vf_cb); } static int hns_ae_wait_flow_down(struct hnae_handle *handle) From f87118d5760f00af7228033fbe783c7f380d2866 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 4 Jan 2019 13:26:10 +0100 Subject: [PATCH 16/87] qmi_wwan: add MTU default to qmap network interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds MTU default value to qmap network interface in order to avoid "RTNETLINK answers: No buffer space available" error when setting an ipv6 address. Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 774e1ff01c9a..735ad838e2ba 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -123,6 +123,7 @@ static void qmimux_setup(struct net_device *dev) dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &qmimux_netdev_ops; + dev->mtu = 1500; dev->needs_free_netdev = true; } From 3271a4821882a64214acc1bd7b173900ec70c9bf Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 4 Jan 2019 09:43:08 -0800 Subject: [PATCH 17/87] selftests: net: fix/improve ip_defrag selftest Commit ade446403bfb ("net: ipv4: do not handle duplicate fragments as overlapping") changed IPv4 defragmentation so that duplicate fragments, as well as _some_ fragments completely covered by previously delivered fragments, do not lead to the whole frag queue being discarded. This makes the existing ip_defrag selftest flaky. This patch * makes sure that negative IPv4 defrag tests generate truly overlapping fragments that trigger defrag queue drops; * tests that duplicate IPv4 fragments do not trigger defrag queue drops; * makes a couple of minor tweaks to the test aimed at increasing its code coverage and reduce flakiness. Signed-off-by: Peter Oskolkov Signed-off-by: David S. Miller --- tools/testing/selftests/net/ip_defrag.c | 96 +++++++++++++++++++++--- tools/testing/selftests/net/ip_defrag.sh | 9 ++- 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c index 61ae2782388e..5d56cc0838f6 100644 --- a/tools/testing/selftests/net/ip_defrag.c +++ b/tools/testing/selftests/net/ip_defrag.c @@ -203,6 +203,7 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, { struct ip *iphdr = (struct ip *)ip_frame; struct ip6_hdr *ip6hdr = (struct ip6_hdr *)ip_frame; + const bool ipv4 = !ipv6; int res; int offset; int frag_len; @@ -239,19 +240,53 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, iphdr->ip_sum = 0; } + /* Occasionally test in-order fragments. */ + if (!cfg_overlap && (rand() % 100 < 15)) { + offset = 0; + while (offset < (UDP_HLEN + payload_len)) { + send_fragment(fd_raw, addr, alen, offset, ipv6); + offset += max_frag_len; + } + return; + } + + /* Occasionally test IPv4 "runs" (see net/ipv4/ip_fragment.c) */ + if (ipv4 && !cfg_overlap && (rand() % 100 < 20) && + (payload_len > 9 * max_frag_len)) { + offset = 6 * max_frag_len; + while (offset < (UDP_HLEN + payload_len)) { + send_fragment(fd_raw, addr, alen, offset, ipv6); + offset += max_frag_len; + } + offset = 3 * max_frag_len; + while (offset < 6 * max_frag_len) { + send_fragment(fd_raw, addr, alen, offset, ipv6); + offset += max_frag_len; + } + offset = 0; + while (offset < 3 * max_frag_len) { + send_fragment(fd_raw, addr, alen, offset, ipv6); + offset += max_frag_len; + } + return; + } + /* Odd fragments. */ offset = max_frag_len; while (offset < (UDP_HLEN + payload_len)) { send_fragment(fd_raw, addr, alen, offset, ipv6); + /* IPv4 ignores duplicates, so randomly send a duplicate. */ + if (ipv4 && (1 == rand() % 100)) + send_fragment(fd_raw, addr, alen, offset, ipv6); offset += 2 * max_frag_len; } if (cfg_overlap) { /* Send an extra random fragment. */ - offset = rand() % (UDP_HLEN + payload_len - 1); - /* sendto() returns EINVAL if offset + frag_len is too small. */ if (ipv6) { struct ip6_frag *fraghdr = (struct ip6_frag *)(ip_frame + IP6_HLEN); + /* sendto() returns EINVAL if offset + frag_len is too small. */ + offset = rand() % (UDP_HLEN + payload_len - 1); frag_len = max_frag_len + rand() % 256; /* In IPv6 if !!(frag_len % 8), the fragment is dropped. */ frag_len &= ~0x7; @@ -259,13 +294,29 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, ip6hdr->ip6_plen = htons(frag_len); frag_len += IP6_HLEN; } else { - frag_len = IP4_HLEN + UDP_HLEN + rand() % 256; + /* In IPv4, duplicates and some fragments completely inside + * previously sent fragments are dropped/ignored. So + * random offset and frag_len can result in a dropped + * fragment instead of a dropped queue/packet. So we + * hard-code offset and frag_len. + * + * See ade446403bfb ("net: ipv4: do not handle duplicate + * fragments as overlapping"). + */ + if (max_frag_len * 4 < payload_len || max_frag_len < 16) { + /* not enough payload to play with random offset and frag_len. */ + offset = 8; + frag_len = IP4_HLEN + UDP_HLEN + max_frag_len; + } else { + offset = rand() % (payload_len / 2); + frag_len = 2 * max_frag_len + 1 + rand() % 256; + } iphdr->ip_off = htons(offset / 8 | IP4_MF); iphdr->ip_len = htons(frag_len); } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); if (res < 0) - error(1, errno, "sendto overlap"); + error(1, errno, "sendto overlap: %d", frag_len); if (res != frag_len) error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len); frag_counter++; @@ -275,6 +326,9 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, offset = 0; while (offset < (UDP_HLEN + payload_len)) { send_fragment(fd_raw, addr, alen, offset, ipv6); + /* IPv4 ignores duplicates, so randomly send a duplicate. */ + if (ipv4 && (1 == rand() % 100)) + send_fragment(fd_raw, addr, alen, offset, ipv6); offset += 2 * max_frag_len; } } @@ -282,7 +336,11 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, static void run_test(struct sockaddr *addr, socklen_t alen, bool ipv6) { int fd_tx_raw, fd_rx_udp; - struct timeval tv = { .tv_sec = 0, .tv_usec = 10 * 1000 }; + /* Frag queue timeout is set to one second in the calling script; + * socket timeout should be just a bit longer to avoid tests interfering + * with each other. + */ + struct timeval tv = { .tv_sec = 1, .tv_usec = 10 }; int idx; int min_frag_len = ipv6 ? 1280 : 8; @@ -308,12 +366,32 @@ static void run_test(struct sockaddr *addr, socklen_t alen, bool ipv6) payload_len += (rand() % 4096)) { if (cfg_verbose) printf("payload_len: %d\n", payload_len); - max_frag_len = min_frag_len; - do { + + if (cfg_overlap) { + /* With overlaps, one send/receive pair below takes + * at least one second (== timeout) to run, so there + * is not enough test time to run a nested loop: + * the full overlap test takes 20-30 seconds. + */ + max_frag_len = min_frag_len + + rand() % (1500 - FRAG_HLEN - min_frag_len); send_udp_frags(fd_tx_raw, addr, alen, ipv6); recv_validate_udp(fd_rx_udp); - max_frag_len += 8 * (rand() % 8); - } while (max_frag_len < (1500 - FRAG_HLEN) && max_frag_len <= payload_len); + } else { + /* Without overlaps, each packet reassembly (== one + * send/receive pair below) takes very little time to + * run, so we can easily afford more thourough testing + * with a nested loop: the full non-overlap test takes + * less than one second). + */ + max_frag_len = min_frag_len; + do { + send_udp_frags(fd_tx_raw, addr, alen, ipv6); + recv_validate_udp(fd_rx_udp); + max_frag_len += 8 * (rand() % 8); + } while (max_frag_len < (1500 - FRAG_HLEN) && + max_frag_len <= payload_len); + } } /* Cleanup. */ diff --git a/tools/testing/selftests/net/ip_defrag.sh b/tools/testing/selftests/net/ip_defrag.sh index f34672796044..7dd79a9efb17 100755 --- a/tools/testing/selftests/net/ip_defrag.sh +++ b/tools/testing/selftests/net/ip_defrag.sh @@ -11,10 +11,17 @@ readonly NETNS="ns-$(mktemp -u XXXXXX)" setup() { ip netns add "${NETNS}" ip -netns "${NETNS}" link set lo up + ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_high_thresh=9000000 >/dev/null 2>&1 ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_low_thresh=7000000 >/dev/null 2>&1 + ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_time=1 >/dev/null 2>&1 + ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_high_thresh=9000000 >/dev/null 2>&1 ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_low_thresh=7000000 >/dev/null 2>&1 + ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_time=1 >/dev/null 2>&1 + + # DST cache can get full with a lot of frags, with GC not keeping up with the test. + ip netns exec "${NETNS}" sysctl -w net.ipv6.route.max_size=65536 >/dev/null 2>&1 } cleanup() { @@ -27,7 +34,6 @@ setup echo "ipv4 defrag" ip netns exec "${NETNS}" ./ip_defrag -4 - echo "ipv4 defrag with overlaps" ip netns exec "${NETNS}" ./ip_defrag -4o @@ -37,3 +43,4 @@ ip netns exec "${NETNS}" ./ip_defrag -6 echo "ipv6 defrag with overlaps" ip netns exec "${NETNS}" ./ip_defrag -6o +echo "all tests done" From 8d933670452107e41165bea70a30dffbd281bef1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Jan 2019 11:00:00 -0800 Subject: [PATCH 18/87] ipv6: make icmp6_send() robust against null skb->dev syzbot was able to crash one host with the following stack trace : kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 8625 Comm: syz-executor4 Not tainted 4.20.0+ #8 RIP: 0010:dev_net include/linux/netdevice.h:2169 [inline] RIP: 0010:icmp6_send+0x116/0x2d30 net/ipv6/icmp.c:426 icmpv6_send smack_socket_sock_rcv_skb security_sock_rcv_skb sk_filter_trim_cap __sk_receive_skb dccp_v6_do_rcv release_sock This is because a RX packet found socket owned by user and was stored into socket backlog. Before leaving RCU protected section, skb->dev was cleared in __sk_receive_skb(). When socket backlog was finally handled at release_sock() time, skb was fed to smack_socket_sock_rcv_skb() then icmp6_send() We could fix the bug in smack_socket_sock_rcv_skb(), or simply make icmp6_send() more robust against such possibility. In the future we might provide to icmp6_send() the net pointer instead of infering it. Fixes: d66a8acbda92 ("Smack: Inform peer that IPv6 traffic has been blocked") Signed-off-by: Eric Dumazet Cc: Piotr Sawicki Cc: Casey Schaufler Reported-by: syzbot Acked-by: Casey Schaufler Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5d7aa2c2770c..bbcdfd299692 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -423,10 +423,10 @@ static int icmp6_iif(const struct sk_buff *skb) static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr) { - struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; + struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct dst_entry *dst; @@ -437,12 +437,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - u32 mark = IP6_REPLY_MARK(net, skb->mark); + u32 mark; if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; + if (!skb->dev) + return; + net = dev_net(skb->dev); + mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) From ae84e4a8eb6f0d7f3b902ce238f285e98cf2ac12 Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Fri, 4 Jan 2019 10:48:02 -0800 Subject: [PATCH 19/87] ixgbe: fix Kconfig when driver is not a module The new ability added to the driver to use mii_bus to handle MII related ioctls is causing compile issues when the driver is compiled into the kernel (i.e. not a module). The problem was in selecting MDIO_DEVICE instead of the preferred PHYLIB Kconfig option. The reason being that MDIO_DEVICE had a dependency on PHYLIB and would be compiled as a module when PHYLIB was a module, no matter whether ixgbe was compiled into the kernel. CC: Dave Jones CC: Steve Douthit CC: Florian Fainelli Signed-off-by: Jeff Kirsher Reviewed-by: Stephen Douthit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 31fb76ee9d82..a1246e89aad4 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -159,7 +159,7 @@ config IXGBE tristate "Intel(R) 10GbE PCI Express adapters support" depends on PCI select MDIO - select MDIO_DEVICE + select PHYLIB imply PTP_1588_CLOCK ---help--- This driver supports Intel(R) 10GbE PCI Express family of From ec90ad334986fa5856d11dd272f7f22fa86c55c4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 Jan 2019 16:58:15 -0800 Subject: [PATCH 20/87] ipv6: Consider sk_bound_dev_if when binding a socket to a v4 mapped address Similar to c5ee066333eb ("ipv6: Consider sk_bound_dev_if when binding a socket to an address"), binding a socket to v4 mapped addresses needs to consider if the socket is bound to a device. This problem also exists from the beginning of git history. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0bfb6cc0a30a..93288b9f1697 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -310,6 +310,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { + struct net_device *dev = NULL; int chk_addr_ret; /* Binding to v4-mapped address on a v6-only socket @@ -320,9 +321,17 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, goto out; } + if (sk->sk_bound_dev_if) { + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; - chk_addr_ret = inet_addr_type(net, v4addr); + chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && From e8e36984080b55ac5e57bdb09a5b570f2fc8e963 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 4 Jan 2019 01:07:07 -0800 Subject: [PATCH 21/87] bpf: Fix [::] -> [::1] rewrite in sys_sendmsg sys_sendmsg has supported unspecified destination IPv6 (wildcard) for unconnected UDP sockets since 876c7f41. When [::] is passed by user as destination, sys_sendmsg rewrites it with [::1] to be consistent with BSD (see "BSD'ism" comment in the code). This didn't work when cgroup-bpf was enabled though since the rewrite [::] -> [::1] happened before passing control to cgroup-bpf block where fl6.daddr was updated with passed by user sockaddr_in6.sin6_addr (that might or might not be changed by BPF program). That way if user passed [::] as dst IPv6 it was first rewritten with [::1] by original code from 876c7f41, but then rewritten back with [::] by cgroup-bpf block. It happened even when BPF_CGROUP_UDP6_SENDMSG program was not present (CONFIG_CGROUP_BPF=y was enough). The fix is to apply BSD'ism after cgroup-bpf block so that [::] is replaced with [::1] no matter where it came from: passed by user to sys_sendmsg or set by BPF_CGROUP_UDP6_SENDMSG program. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Reported-by: Nitin Rawat Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- net/ipv6/udp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9cbf363172bd..7c3505006f8e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1390,10 +1390,7 @@ do_udp_sendmsg: ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; - if (!ipv6_addr_any(daddr)) - fl6.daddr = *daddr; - else - fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + fl6.daddr = *daddr; if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; @@ -1421,6 +1418,9 @@ do_udp_sendmsg: } } + if (ipv6_addr_any(&fl6.daddr)) + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + final_p = fl6_update_dst(&fl6, opt, &final); if (final_p) connected = false; From 976b4f3a4646fbf0d189caca25f91f82e4be4b5a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 4 Jan 2019 01:07:08 -0800 Subject: [PATCH 22/87] selftests/bpf: Test [::] -> [::1] rewrite in sys_sendmsg in test_sock_addr Test that sys_sendmsg BPF hook doesn't break sys_sendmsg behaviour to rewrite destination IPv6 = [::] with [::1] (BSD'ism). Two test cases are added: 1) User passes dst IPv6 = [::] and BPF_CGROUP_UDP6_SENDMSG program doesn't touch it. 2) User passes dst IPv6 != [::], but BPF_CGROUP_UDP6_SENDMSG program rewrites it with [::]. In both cases [::1] is used by sys_sendmsg code eventually and datagram is sent successfully for unconnected UDP socket. Example of relevant output: Test case: sendmsg6: set dst IP = [::] (BSD'ism) .. [PASS] Test case: sendmsg6: preserve dst IP = [::] (BSD'ism) .. [PASS] Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_sock_addr.c | 53 ++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 73b7493d4120..d94336cbd8bd 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -44,6 +44,7 @@ #define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" #define SRC6_IP "::1" #define SRC6_REWRITE_IP "::6" +#define WILDCARD6_IP "::" #define SERV6_PORT 6060 #define SERV6_REWRITE_PORT 6666 @@ -85,12 +86,14 @@ static int bind4_prog_load(const struct sock_addr_test *test); static int bind6_prog_load(const struct sock_addr_test *test); static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); +static int sendmsg_allow_prog_load(const struct sock_addr_test *test); static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); +static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { /* bind */ @@ -462,6 +465,34 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_ENOTSUPP, }, + { + "sendmsg6: set dst IP = [::] (BSD'ism)", + sendmsg6_rw_wildcard_prog_load, + BPF_CGROUP_UDP6_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, + { + "sendmsg6: preserve dst IP = [::] (BSD'ism)", + sendmsg_allow_prog_load, + BPF_CGROUP_UDP6_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { "sendmsg6: deny call", sendmsg_deny_prog_load, @@ -734,16 +765,27 @@ static int connect6_prog_load(const struct sock_addr_test *test) return load_path(test, CONNECT6_PROG_PATH); } -static int sendmsg_deny_prog_load(const struct sock_addr_test *test) +static int sendmsg_ret_only_prog_load(const struct sock_addr_test *test, + int32_t rc) { struct bpf_insn insns[] = { - /* return 0 */ - BPF_MOV64_IMM(BPF_REG_0, 0), + /* return rc */ + BPF_MOV64_IMM(BPF_REG_0, rc), BPF_EXIT_INSN(), }; return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); } +static int sendmsg_allow_prog_load(const struct sock_addr_test *test) +{ + return sendmsg_ret_only_prog_load(test, /*rc*/ 1); +} + +static int sendmsg_deny_prog_load(const struct sock_addr_test *test) +{ + return sendmsg_ret_only_prog_load(test, /*rc*/ 0); +} + static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; @@ -864,6 +906,11 @@ static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); } +static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) +{ + return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); +} + static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test) { return load_path(test, SENDMSG6_PROG_PATH); From d4a7e9bb74b5aaf07b89f6531c080b1130bdf019 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 5 Jan 2019 07:35:04 -0800 Subject: [PATCH 23/87] ipv6: Take rcu_read_lock in __inet6_bind for mapped addresses I realized the last patch calls dev_get_by_index_rcu in a branch not holding the rcu lock. Add the calls to rcu_read_lock and rcu_read_unlock. Fixes: ec90ad334986 ("ipv6: Consider sk_bound_dev_if when binding a socket to a v4 mapped address") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 93288b9f1697..d99753b5e39b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -321,17 +321,20 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, goto out; } + rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; - goto out; + goto out_unlock; } } /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); + rcu_read_unlock(); + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && From d3bd7413e0ca40b60cf60d4003246d067cafdeda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 6 Jan 2019 00:54:37 +0100 Subject: [PATCH 24/87] bpf: fix sanitation of alu op with pointer / scalar type from different paths While 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") took care of rejecting alu op on pointer when e.g. pointer came from two different map values with different map properties such as value size, Jann reported that a case was not covered yet when a given alu op is used in both "ptr_reg += reg" and "numeric_reg += reg" from different branches where we would incorrectly try to sanitize based on the pointer's limit. Catch this corner case and reject the program instead. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 61 ++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 27b74947cd2b..573cca00a0e6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -172,6 +172,7 @@ struct bpf_verifier_state_list { #define BPF_ALU_SANITIZE_SRC 1U #define BPF_ALU_SANITIZE_DST 2U #define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_NON_POINTER (1U << 3) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6bc62a9ee8e..56674a7c3778 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3103,6 +3103,40 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, } } +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -3117,7 +3151,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_reg_state tmp; bool ret; - if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + if (can_skip_alu_sanitation(env, insn)) return 0; /* We already marked aux for masking from non-speculative @@ -3133,19 +3167,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) return 0; - - /* If we arrived here from different branches with different - * limits to sanitize, then this won't work. - */ - if (aux->alu_state && - (aux->alu_state != alu_state || - aux->alu_limit != alu_limit)) + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) return -EACCES; - - /* Corresponding fixup done in fixup_bpf_calls(). */ - aux->alu_state = alu_state; - aux->alu_limit = alu_limit; - do_sim: /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of @@ -3418,6 +3441,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -3452,6 +3477,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -3471,6 +3501,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ From 1cbbcfbbd56efd994d643428c69467fe3c8ab672 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 6 Jan 2019 00:54:38 +0100 Subject: [PATCH 25/87] bpf: add various test cases for alu op on mixed dst register types Add couple of test_verifier tests to check sanitation of alu op insn with pointer and scalar type coming from different paths. This also includes BPF insns of the test reproducer provided by Jann Horn. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 120 ++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 10d44446e801..2fd90d456892 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6933,6 +6933,126 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 1, }, + { + "map access: mixing value pointer and scalar, 1", + .insns = { + // load map value pointer into r0 and r2 + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16), + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + // load some number from the map into r1 + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + // depending on r1, branch: + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 3), + // branch A + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_A(2), + // branch B + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0x100000), + // common instruction + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), + // depending on r1, branch: + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), + // branch A + BPF_JMP_A(4), + // branch B + BPF_MOV64_IMM(BPF_REG_0, 0x13371337), + // verifier follows fall-through + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0x100000, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + // fake-dead code; targeted from branch A to + // prevent dead code sanitization + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 1 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R2 tried to add from different pointers or scalars", + .retval = 0, + }, + { + "map access: mixing value pointer and scalar, 2", + .insns = { + // load map value pointer into r0 and r2 + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16), + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + // load some number from the map into r1 + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + // depending on r1, branch: + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), + // branch A + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0x100000), + BPF_JMP_A(2), + // branch B + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 0), + // common instruction + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), + // depending on r1, branch: + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), + // branch A + BPF_JMP_A(4), + // branch B + BPF_MOV64_IMM(BPF_REG_0, 0x13371337), + // verifier follows fall-through + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0x100000, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + // fake-dead code; targeted from branch A to + // prevent dead code sanitization + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 1 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R2 tried to add from different maps or paths", + .retval = 0, + }, + { + "sanitation: alu with different scalars", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16), + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0x100000), + BPF_JMP_A(2), + BPF_MOV64_IMM(BPF_REG_2, 42), + BPF_MOV64_IMM(BPF_REG_3, 0x100001), + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 1 }, + .result = ACCEPT, + .retval = 0x100000, + }, { "map access: value_ptr += known scalar, upper oob arith, test 1", .insns = { From f65e192af35058e5c82da9e90871b472d24912bc Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Thu, 3 Jan 2019 10:23:23 -0700 Subject: [PATCH 26/87] net/mlx4: Get rid of page operation after dma_alloc_coherent This patch solves a crash at the time of mlx4 driver unload or system shutdown. The crash occurs because dma_alloc_coherent() returns one value in mlx4_alloc_icm_coherent(), but a different value is passed to dma_free_coherent() in mlx4_free_icm_coherent(). In turn this is because when allocated, that pointer is passed to sg_set_buf() to record it, then when freed it is re-calculated by calling lowmem_page_address(sg_page()) which returns a different value. Solve this by recording the value that dma_alloc_coherent() returns, and passing this to dma_free_coherent(). This patch is roughly equivalent to commit 378efe798ecf ("RDMA/hns: Get rid of page operation after dma_alloc_coherent"). Based-on-code-from: Christoph Hellwig Signed-off-by: Stephen Warren Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/icm.c | 94 ++++++++++++++---------- drivers/net/ethernet/mellanox/mlx4/icm.h | 22 +++++- 2 files changed, 76 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 4b4351141b94..76b84d08a058 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -57,12 +57,12 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu int i; if (chunk->nsg > 0) - pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages, + pci_unmap_sg(dev->persist->pdev, chunk->sg, chunk->npages, PCI_DMA_BIDIRECTIONAL); for (i = 0; i < chunk->npages; ++i) - __free_pages(sg_page(&chunk->mem[i]), - get_order(chunk->mem[i].length)); + __free_pages(sg_page(&chunk->sg[i]), + get_order(chunk->sg[i].length)); } static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) @@ -71,9 +71,9 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk * for (i = 0; i < chunk->npages; ++i) dma_free_coherent(&dev->persist->pdev->dev, - chunk->mem[i].length, - lowmem_page_address(sg_page(&chunk->mem[i])), - sg_dma_address(&chunk->mem[i])); + chunk->buf[i].size, + chunk->buf[i].addr, + chunk->buf[i].dma_addr); } void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) @@ -111,22 +111,21 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, return 0; } -static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, - int order, gfp_t gfp_mask) +static int mlx4_alloc_icm_coherent(struct device *dev, struct mlx4_icm_buf *buf, + int order, gfp_t gfp_mask) { - void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, - &sg_dma_address(mem), gfp_mask); - if (!buf) + buf->addr = dma_alloc_coherent(dev, PAGE_SIZE << order, + &buf->dma_addr, gfp_mask); + if (!buf->addr) return -ENOMEM; - if (offset_in_page(buf)) { - dma_free_coherent(dev, PAGE_SIZE << order, - buf, sg_dma_address(mem)); + if (offset_in_page(buf->addr)) { + dma_free_coherent(dev, PAGE_SIZE << order, buf->addr, + buf->dma_addr); return -ENOMEM; } - sg_set_buf(mem, buf, PAGE_SIZE << order); - sg_dma_len(mem) = PAGE_SIZE << order; + buf->size = PAGE_SIZE << order; return 0; } @@ -159,21 +158,21 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, while (npages > 0) { if (!chunk) { - chunk = kmalloc_node(sizeof(*chunk), + chunk = kzalloc_node(sizeof(*chunk), gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), dev->numa_node); if (!chunk) { - chunk = kmalloc(sizeof(*chunk), + chunk = kzalloc(sizeof(*chunk), gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!chunk) goto fail; } + chunk->coherent = coherent; - sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN); - chunk->npages = 0; - chunk->nsg = 0; + if (!coherent) + sg_init_table(chunk->sg, MLX4_ICM_CHUNK_LEN); list_add_tail(&chunk->list, &icm->chunk_list); } @@ -186,10 +185,10 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev, - &chunk->mem[chunk->npages], - cur_order, mask); + &chunk->buf[chunk->npages], + cur_order, mask); else - ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], + ret = mlx4_alloc_icm_pages(&chunk->sg[chunk->npages], cur_order, mask, dev->numa_node); @@ -205,7 +204,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ++chunk->nsg; else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -220,7 +219,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } if (!coherent && chunk) { - chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -320,7 +319,7 @@ void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, u64 idx; struct mlx4_icm_chunk *chunk; struct mlx4_icm *icm; - struct page *page = NULL; + void *addr = NULL; if (!table->lowmem) return NULL; @@ -336,28 +335,49 @@ void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, list_for_each_entry(chunk, &icm->chunk_list, list) { for (i = 0; i < chunk->npages; ++i) { - if (dma_handle && dma_offset >= 0) { - if (sg_dma_len(&chunk->mem[i]) > dma_offset) - *dma_handle = sg_dma_address(&chunk->mem[i]) + - dma_offset; - dma_offset -= sg_dma_len(&chunk->mem[i]); + dma_addr_t dma_addr; + size_t len; + + if (table->coherent) { + len = chunk->buf[i].size; + dma_addr = chunk->buf[i].dma_addr; + addr = chunk->buf[i].addr; + } else { + struct page *page; + + len = sg_dma_len(&chunk->sg[i]); + dma_addr = sg_dma_address(&chunk->sg[i]); + + /* XXX: we should never do this for highmem + * allocation. This function either needs + * to be split, or the kernel virtual address + * return needs to be made optional. + */ + page = sg_page(&chunk->sg[i]); + addr = lowmem_page_address(page); } + + if (dma_handle && dma_offset >= 0) { + if (len > dma_offset) + *dma_handle = dma_addr + dma_offset; + dma_offset -= len; + } + /* * DMA mapping can merge pages but not split them, * so if we found the page, dma_handle has already * been assigned to. */ - if (chunk->mem[i].length > offset) { - page = sg_page(&chunk->mem[i]); + if (len > offset) goto out; - } - offset -= chunk->mem[i].length; + offset -= len; } } + addr = NULL; out: mutex_unlock(&table->mutex); - return page ? lowmem_page_address(page) + offset : NULL; + return addr ? addr + offset : NULL; } int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h index c9169a490557..d199874b1c07 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.h +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -47,11 +47,21 @@ enum { MLX4_ICM_PAGE_SIZE = 1 << MLX4_ICM_PAGE_SHIFT, }; +struct mlx4_icm_buf { + void *addr; + size_t size; + dma_addr_t dma_addr; +}; + struct mlx4_icm_chunk { struct list_head list; int npages; int nsg; - struct scatterlist mem[MLX4_ICM_CHUNK_LEN]; + bool coherent; + union { + struct scatterlist sg[MLX4_ICM_CHUNK_LEN]; + struct mlx4_icm_buf buf[MLX4_ICM_CHUNK_LEN]; + }; }; struct mlx4_icm { @@ -114,12 +124,18 @@ static inline void mlx4_icm_next(struct mlx4_icm_iter *iter) static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter) { - return sg_dma_address(&iter->chunk->mem[iter->page_idx]); + if (iter->chunk->coherent) + return iter->chunk->buf[iter->page_idx].dma_addr; + else + return sg_dma_address(&iter->chunk->sg[iter->page_idx]); } static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter) { - return sg_dma_len(&iter->chunk->mem[iter->page_idx]); + if (iter->chunk->coherent) + return iter->chunk->buf[iter->page_idx].size; + else + return sg_dma_len(&iter->chunk->sg[iter->page_idx]); } int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); From 01cd364a15f42575ef4aac8f82ff05516ea5da9a Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Thu, 3 Jan 2019 10:23:24 -0700 Subject: [PATCH 27/87] net/mlx4: replace pci_{,un}map_sg with dma_{,un}map_sg pci_{,un}map_sg are deprecated and replaced by dma_{,un}map_sg. This is especially relevant since the rest of the driver uses the DMA API. Fix the driver to use the replacement APIs. Signed-off-by: Stephen Warren Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/icm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 76b84d08a058..d89a3da89e5a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -57,8 +57,8 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu int i; if (chunk->nsg > 0) - pci_unmap_sg(dev->persist->pdev, chunk->sg, chunk->npages, - PCI_DMA_BIDIRECTIONAL); + dma_unmap_sg(&dev->persist->pdev->dev, chunk->sg, chunk->npages, + DMA_BIDIRECTIONAL); for (i = 0; i < chunk->npages; ++i) __free_pages(sg_page(&chunk->sg[i]), @@ -204,9 +204,9 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ++chunk->nsg; else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg, - chunk->npages, - PCI_DMA_BIDIRECTIONAL); + chunk->nsg = dma_map_sg(&dev->persist->pdev->dev, + chunk->sg, chunk->npages, + DMA_BIDIRECTIONAL); if (chunk->nsg <= 0) goto fail; @@ -219,9 +219,8 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } if (!coherent && chunk) { - chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg, - chunk->npages, - PCI_DMA_BIDIRECTIONAL); + chunk->nsg = dma_map_sg(&dev->persist->pdev->dev, chunk->sg, + chunk->npages, DMA_BIDIRECTIONAL); if (chunk->nsg <= 0) goto fail; From 0aaa81377c5a01f686bcdb8c7a6929a7bf330c68 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 4 Jan 2019 15:55:26 +0100 Subject: [PATCH 28/87] can: gw: ensure DLC boundaries after CAN frame modification Muyu Yu provided a POC where user root with CAP_NET_ADMIN can create a CAN frame modification rule that makes the data length code a higher value than the available CAN frame data size. In combination with a configured checksum calculation where the result is stored relatively to the end of the data (e.g. cgw_csum_xor_rel) the tail of the skb (e.g. frag_list pointer in skb_shared_info) can be rewritten which finally can cause a system crash. Michael Kubecek suggested to drop frames that have a DLC exceeding the available space after the modification process and provided a patch that can handle CAN FD frames too. Within this patch we also limit the length for the checksum calculations to the maximum of Classic CAN data length (8). CAN frames that are dropped by these additional checks are counted with the CGW_DELETED counter which indicates misconfigurations in can-gw rules. This fixes CVE-2019-3701. Reported-by: Muyu Yu Reported-by: Marcus Meissner Suggested-by: Michal Kubecek Tested-by: Muyu Yu Tested-by: Oliver Hartkopp Signed-off-by: Oliver Hartkopp Cc: linux-stable # >= v3.2 Signed-off-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- net/can/gw.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/net/can/gw.c b/net/can/gw.c index faa3da88a127..53859346dc9a 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -416,13 +416,29 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); - /* check for checksum updates when the CAN frame has been modified */ + /* Has the CAN frame been modified? */ if (modidx) { - if (gwj->mod.csumfunc.crc8) - (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + /* get available space for the processed CAN frame type */ + int max_len = nskb->len - offsetof(struct can_frame, data); + + /* dlc may have changed, make sure it fits to the CAN frame */ + if (cf->can_dlc > max_len) + goto out_delete; + + /* check for checksum updates in classic CAN length only */ + if (gwj->mod.csumfunc.crc8) { + if (cf->can_dlc > 8) + goto out_delete; + + (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + } + + if (gwj->mod.csumfunc.xor) { + if (cf->can_dlc > 8) + goto out_delete; - if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); + } } /* clear the skb timestamp if not configured the other way */ @@ -434,6 +450,14 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) gwj->dropped_frames++; else gwj->handled_frames++; + + return; + + out_delete: + /* delete frame due to misconfiguration */ + gwj->deleted_frames++; + kfree_skb(nskb); + return; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) From 10262b0b53666cbc506989b17a3ead1e9c3b43b4 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 6 Jan 2019 20:44:00 +0100 Subject: [PATCH 29/87] r8169: don't try to read counters if chip is in a PCI power-save state Avoid log spam caused by trying to read counters from the chip whilst it is in a PCI power-save state. Reference: https://bugzilla.kernel.org/show_bug.cgi?id=107421 Fixes: 1ef7286e7f36 ("r8169: Dereference MMIO address immediately before use") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 7fe9180261b8..784ae5001656 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -1681,11 +1681,13 @@ static bool rtl8169_reset_counters(struct rtl8169_private *tp) static bool rtl8169_update_counters(struct rtl8169_private *tp) { + u8 val = RTL_R8(tp, ChipCmd); + /* * Some chips are unable to dump tally counters when the receiver - * is disabled. + * is disabled. If 0xff chip may be in a PCI power-save state. */ - if ((RTL_R8(tp, ChipCmd) & CmdRxEnb) == 0) + if (!(val & CmdRxEnb) || val == 0xff) return true; return rtl8169_do_counters(tp, CounterDump); From eeb2c4fb6a3d0ebed35fbc13a255f691c8b8d7e5 Mon Sep 17 00:00:00 2001 From: Jacob Wen Date: Mon, 7 Jan 2019 09:59:59 +0800 Subject: [PATCH 30/87] rds: use DIV_ROUND_UP instead of ceil Yes indeed, DIV_ROUND_UP is in kernel.h. Signed-off-by: Jacob Wen Signed-off-by: David S. Miller --- net/rds/ib_send.c | 4 ++-- net/rds/message.c | 4 ++-- net/rds/rds.h | 4 ---- net/rds/send.c | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 2dcb555e6350..4e0c36acf866 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -522,7 +522,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) i = 1; else - i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc == 0) { @@ -879,7 +879,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->op_count, max_sge); + i = DIV_ROUND_UP(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { diff --git a/net/rds/message.c b/net/rds/message.c index f139420ba1f6..50f13f1d4ae0 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -341,7 +341,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in { struct rds_message *rm; unsigned int i; - int num_sgs = ceil(total_len, PAGE_SIZE); + int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); int ret; @@ -351,7 +351,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); if (!rm->data.op_sg) { rds_message_put(rm); diff --git a/net/rds/rds.h b/net/rds/rds.h index 02ec4a3b2799..4ffe100ff5e6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -48,10 +48,6 @@ void rdsdebug(char *fmt, ...) } #endif -/* XXX is there one of these somewhere? */ -#define ceil(x, y) \ - ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) - #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) diff --git a/net/rds/send.c b/net/rds/send.c index 3d822bad7de9..fd8b687d5c05 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1107,7 +1107,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) size_t total_payload_len = payload_len, rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); - int num_sgs = ceil(payload_len, PAGE_SIZE); + int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); int namelen; struct rds_iov_vector_arr vct; int ind; From a29c3c09bad18ac2c91dfdbcc892a7a37e407227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Sat, 5 Jan 2019 14:32:39 +0100 Subject: [PATCH 31/87] cdc_ether: trivial whitespace readability fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is unreadable enough without indenting mismatches and unnecessary line breaks. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index b3b3c05903a1..3305f23793c7 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -179,10 +179,8 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) * probed with) and a slave/data interface; union * descriptors sort this all out. */ - info->control = usb_ifnum_to_if(dev->udev, - info->u->bMasterInterface0); - info->data = usb_ifnum_to_if(dev->udev, - info->u->bSlaveInterface0); + info->control = usb_ifnum_to_if(dev->udev, info->u->bMasterInterface0); + info->data = usb_ifnum_to_if(dev->udev, info->u->bSlaveInterface0); if (!info->control || !info->data) { dev_dbg(&intf->dev, "master #%u/%p slave #%u/%p\n", @@ -216,18 +214,16 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) /* a data interface altsetting does the real i/o */ d = &info->data->cur_altsetting->desc; if (d->bInterfaceClass != USB_CLASS_CDC_DATA) { - dev_dbg(&intf->dev, "slave class %u\n", - d->bInterfaceClass); + dev_dbg(&intf->dev, "slave class %u\n", d->bInterfaceClass); goto bad_desc; } skip: - if ( rndis && - header.usb_cdc_acm_descriptor && - header.usb_cdc_acm_descriptor->bmCapabilities) { - dev_dbg(&intf->dev, - "ACM capabilities %02x, not really RNDIS?\n", - header.usb_cdc_acm_descriptor->bmCapabilities); - goto bad_desc; + if (rndis && header.usb_cdc_acm_descriptor && + header.usb_cdc_acm_descriptor->bmCapabilities) { + dev_dbg(&intf->dev, + "ACM capabilities %02x, not really RNDIS?\n", + header.usb_cdc_acm_descriptor->bmCapabilities); + goto bad_desc; } if (header.usb_cdc_ether_desc && info->ether->wMaxSegmentSize) { @@ -238,7 +234,7 @@ skip: } if (header.usb_cdc_mdlm_desc && - memcmp(header.usb_cdc_mdlm_desc->bGUID, mbm_guid, 16)) { + memcmp(header.usb_cdc_mdlm_desc->bGUID, mbm_guid, 16)) { dev_dbg(&intf->dev, "GUID doesn't match\n"); goto bad_desc; } @@ -302,7 +298,7 @@ skip: if (info->control->cur_altsetting->desc.bNumEndpoints == 1) { struct usb_endpoint_descriptor *desc; - dev->status = &info->control->cur_altsetting->endpoint [0]; + dev->status = &info->control->cur_altsetting->endpoint[0]; desc = &dev->status->desc; if (!usb_endpoint_is_int_in(desc) || (le16_to_cpu(desc->wMaxPacketSize) From f87d8ad9233f115db92c6c087d58403b0009ed36 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 5 Jan 2019 10:52:23 -0600 Subject: [PATCH 32/87] tipc: fix memory leak in tipc_nl_compat_publ_dump There is a memory leak in case genlmsg_put fails. Fix this by freeing *args* before return. Addresses-Coverity-ID: 1476406 ("Resource leak") Fixes: 46273cf7e009 ("tipc: fix a missing check of genlmsg_put") Signed-off-by: Gustavo A. R. Silva Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 40f5cae623a7..77e4b2418f30 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -904,8 +904,10 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); - if (!hdr) + if (!hdr) { + kfree_skb(args); return -EMSGSIZE; + } nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) { From 4c84edc11b76590859b1e45dd676074c59602dc4 Mon Sep 17 00:00:00 2001 From: JianJhen Chen Date: Sun, 6 Jan 2019 11:28:13 +0800 Subject: [PATCH 33/87] net: bridge: fix a bug on using a neighbour cache entry without checking its state When handling DNAT'ed packets on a bridge device, the neighbour cache entry from lookup was used without checking its state. It means that a cache entry in the NUD_STALE state will be used directly instead of entering the NUD_DELAY state to confirm the reachability of the neighbor. This problem becomes worse after commit 2724680bceee ("neigh: Keep neighbour cache entries if number of them is small enough."), since all neighbour cache entries in the NUD_STALE state will be kept in the neighbour table as long as the number of cache entries does not exceed the value specified in gc_thresh1. This commit validates the state of a neighbour cache entry before using the entry. Signed-off-by: JianJhen Chen Reviewed-by: JinLin Chen Signed-off-by: David S. Miller --- net/bridge/br_netfilter_hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index d21a23698410..c93c35bb73dd 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -265,7 +265,7 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; - if (neigh->hh.hh_len) { + if ((neigh->nud_state & NUD_CONNECTED) && neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; ret = br_handle_frame_finish(net, sk, skb); From 7acf8b36a2ac6eb4fdc53c4d862570089e56c69e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 7 Jan 2019 17:54:14 +0000 Subject: [PATCH 34/87] phy: ti: Fix compilation failures without REGMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver requires regmap or the compile fails: drivers/phy/ti/phy-gmii-sel.c:43:27: error: array type has incomplete element type ‘struct reg_field’ const struct reg_field (*regfields)[PHY_GMII_SEL_LAST]; Add it to kconfig. Signed-off-by: Jason Gunthorpe Signed-off-by: David S. Miller --- drivers/phy/ti/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/ti/Kconfig b/drivers/phy/ti/Kconfig index f137e0107764..c4709ed7fb0e 100644 --- a/drivers/phy/ti/Kconfig +++ b/drivers/phy/ti/Kconfig @@ -82,6 +82,7 @@ config PHY_TI_GMII_SEL default y if TI_CPSW=y depends on TI_CPSW || COMPILE_TEST select GENERIC_PHY + select REGMAP default m help This driver supports configuring of the TI CPSW Port mode depending on From 26d92e951fe0a44ee4aec157cabb65a818cc8151 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 5 Jan 2019 23:45:26 -0800 Subject: [PATCH 35/87] smc: move unhash as early as possible in smc_release() In smc_release() we release smc->clcsock before unhash the smc sock, but a parallel smc_diag_dump() may be still reading smc->clcsock, therefore this could cause a use-after-free as reported by syzbot. Reported-and-tested-by: syzbot+fbd1e5476e4c94c7b34e@syzkaller.appspotmail.com Fixes: 51f1de79ad8e ("net/smc: replace sock_put worker by socket refcounting") Cc: Ursula Braun Signed-off-by: Cong Wang Reported-by: syzbot+0bf2e01269f1274b4b03@syzkaller.appspotmail.com Reported-by: syzbot+e3132895630f957306bc@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4da4a78d369..c4e56602e0c6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -146,6 +146,9 @@ static int smc_release(struct socket *sock) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } + + sk->sk_prot->unhash(sk); + if (smc->clcsock) { if (smc->use_fallback && sk->sk_state == SMC_LISTEN) { /* wake up clcsock accept */ @@ -170,7 +173,6 @@ static int smc_release(struct socket *sock) smc_conn_free(&smc->conn); release_sock(sk); - sk->sk_prot->unhash(sk); sock_put(sk); /* final sock_put */ out: return rc; From a8911d6d5878587767a78c6bde371298ca2a3be3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 7 Jan 2019 09:46:46 -0800 Subject: [PATCH 36/87] selftests/bpf: fix incorrect users of create_and_get_cgroup We have some tests that assume create_and_get_cgroup returns -1 on error which is incorrect (it returns 0 on error). Since fd might be zero in general case, change create_and_get_cgroup to return -1 on error and fix the users that assume 0 on error. Fixes: f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper") Fixes: 7d2c6cfc5411 ("bpf: use --cgroup in test_suite if supplied") v2: - instead of fixing the uses that assume -1 on error, convert the users that assume 0 on error (fd might be zero in general case) Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- samples/bpf/test_cgrp2_attach2.c | 14 +++++++------- samples/bpf/test_current_task_under_cgroup_user.c | 2 +- tools/testing/selftests/bpf/cgroup_helpers.c | 6 +++--- tools/testing/selftests/bpf/test_cgroup_storage.c | 2 +- tools/testing/selftests/bpf/test_dev_cgroup.c | 2 +- tools/testing/selftests/bpf/test_netcnt.c | 2 +- .../selftests/bpf/test_skb_cgroup_id_user.c | 2 +- tools/testing/selftests/bpf/test_sock.c | 2 +- tools/testing/selftests/bpf/test_sock_addr.c | 2 +- tools/testing/selftests/bpf/test_socket_cookie.c | 2 +- tools/testing/selftests/bpf/test_tcpbpf_user.c | 2 +- tools/testing/selftests/bpf/test_tcpnotify_user.c | 2 +- 12 files changed, 20 insertions(+), 20 deletions(-) diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index d7b68ef5ba79..0bb6507256b7 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -77,7 +77,7 @@ static int test_foo_bar(void) /* Create cgroup /foo, get fd, and join it */ foo = create_and_get_cgroup(FOO); - if (!foo) + if (foo < 0) goto err; if (join_cgroup(FOO)) @@ -94,7 +94,7 @@ static int test_foo_bar(void) /* Create cgroup /foo/bar, get fd, and join it */ bar = create_and_get_cgroup(BAR); - if (!bar) + if (bar < 0) goto err; if (join_cgroup(BAR)) @@ -298,19 +298,19 @@ static int test_multiprog(void) goto err; cg1 = create_and_get_cgroup("/cg1"); - if (!cg1) + if (cg1 < 0) goto err; cg2 = create_and_get_cgroup("/cg1/cg2"); - if (!cg2) + if (cg2 < 0) goto err; cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); - if (!cg3) + if (cg3 < 0) goto err; cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); - if (!cg4) + if (cg4 < 0) goto err; cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); - if (!cg5) + if (cg5 < 0) goto err; if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index 2259f997a26c..f082d6ac59f0 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -32,7 +32,7 @@ int main(int argc, char **argv) cg2 = create_and_get_cgroup(CGROUP_PATH); - if (!cg2) + if (cg2 < 0) goto err; if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) { diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index cf16948aad4a..6692a40a6979 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -155,7 +155,7 @@ void cleanup_cgroup_environment(void) * This function creates a cgroup under the top level workdir and returns the * file descriptor. It is idempotent. * - * On success, it returns the file descriptor. On failure it returns 0. + * On success, it returns the file descriptor. On failure it returns -1. * If there is a failure, it prints the error to stderr. */ int create_and_get_cgroup(const char *path) @@ -166,13 +166,13 @@ int create_and_get_cgroup(const char *path) format_cgroup_path(cgroup_path, path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { log_err("mkdiring cgroup %s .. %s", path, cgroup_path); - return 0; + return -1; } fd = open(cgroup_path, O_RDONLY); if (fd < 0) { log_err("Opening Cgroup"); - return 0; + return -1; } return fd; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index f44834155f25..2fc4625c1a15 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -81,7 +81,7 @@ int main(int argc, char **argv) /* Create a cgroup, get fd, and join it */ cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (!cgroup_fd) { + if (cgroup_fd < 0) { printf("Failed to create test cgroup\n"); goto err; } diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index 9c8b50bac7e0..76e4993b7c16 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -43,7 +43,7 @@ int main(int argc, char **argv) /* Create a cgroup, get fd, and join it */ cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (!cgroup_fd) { + if (cgroup_fd < 0) { printf("Failed to create test cgroup\n"); goto err; } diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c index 44ed7f29f8ab..c1da5404454a 100644 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ b/tools/testing/selftests/bpf/test_netcnt.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) /* Create a cgroup, get fd, and join it */ cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (!cgroup_fd) { + if (cgroup_fd < 0) { printf("Failed to create test cgroup\n"); goto err; } diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c index c121cc59f314..9220747c069d 100644 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c @@ -164,7 +164,7 @@ int main(int argc, char **argv) goto err; cgfd = create_and_get_cgroup(CGROUP_PATH); - if (!cgfd) + if (cgfd < 0) goto err; if (join_cgroup(CGROUP_PATH)) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index b8ebe2f58074..561ffb6d6433 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -458,7 +458,7 @@ int main(int argc, char **argv) goto err; cgfd = create_and_get_cgroup(CG_PATH); - if (!cgfd) + if (cgfd < 0) goto err; if (join_cgroup(CG_PATH)) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index d94336cbd8bd..3f110eaaf29c 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -1442,7 +1442,7 @@ int main(int argc, char **argv) goto err; cgfd = create_and_get_cgroup(CG_PATH); - if (!cgfd) + if (cgfd < 0) goto err; if (join_cgroup(CG_PATH)) diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c index b6c2c605d8c0..fc7832ee566b 100644 --- a/tools/testing/selftests/bpf/test_socket_cookie.c +++ b/tools/testing/selftests/bpf/test_socket_cookie.c @@ -202,7 +202,7 @@ int main(int argc, char **argv) goto err; cgfd = create_and_get_cgroup(CG_PATH); - if (!cgfd) + if (cgfd < 0) goto err; if (join_cgroup(CG_PATH)) diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c index e6eebda7d112..716b4e3be581 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -103,7 +103,7 @@ int main(int argc, char **argv) goto err; cg_fd = create_and_get_cgroup(cg_path); - if (!cg_fd) + if (cg_fd < 0) goto err; if (join_cgroup(cg_path)) diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index ff3c4522aed6..4e4353711a86 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -115,7 +115,7 @@ int main(int argc, char **argv) goto err; cg_fd = create_and_get_cgroup(cg_path); - if (!cg_fd) + if (cg_fd < 0) goto err; if (join_cgroup(cg_path)) From a769fa7208b94f37b6240215dc6970f9d76fc58c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 7 Jan 2019 22:57:17 +0100 Subject: [PATCH 37/87] bpf, doc: update design qa to reflect kern_version requirement Update the bpf_design_QA.rst to also reflect recent changes in 6c4fc209fcf9 ("bpf: remove useless version check for prog load"). Suggested-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_design_QA.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 6780a6d81745..7cc9e368c1e9 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -157,12 +157,11 @@ Q: Does BPF have a stable ABI? ------------------------------ A: YES. BPF instructions, arguments to BPF programs, set of helper functions and their arguments, recognized return codes are all part -of ABI. However when tracing programs are using bpf_probe_read() helper -to walk kernel internal datastructures and compile with kernel -internal headers these accesses can and will break with newer -kernels. The union bpf_attr -> kern_version is checked at load time -to prevent accidentally loading kprobe-based bpf programs written -for a different kernel. Networking programs don't do kern_version check. +of ABI. However there is one specific exception to tracing programs +which are using helpers like bpf_probe_read() to walk kernel internal +data structures and compile with kernel internal headers. Both of these +kernel internals are subject to change and can break with newer kernels +such that the program needs to be adapted accordingly. Q: How much stack space a BPF program uses? ------------------------------------------- From 80f21ff987eb377140d27102285f8dd1167b335c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 7 Jan 2019 22:57:18 +0100 Subject: [PATCH 38/87] bpf, doc: add note for libbpf's stand-alone build Given this came up couple of times, add a note to libbpf's readme about the semi-automated mirror for a stand-alone build which is officially managed by BPF folks. While at it, also explicitly state the libbpf license in the readme file. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/README.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index 056f38310722..607aae40f4ed 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -132,6 +132,20 @@ For example, if current state of ``libbpf.map`` is: Format of version script and ways to handle ABI changes, including incompatible ones, described in details in [1]. +Stand-alone build +================= + +Under https://github.com/libbpf/libbpf there is a (semi-)automated +mirror of the mainline's version of libbpf for a stand-alone build. + +However, all changes to libbpf's code base must be upstreamed through +the mainline kernel tree. + +License +======= + +libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. + Links ===== From 895ac1376d5abcb94ca1b70a595579f253237790 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 7 Jan 2019 16:22:29 +0100 Subject: [PATCH 39/87] ptp: check that rsv field is zero in struct ptp_sys_offset_extended Otherwise it is impossible to use it for something else, as it will break userspace that puts garbage there. The same check should be done in other structures, but the fact that data in reserved fields is ignored is already part of the kernel ABI. Signed-off-by: Eugene Syromiatnikov Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 797fab33bb98..7cbea796652a 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -224,7 +224,8 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) extoff = NULL; break; } - if (extoff->n_samples > PTP_MAX_SAMPLES) { + if (extoff->n_samples > PTP_MAX_SAMPLES + || extoff->rsv[0] || extoff->rsv[1] || extoff->rsv[2]) { err = -EINVAL; break; } From b7ea4894aa867aaf1c31bfb4b00a3c3e38eedf95 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 7 Jan 2019 16:22:38 +0100 Subject: [PATCH 40/87] ptp: uapi: change _IOW to IOWR in PTP_SYS_OFFSET_EXTENDED definition The ioctl command is read/write (or just read, if the fact that user space writes n_samples field is ignored). Signed-off-by: Eugene Syromiatnikov Signed-off-by: David S. Miller --- include/uapi/linux/ptp_clock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index d73d83950265..1bc794ad957a 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -147,7 +147,7 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET_PRECISE \ _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise) #define PTP_SYS_OFFSET_EXTENDED \ - _IOW(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended) + _IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ From a0071840d2040ea1b27e5a008182b09b88defc15 Mon Sep 17 00:00:00 2001 From: Bryan Whitehead Date: Mon, 7 Jan 2019 14:00:09 -0500 Subject: [PATCH 41/87] lan743x: Remove phy_read from link status change function It has been noticed that some phys do not have the registers required by the previous implementation. To fix this, instead of using phy_read, the required information is extracted from the phy_device structure. fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Bryan Whitehead Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan743x_main.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 20c9377e99cb..310807ef328b 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -962,13 +962,10 @@ static void lan743x_phy_link_status_change(struct net_device *netdev) memset(&ksettings, 0, sizeof(ksettings)); phy_ethtool_get_link_ksettings(netdev, &ksettings); - local_advertisement = phy_read(phydev, MII_ADVERTISE); - if (local_advertisement < 0) - return; - - remote_advertisement = phy_read(phydev, MII_LPA); - if (remote_advertisement < 0) - return; + local_advertisement = + linkmode_adv_to_mii_adv_t(phydev->advertising); + remote_advertisement = + linkmode_adv_to_mii_adv_t(phydev->lp_advertising); lan743x_phy_update_flowcontrol(adapter, ksettings.base.duplex, From 11287b693d03830010356339e4ceddf47dee34fa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 7 Jan 2019 21:49:09 +0100 Subject: [PATCH 42/87] r8169: load Realtek PHY driver module before r8169 This soft dependency works around an issue where sometimes the genphy driver is used instead of the dedicated PHY driver. The root cause of the issue isn't clear yet. People reported the unloading/re-loading module r8169 helps, and also configuring this soft dependency in the modprobe config files. Important just seems to be that the realtek module is loaded before r8169. Once this has been applied preliminary fix 38af4b903210 ("net: phy: add workaround for issue where PHY driver doesn't bind to the device") will be removed. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 784ae5001656..abb94c543aa2 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -708,6 +708,7 @@ module_param(use_dac, int, 0); MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot."); module_param_named(debug, debug.msg_enable, int, 0); MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)"); +MODULE_SOFTDEP("pre: realtek"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(FIRMWARE_8168D_1); MODULE_FIRMWARE(FIRMWARE_8168D_2); From ff0db43cd6c530ff944773ccf48ece55d32d0c22 Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Tue, 8 Jan 2019 16:48:03 +0000 Subject: [PATCH 43/87] mlxsw: spectrum_acl: Add cleanup after C-TCAM update error condition When writing to C-TCAM, mlxsw driver uses cregion->ops->entry_insert(). In case of C-TCAM HW insertion error, the opposite action should take place. Add error handling case in which the C-TCAM region entry is removed, by calling cregion->ops->entry_remove(). Fixes: a0a777b9409f ("mlxsw: spectrum_acl: Start using A-TCAM") Signed-off-by: Nir Dotan Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c index b0f2d8e8ded0..ac222833a5cf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c @@ -72,7 +72,15 @@ mlxsw_sp_acl_ctcam_region_entry_insert(struct mlxsw_sp *mlxsw_sp, act_set = mlxsw_afa_block_first_set(rulei->act_block); mlxsw_reg_ptce2_flex_action_set_memcpy_to(ptce2_pl, act_set); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl); + if (err) + goto err_ptce2_write; + + return 0; + +err_ptce2_write: + cregion->ops->entry_remove(cregion, centry); + return err; } static void From 04d075b7aa8433dc3c98e14c571705effc318cd8 Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Tue, 8 Jan 2019 16:48:04 +0000 Subject: [PATCH 44/87] mlxsw: spectrum_acl: Remove ASSERT_RTNL()s in module removal flow Removal of the mlxsw driver on Spectrum-2 platforms hits an ASSERT_RTNL() in Spectrum-2 ACL Bloom filter and in ERP removal paths. This happens because the multicast router implementation in Spectrum-2 relies on ACLs. Taking the RTNL lock upon driver removal is useless since the driver first removes its ports and unregisters from notifiers so concurrent writes cannot happen at that time. The assertions were originally put as a reminder for future work involving ERP background optimization, but having these assertions only during addition serves this purpose as well. Therefore remove the ASSERT_RTNL() in both places related to ERP and Bloom filter removal. Fixes: cf7221a4f5a5 ("mlxsw: spectrum_router: Add Multicast routing support for Spectrum-2") Signed-off-by: Nir Dotan Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c index 1c19feefa5f2..2941967e1cc5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c @@ -1022,7 +1022,6 @@ void mlxsw_sp_acl_erp_mask_put(struct mlxsw_sp_acl_atcam_region *aregion, { struct objagg_obj *objagg_obj = (struct objagg_obj *) erp_mask; - ASSERT_RTNL(); objagg_obj_put(aregion->erp_table->objagg, objagg_obj); } @@ -1054,7 +1053,6 @@ void mlxsw_sp_acl_erp_bf_remove(struct mlxsw_sp *mlxsw_sp, const struct mlxsw_sp_acl_erp *erp = objagg_obj_root_priv(objagg_obj); unsigned int erp_bank; - ASSERT_RTNL(); if (!mlxsw_sp_acl_erp_table_is_used(erp->erp_table)) return; From 8adbe212a159d9c78a90fca1d854f6e63452426b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Jan 2019 16:48:05 +0000 Subject: [PATCH 45/87] mlxsw: spectrum: Disable lag port TX before removing it Make sure that lag port TX is disabled before mlxsw_sp_port_lag_leave() is called and prevent from possible EMAD error. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index eed1045e4d96..32519c93df17 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -5005,12 +5005,15 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, lower_dev, upper_dev); } else if (netif_is_lag_master(upper_dev)) { - if (info->linking) + if (info->linking) { err = mlxsw_sp_port_lag_join(mlxsw_sp_port, upper_dev); - else + } else { + mlxsw_sp_port_lag_tx_en_set(mlxsw_sp_port, + false); mlxsw_sp_port_lag_leave(mlxsw_sp_port, upper_dev); + } } else if (netif_is_ovs_master(upper_dev)) { if (info->linking) err = mlxsw_sp_port_ovs_join(mlxsw_sp_port); From 143a8e038ac599ca73c6354c8af6a8fdeee9fa7d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:06 +0000 Subject: [PATCH 46/87] mlxsw: spectrum: Add VXLAN dependency for spectrum When VXLAN is a loadable module, MLXSW_SPECTRUM must not be built-in: drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:2547: undefined reference to `vxlan_fdb_find_uc' Add Kconfig dependency to enforce usable configurations. Fixes: 1231e04f5bba ("mlxsw: spectrum_switchdev: Add support for VxLAN encapsulation") Signed-off-by: Ido Schimmel Reported-by: kbuild test robot Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig index 080ddd1942ec..b9a25aed5d11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Kconfig +++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig @@ -78,6 +78,7 @@ config MLXSW_SPECTRUM depends on IPV6 || IPV6=n depends on NET_IPGRE || NET_IPGRE=n depends on IPV6_GRE || IPV6_GRE=n + depends on VXLAN || VXLAN=n select GENERIC_ALLOCATOR select PARMAN select OBJAGG From 457e20d659247a09524e276aed46f19d853701d0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:07 +0000 Subject: [PATCH 47/87] mlxsw: spectrum_switchdev: Avoid returning errors in commit phase Drivers are not supposed to return errors in switchdev commit phase if they returned OK in prepare phase. Otherwise, a WARNING is emitted. However, when the offloading of a VXLAN tunnel is triggered by the addition of a VLAN on a local port, it is not possible to guarantee that the commit phase will succeed without doing a lot of work. In these cases, the artificial division between prepare and commit phase does not make sense, so simply do the work in the prepare phase. Fixes: d70e42b22dd4 ("mlxsw: spectrum: Enable VxLAN enslavement to VLAN-aware bridges") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 1bd2c6e15f8d..e8ce2307352b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1078,8 +1078,7 @@ static int mlxsw_sp_bridge_port_vlan_add(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_port *bridge_port, u16 vid, bool is_untagged, bool is_pvid, - struct netlink_ext_ack *extack, - struct switchdev_trans *trans) + struct netlink_ext_ack *extack) { u16 pvid = mlxsw_sp_port_pvid_determine(mlxsw_sp_port, vid, is_pvid); struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; @@ -1095,9 +1094,6 @@ mlxsw_sp_bridge_port_vlan_add(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_sp_port_vlan->bridge_port != bridge_port) return -EEXIST; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (!mlxsw_sp_port_vlan) { mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_create(mlxsw_sp_port, vid); @@ -1188,6 +1184,9 @@ static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port, return err; } + if (switchdev_trans_ph_commit(trans)) + return 0; + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); if (WARN_ON(!bridge_port)) return -EINVAL; @@ -1200,7 +1199,7 @@ static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port, err = mlxsw_sp_bridge_port_vlan_add(mlxsw_sp_port, bridge_port, vid, flag_untagged, - flag_pvid, extack, trans); + flag_pvid, extack); if (err) return err; } @@ -3207,7 +3206,6 @@ mlxsw_sp_switchdev_vxlan_vlan_add(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_bridge_device *bridge_device, const struct net_device *vxlan_dev, u16 vid, bool flag_untagged, bool flag_pvid, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(vxlan_dev); @@ -3225,9 +3223,6 @@ mlxsw_sp_switchdev_vxlan_vlan_add(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_bridge_8021q_vxlan_dev_find(bridge_device->dev, vid)) return -EINVAL; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (!netif_running(vxlan_dev)) return 0; @@ -3345,6 +3340,9 @@ mlxsw_sp_switchdev_vxlan_vlans_add(struct net_device *vxlan_dev, port_obj_info->handled = true; + if (switchdev_trans_ph_commit(trans)) + return 0; + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); if (!bridge_device) return -EINVAL; @@ -3358,8 +3356,7 @@ mlxsw_sp_switchdev_vxlan_vlans_add(struct net_device *vxlan_dev, err = mlxsw_sp_switchdev_vxlan_vlan_add(mlxsw_sp, bridge_device, vxlan_dev, vid, flag_untagged, - flag_pvid, trans, - extack); + flag_pvid, extack); if (err) return err; } From 412283eedc138381e24d8c1304a3c5b298c503c4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:08 +0000 Subject: [PATCH 48/87] mlxsw: spectrum_nve: Replace error code with EINVAL Adding a VLAN on a port can trigger the offload of a VXLAN tunnel which is already a member in the VLAN. In case the configuration of the VXLAN is not supported, the driver would return -EOPNOTSUPP. This is problematic since bridge code does not interpret this as error, but rather that it should try to setup the VLAN using the 8021q driver instead of switchdev. Fixes: d70e42b22dd4 ("mlxsw: spectrum: Enable VxLAN enslavement to VLAN-aware bridges") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c index 0a31fff2516e..fb1c48c698f2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c @@ -816,14 +816,14 @@ int mlxsw_sp_nve_fid_enable(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fid *fid, ops = nve->nve_ops_arr[params->type]; if (!ops->can_offload(nve, params->dev, extack)) - return -EOPNOTSUPP; + return -EINVAL; memset(&config, 0, sizeof(config)); ops->nve_config(nve, params->dev, &config); if (nve->num_nve_tunnels && memcmp(&config, &nve->config, sizeof(config))) { NL_SET_ERR_MSG_MOD(extack, "Conflicting NVE tunnels configuration"); - return -EOPNOTSUPP; + return -EINVAL; } err = mlxsw_sp_nve_tunnel_init(mlxsw_sp, &config); From 16dc42e4511172aa3a4d49931597582c9ff78843 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:10 +0000 Subject: [PATCH 49/87] selftests: mlxsw: Add a test case for VLAN addition error flow Add a test case for the issue fixed by previous commit. In case the offloading of an unsupported VxLAN tunnel was triggered by adding the mapped VLAN to a local port, then error should be returned to the user. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/vxlan.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh index dcf9f4e913e0..ae6146ec5afd 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh @@ -847,6 +847,24 @@ sanitization_vlan_aware_test() log_test "vlan-aware - failed enslavement to vlan-aware bridge" + bridge vlan del vid 10 dev vxlan20 + bridge vlan add vid 20 dev vxlan20 pvid untagged + + # Test that offloading of an unsupported tunnel fails when it is + # triggered by addition of VLAN to a local port + RET=0 + + # TOS must be set to inherit + ip link set dev vxlan10 type vxlan tos 42 + + ip link set dev $swp1 master br0 + bridge vlan add vid 10 dev $swp1 &> /dev/null + check_fail $? + + log_test "vlan-aware - failed vlan addition to a local port" + + ip link set dev vxlan10 type vxlan tos inherit + ip link del dev vxlan20 ip link del dev vxlan10 ip link del dev br0 From 279737939a8194f02fa352ab4476a1b241f44ef4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:11 +0000 Subject: [PATCH 50/87] net: bridge: Fix VLANs memory leak When adding / deleting VLANs to / from a bridge port, the bridge driver first tries to propagate the information via switchdev and falls back to the 8021q driver in case the underlying driver does not support switchdev. This can result in a memory leak [1] when VXLAN and mlxsw ports are enslaved to the bridge: $ ip link set dev vxlan0 master br0 # No mlxsw ports are enslaved to 'br0', so mlxsw ignores the switchdev # notification and the bridge driver adds the VLAN on 'vxlan0' via the # 8021q driver $ bridge vlan add vid 10 dev vxlan0 pvid untagged # mlxsw port is enslaved to the bridge $ ip link set dev swp1 master br0 # mlxsw processes the switchdev notification and the 8021q driver is # skipped $ bridge vlan del vid 10 dev vxlan0 This results in 'struct vlan_info' and 'struct vlan_vid_info' being leaked, as they were allocated by the 8021q driver during VLAN addition, but never freed as the 8021q driver was skipped during deletion. Fix this by introducing a new VLAN private flag that indicates whether the VLAN was added on the port by switchdev or the 8021q driver. If the VLAN was added by the 8021q driver, then we make sure to delete it via the 8021q driver as well. [1] unreferenced object 0xffff88822d20b1e8 (size 256): comm "bridge", pid 2532, jiffies 4295216998 (age 1188.830s) hex dump (first 32 bytes): e0 42 97 ce 81 88 ff ff 00 00 00 00 00 00 00 00 .B.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f82d851d>] kmem_cache_alloc_trace+0x1be/0x330 [<00000000e0178b02>] vlan_vid_add+0x661/0x920 [<00000000218ebd5f>] __vlan_add+0x1be9/0x3a00 [<000000006eafa1ca>] nbp_vlan_add+0x8b3/0xd90 [<000000003535392c>] br_vlan_info+0x132/0x410 [<00000000aedaa9dc>] br_afspec+0x75c/0x870 [<00000000f5716133>] br_setlink+0x3dc/0x6d0 [<00000000aceca5e2>] rtnl_bridge_setlink+0x615/0xb30 [<00000000a2f2d23e>] rtnetlink_rcv_msg+0x3a3/0xa80 [<0000000064097e69>] netlink_rcv_skb+0x152/0x3c0 [<000000008be8d614>] rtnetlink_rcv+0x21/0x30 [<000000009ab2ca25>] netlink_unicast+0x52f/0x740 [<00000000e7d9ac96>] netlink_sendmsg+0x9c7/0xf50 [<000000005d1e2050>] sock_sendmsg+0xbe/0x120 [<00000000d51426bc>] ___sys_sendmsg+0x778/0x8f0 [<00000000b9d7b2cc>] __sys_sendmsg+0x112/0x270 unreferenced object 0xffff888227454308 (size 32): comm "bridge", pid 2532, jiffies 4295216998 (age 1188.882s) hex dump (first 32 bytes): 88 b2 20 2d 82 88 ff ff 88 b2 20 2d 82 88 ff ff .. -...... -.... 81 00 0a 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f82d851d>] kmem_cache_alloc_trace+0x1be/0x330 [<0000000018050631>] vlan_vid_add+0x3e6/0x920 [<00000000218ebd5f>] __vlan_add+0x1be9/0x3a00 [<000000006eafa1ca>] nbp_vlan_add+0x8b3/0xd90 [<000000003535392c>] br_vlan_info+0x132/0x410 [<00000000aedaa9dc>] br_afspec+0x75c/0x870 [<00000000f5716133>] br_setlink+0x3dc/0x6d0 [<00000000aceca5e2>] rtnl_bridge_setlink+0x615/0xb30 [<00000000a2f2d23e>] rtnetlink_rcv_msg+0x3a3/0xa80 [<0000000064097e69>] netlink_rcv_skb+0x152/0x3c0 [<000000008be8d614>] rtnetlink_rcv+0x21/0x30 [<000000009ab2ca25>] netlink_unicast+0x52f/0x740 [<00000000e7d9ac96>] netlink_sendmsg+0x9c7/0xf50 [<000000005d1e2050>] sock_sendmsg+0xbe/0x120 [<00000000d51426bc>] ___sys_sendmsg+0x778/0x8f0 [<00000000b9d7b2cc>] __sys_sendmsg+0x112/0x270 Fixes: d70e42b22dd4 ("mlxsw: spectrum: Enable VxLAN enslavement to VLAN-aware bridges") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 1 + net/bridge/br_vlan.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d240b3e7919f..eabf8bf28a3f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -107,6 +107,7 @@ struct br_tunnel_info { /* private vlan flags */ enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), + BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), }; /** diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 4a2f31157ef5..96abf8feb9dc 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -80,16 +80,18 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, - u16 vid, u16 flags, struct netlink_ext_ack *extack) + struct net_bridge_vlan *v, u16 flags, + struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, vid, flags, extack); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err == -EOPNOTSUPP) - return vlan_vid_add(dev, br->vlan_proto, vid); + return vlan_vid_add(dev, br->vlan_proto, v->vid); + v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } @@ -121,19 +123,17 @@ static void __vlan_del_list(struct net_bridge_vlan *v) } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) + const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ - err = br_switchdev_port_vlan_del(dev, vid); - if (err == -EOPNOTSUPP) { - vlan_vid_del(dev, br->vlan_proto, vid); - return 0; - } - return err; + err = br_switchdev_port_vlan_del(dev, v->vid); + if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) + vlan_vid_del(dev, br->vlan_proto, v->vid); + return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases a @@ -242,7 +242,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = __vlan_vid_add(dev, br, v->vid, flags, extack); + err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; @@ -305,7 +305,7 @@ out_fdb_insert: out_filt: if (p) { - __vlan_vid_del(dev, br, v->vid); + __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); @@ -338,7 +338,7 @@ static int __vlan_del(struct net_bridge_vlan *v) __vlan_delete_pvid(vg, v->vid); if (p) { - err = __vlan_vid_del(p->dev, p->br, v->vid); + err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { From 289fb44df4e4c3fe977ee4b451e1d11e10227654 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:12 +0000 Subject: [PATCH 51/87] selftests: forwarding: Fix test for different devices When running the test on the Spectrum ASIC the generated packets are counted on the ingress filter and injected back to the pipeline because of the 'pass' action. The router block then drops the packets due to checksum error, as the test generates packets with zero checksum. When running the test on an emulator that is not as strict about checksum errors the test fails since packets are counted twice. Once by the emulated ASIC on its ingress filter and again by the kernel as the emulator does not perform checksum validation and allows the packets to be trapped by a matching host route. Fix this by changing the action to 'drop', which will prevent the packet from continuing further in the pipeline to the router block. For veth pairs this change is essentially a NOP given packets are only processed once (by the kernel). Fixes: a0b61f3d8ebf ("selftests: forwarding: vxlan_bridge_1d: Add an ECN decap test") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 56cef3b1c194..bb10e33690b2 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -629,7 +629,7 @@ __test_ecn_decap() RET=0 tc filter add dev $h1 ingress pref 77 prot ip \ - flower ip_tos $decapped_tos action pass + flower ip_tos $decapped_tos action drop sleep 1 vxlan_encapped_ping_test v2 v1 192.0.2.17 \ $orig_inner_tos $orig_outer_tos \ From 674bed5df4cab8f96d04f7b99608883a48f9226b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:13 +0000 Subject: [PATCH 52/87] mlxsw: spectrum_switchdev: Set PVID correctly during VLAN deletion When a VLAN is deleted from a bridge port we should not change the PVID unless the deleted VLAN is the PVID. Fixes: fe9ccc785de5 ("mlxsw: spectrum_switchdev: Don't batch VLAN operations") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index e8ce2307352b..0abbaa0fbf14 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1807,7 +1807,7 @@ static void mlxsw_sp_bridge_port_vlan_del(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_port *bridge_port, u16 vid) { - u16 pvid = mlxsw_sp_port->pvid == vid ? 0 : vid; + u16 pvid = mlxsw_sp_port->pvid == vid ? 0 : mlxsw_sp_port->pvid; struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); From 4fabf3bf93a194c7fa5288da3e0af37e4b943cf3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:14 +0000 Subject: [PATCH 53/87] selftests: forwarding: Add a test for VLAN deletion Add a VLAN on a bridge port, delete it and make sure the PVID VLAN is not affected. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/net/forwarding/bridge_vlan_aware.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index d8313d0438b7..04c6431b2bd8 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -96,6 +96,19 @@ flooding() flood_test $swp2 $h1 $h2 } +vlan_deletion() +{ + # Test that the deletion of a VLAN on a bridge port does not affect + # the PVID VLAN + log_info "Add and delete a VLAN on bridge port $swp1" + + bridge vlan add vid 10 dev $swp1 + bridge vlan del vid 10 dev $swp1 + + ping_ipv4 + ping_ipv6 +} + trap cleanup EXIT setup_prepare From d972f3dce8d161e2142da0ab1ef25df00e2f21a9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Jan 2019 23:27:06 +0000 Subject: [PATCH 54/87] packet: Do not leak dev refcounts on error exit 'dev' is non NULL when the addr_len check triggers so it must goto a label that does the dev_put otherwise dev will have a leaked refcount. This bug causes the ib_ipoib module to become unloadable when using systemd-network as it triggers this check on InfiniBand links. Fixes: 99137b7888f4 ("packet: validate address length") Reported-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index eedacdebcd4c..d0945253f43b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2628,7 +2628,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out; + goto out_put; } err = -ENXIO; @@ -2828,7 +2828,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out; + goto out_unlock; } err = -ENXIO; From 31aa6503a15ba00182ea6dbbf51afb63bf9e851d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 8 Jan 2019 18:12:24 -0800 Subject: [PATCH 55/87] bpf: correctly set initial window on active Fast Open sender The existing BPF TCP initial congestion window (TCP_BPF_IW) does not to work on (active) Fast Open sender. This is because it changes the (initial) window only if data_segs_out is zero -- but data_segs_out is also incremented on SYN-data. This patch fixes the issue by proerly accounting for SYN-data additionally. Fixes: fc7478103c84 ("bpf: Adds support for setting initial cwnd") Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 447dd1bad31f..2b3b436ef545 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4203,7 +4203,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > 0) + if (val <= 0 || tp->data_segs_out > tp->syn_data) ret = -EINVAL; else tp->snd_cwnd = val; From 0b7959b6257322f7693b08a459c505d4938646f2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 7 Jan 2019 13:38:38 -0800 Subject: [PATCH 56/87] tun: publish tfile after it's fully initialized BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1 Call Trace: ? napi_gro_frags+0xa7/0x2c0 tun_get_user+0xb50/0xf20 tun_chr_write_iter+0x53/0x70 new_sync_write+0xff/0x160 vfs_write+0x191/0x1e0 __x64_sys_write+0x5e/0xd0 do_syscall_64+0x47/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 I think there is a subtle race between sending a packet via tap and attaching it: CPU0: CPU1: tun_chr_ioctl(TUNSETIFF) tun_set_iff tun_attach rcu_assign_pointer(tfile->tun, tun); tun_fops->write_iter() tun_chr_write_iter tun_napi_alloc_frags napi_get_frags napi->skb = napi_alloc_skb tun_napi_init netif_napi_add napi->skb = NULL napi->skb is NULL here napi_gro_frags napi_frags_skb skb = napi->skb skb_reset_mac_header(skb) panic() Move rcu_assign_pointer(tfile->tun) and rcu_assign_pointer(tun->tfiles) to be the last thing we do in tun_attach(); this should guarantee that when we call tun_get() we always get an initialized object. v2 changes: * remove extra napi_mutex locks/unlocks for napi operations Reported-by: syzbot Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Stanislav Fomichev Signed-off-by: David S. Miller --- drivers/net/tun.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a4fdad475594..18656c4094b3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -856,10 +856,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file, err = 0; } - rcu_assign_pointer(tfile->tun, tun); - rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); - tun->numqueues++; - if (tfile->detached) { tun_enable_queue(tfile); } else { @@ -876,6 +872,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, * refcnt. */ + /* Publish tfile->tun and tun->tfiles only after we've fully + * initialized tfile; otherwise we risk using half-initialized + * object. + */ + rcu_assign_pointer(tfile->tun, tun); + rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); + tun->numqueues++; out: return err; } From 4a06fa67c4da20148803525151845276cdb995c1 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 7 Jan 2019 16:47:33 -0500 Subject: [PATCH 57/87] ip: on queued skb use skb_header_pointer instead of pskb_may_pull Commit 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") avoided a read beyond the end of the skb linear segment by calling pskb_may_pull. That function can trigger a BUG_ON in pskb_expand_head if the skb is shared, which it is when when peeking. It can also return ENOMEM. Avoid both by switching to safer skb_header_pointer. Fixes: 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") Reported-by: syzbot Suggested-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 12 +++++------- net/ipv6/datagram.c | 10 ++++------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fffcc130900e..82f341e84fae 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -148,19 +148,17 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { + __be16 _ports[2], *ports; struct sockaddr_in sin; - __be16 *ports; - int end; - - end = skb_transport_offset(skb) + 4; - if (end > 0 && !pskb_may_pull(skb, end)) - return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ - ports = (__be16 *)skb_transport_header(skb); + ports = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_ports), &_ports); + if (!ports) + return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index bde08aa549f3..c2262a7e2088 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -700,17 +700,15 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; - __be16 *ports; - int end; + __be16 _ports[2], *ports; - end = skb_transport_offset(skb) + 4; - if (end <= 0 || pskb_may_pull(skb, end)) { + ports = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_ports), &_ports); + if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ - ports = (__be16 *)skb_transport_header(skb); - sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; From 2acc0abc882ac3be47719e189f3db006493ab640 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Jan 2019 23:39:24 +0000 Subject: [PATCH 58/87] net: cxgb3: fix various indentation issues There are handful of lines that have indentation issues, fix these. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/sge.c | 12 +++++++----- drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 20b6e1b3f5e3..6a80b44f2e71 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -2381,7 +2381,7 @@ no_mem: lro_add_page(adap, qs, fl, G_RSPD_LEN(len), flags & F_RSPD_EOP); - goto next_fl; + goto next_fl; } skb = get_packet_pg(adap, fl, q, @@ -3214,11 +3214,13 @@ void t3_start_sge_timers(struct adapter *adap) for (i = 0; i < SGE_QSETS; ++i) { struct sge_qset *q = &adap->sge.qs[i]; - if (q->tx_reclaim_timer.function) - mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD); + if (q->tx_reclaim_timer.function) + mod_timer(&q->tx_reclaim_timer, + jiffies + TX_RECLAIM_PERIOD); - if (q->rx_reclaim_timer.function) - mod_timer(&q->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD); + if (q->rx_reclaim_timer.function) + mod_timer(&q->rx_reclaim_timer, + jiffies + RX_RECLAIM_PERIOD); } } diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index 080918af773c..0a9f2c596624 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -1082,7 +1082,7 @@ int t3_check_fw_version(struct adapter *adapter) CH_WARN(adapter, "found newer FW version(%u.%u), " "driver compiled for version %u.%u\n", major, minor, FW_VERSION_MAJOR, FW_VERSION_MINOR); - return 0; + return 0; } return -EINVAL; } @@ -3619,7 +3619,7 @@ int t3_reset_adapter(struct adapter *adapter) static int init_parity(struct adapter *adap) { - int i, err, addr; + int i, err, addr; if (t3_read_reg(adap, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) return -EBUSY; @@ -3806,6 +3806,6 @@ int t3_replay_prep_adapter(struct adapter *adapter) p->phy.ops->power_down(&p->phy, 1); } -return 0; + return 0; } From fd21c89b876565df76051eca395018de2280f8e1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Jan 2019 23:48:09 +0000 Subject: [PATCH 59/87] net: cxgb4: fix various indentation issues There are some lines that have indentation issues, fix these. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c | 8 ++++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c index 9f9d6cae39d5..58a039c3224a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c @@ -378,10 +378,10 @@ static void cxgb4_init_ptp_timer(struct adapter *adapter) int err; memset(&c, 0, sizeof(c)); - c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | - FW_CMD_REQUEST_F | - FW_CMD_WRITE_F | - FW_PTP_CMD_PORTID_V(0)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); c.u.scmd.sc = FW_PTP_SC_INIT_TIMER; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 9a6065a3fa46..c041f44324db 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -78,7 +78,7 @@ static void free_msix_idx_in_bmap(struct adapter *adap, unsigned int msix_idx) unsigned long flags; spin_lock_irqsave(&bmap->lock, flags); - __clear_bit(msix_idx, bmap->msix_bmap); + __clear_bit(msix_idx, bmap->msix_bmap); spin_unlock_irqrestore(&bmap->lock, flags); } diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index e8c34292a0ec..2b03f6187a24 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3794,7 +3794,7 @@ int t4_load_phy_fw(struct adapter *adap, /* If we have version number support, then check to see if the adapter * already has up-to-date PHY firmware loaded. */ - if (phy_fw_version) { + if (phy_fw_version) { new_phy_fw_vers = phy_fw_version(phy_fw_data, phy_fw_size); ret = t4_phy_fw_ver(adap, &cur_phy_fw_ver); if (ret < 0) From 85704cb8dcfd88d351bfc87faaeba1c8214f3177 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 8 Jan 2019 12:30:00 +0300 Subject: [PATCH 60/87] net/core/neighbour: tell kmemleak about hash tables This fixes false-positive kmemleak reports about leaked neighbour entries: unreferenced object 0xffff8885c6e4d0a8 (size 1024): comm "softirq", pid 0, jiffies 4294922664 (age 167640.804s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 20 2c f3 83 ff ff ff ff ........ ,...... 08 c0 ef 5f 84 88 ff ff 01 8c 7d 02 01 00 00 00 ..._......}..... backtrace: [<00000000748509fe>] ip6_finish_output2+0x887/0x1e40 [<0000000036d7a0d8>] ip6_output+0x1ba/0x600 [<0000000027ea7dba>] ip6_send_skb+0x92/0x2f0 [<00000000d6e2111d>] udp_v6_send_skb.isra.24+0x680/0x15e0 [<000000000668a8be>] udpv6_sendmsg+0x18c9/0x27a0 [<000000004bd5fa90>] sock_sendmsg+0xb3/0xf0 [<000000008227b29f>] ___sys_sendmsg+0x745/0x8f0 [<000000008698009d>] __sys_sendmsg+0xde/0x170 [<00000000889dacf1>] do_syscall_64+0x9b/0x400 [<0000000081cdb353>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000005767ed39>] 0xffffffffffffffff Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/core/neighbour.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 763a7b08df67..3e27a779f288 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -443,12 +444,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { buckets = kzalloc(size, GFP_ATOMIC); - else + } else { buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); + kmemleak_alloc(buckets, size, 0, GFP_ATOMIC); + } if (!buckets) { kfree(ret); return NULL; @@ -468,10 +471,12 @@ static void neigh_hash_free_rcu(struct rcu_head *head) size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { kfree(buckets); - else + } else { + kmemleak_free(buckets); free_pages((unsigned long)buckets, get_order(size)); + } kfree(nht); } From 7d033c9f6a7fd3821af75620a0257db87c2b552a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Jan 2019 04:06:14 -0800 Subject: [PATCH 61/87] ipv6: fix kernel-infoleak in ipv6_local_error() This patch makes sure the flow label in the IPv6 header forged in ipv6_local_error() is initialized. BUG: KMSAN: kernel-infoleak in _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 CPU: 1 PID: 24675 Comm: syz-executor1 Not tainted 4.20.0-rc7+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 kmsan_internal_check_memory+0x455/0xb00 mm/kmsan/kmsan.c:675 kmsan_copy_to_user+0xab/0xc0 mm/kmsan/kmsan_hooks.c:601 _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 copy_to_user include/linux/uaccess.h:177 [inline] move_addr_to_user+0x2e9/0x4f0 net/socket.c:227 ___sys_recvmsg+0x5d7/0x1140 net/socket.c:2284 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x457ec9 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f8750c06c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457ec9 RDX: 0000000000002000 RSI: 0000000020000400 RDI: 0000000000000005 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8750c076d4 R13: 00000000004c4a60 R14: 00000000004d8140 R15: 00000000ffffffff Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_save_stack mm/kmsan/kmsan.c:219 [inline] kmsan_internal_chain_origin+0x134/0x230 mm/kmsan/kmsan.c:439 __msan_chain_origin+0x70/0xe0 mm/kmsan/kmsan_instr.c:200 ipv6_recv_error+0x1e3f/0x1eb0 net/ipv6/datagram.c:475 udpv6_recvmsg+0x398/0x2ab0 net/ipv6/udp.c:335 inet_recvmsg+0x4fb/0x600 net/ipv4/af_inet.c:830 sock_recvmsg_nosec net/socket.c:794 [inline] sock_recvmsg+0x1d1/0x230 net/socket.c:801 ___sys_recvmsg+0x4d5/0x1140 net/socket.c:2278 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:158 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmsan_slab_alloc+0xe/0x10 mm/kmsan/kmsan_hooks.c:185 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2759 [inline] __kmalloc_node_track_caller+0xe18/0x1030 mm/slub.c:4383 __kmalloc_reserve net/core/skbuff.c:137 [inline] __alloc_skb+0x309/0xa20 net/core/skbuff.c:205 alloc_skb include/linux/skbuff.h:998 [inline] ipv6_local_error+0x1a7/0x9e0 net/ipv6/datagram.c:334 __ip6_append_data+0x129f/0x4fd0 net/ipv6/ip6_output.c:1311 ip6_make_skb+0x6cc/0xcf0 net/ipv6/ip6_output.c:1775 udpv6_sendmsg+0x3f8e/0x45d0 net/ipv6/udp.c:1384 inet_sendmsg+0x54a/0x720 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] __sys_sendto+0x8c4/0xac0 net/socket.c:1788 __do_sys_sendto net/socket.c:1800 [inline] __se_sys_sendto+0x107/0x130 net/socket.c:1796 __x64_sys_sendto+0x6e/0x90 net/socket.c:1796 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Bytes 4-7 of 28 are uninitialized Memory access of size 28 starts at ffff8881937bfce0 Data copied to user address 0000000020000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c2262a7e2088..ee4a4e54d016 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -341,6 +341,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; + ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; From e3ca63de8ade75757a067f6a5bd111d30cdcadb5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 8 Jan 2019 16:07:28 -0800 Subject: [PATCH 62/87] selftests/bpf: add missing executables to .gitignore We build test_libbpf with CXX to make sure linking against C++ works. $ make -s -C tools/lib/bpf $ git status -sb ? tools/lib/bpf/test_libbpf $ make -s -C tools/testing/selftests/bpf $ git status -sb ? tools/lib/bpf/test_libbpf ? tools/testing/selftests/bpf/test_libbpf Fixes: 8c4905b995c6 ("libbpf: make sure bpf headers are c++ include-able") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/lib/bpf/.gitignore | 1 + tools/testing/selftests/bpf/.gitignore | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index f81e549ddfdb..4db74758c674 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -1,2 +1,3 @@ libbpf_version.h FEATURE-DUMP.libbpf +test_libbpf diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4a9785043a39..dd093bd91aa9 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -28,3 +28,4 @@ flow_dissector_load test_netcnt test_section_names test_tcpnotify_user +test_libbpf From 11b36abc249f5e100d532c5271dae938fde85175 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 9 Jan 2019 17:06:55 +0000 Subject: [PATCH 63/87] samples: bpf: user proper argument index Use optind as index for argv instead of a hardcoded value. When the program has options this leads to improper parameter handling. Fixes: dc378a1ab5b6 ("samples: bpf: get ifindex from ifname") Signed-off-by: Ioana Ciornei Acked-by: Matteo Croce Signed-off-by: Daniel Borkmann --- samples/bpf/xdp1_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 0a197f86ac43..8bfda95c77ad 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -103,7 +103,7 @@ int main(int argc, char **argv) return 1; } - ifindex = if_nametoindex(argv[1]); + ifindex = if_nametoindex(argv[optind]); if (!ifindex) { perror("if_nametoindex"); return 1; From f98937c6bb73ae11717a15aec85c187d33ca5d34 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 8 Jan 2019 10:23:03 +0100 Subject: [PATCH 64/87] selftests: bpf: install files tcp_(server|client)*.py When test_tcpbpf_user runs it complains that it can't find files tcp_server.py and tcp_client.py. Rework so that tcp_server.py and tcp_client.py gets installed, added them to the variable TEST_PROGS_EXTENDED. Fixes: d6d4f60c3a09 ("bpf: add selftest for tcpbpf") Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 73aa6d8f4a2f..70229de510f5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -55,7 +55,9 @@ TEST_PROGS := test_kmod.sh \ test_flow_dissector.sh \ test_xdp_vlan.sh -TEST_PROGS_EXTENDED := with_addr.sh +TEST_PROGS_EXTENDED := with_addr.sh \ + tcp_client.py \ + tcp_server.py # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ From beaf3d1901f4ea46fbd5c9d857227d99751de469 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Jan 2019 14:20:44 -0800 Subject: [PATCH 65/87] bpf: fix panic in stack_map_get_build_id() on i386 and arm32 As Naresh reported, test_stacktrace_build_id() causes panic on i386 and arm32 systems. This is caused by page_address() returns NULL in certain cases. This patch fixes this error by using kmap_atomic/kunmap_atomic instead of page_address. Fixes: 615755a77b24 (" bpf: extend stackmap to save binary_build_id+offset instead of address") Reported-by: Naresh Kamboju Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 90daf285de03..d9e2483669d0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -260,7 +260,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, return -EFAULT; /* page not mapped */ ret = -EINVAL; - page_addr = page_address(page); + page_addr = kmap_atomic(page); ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ @@ -276,6 +276,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) ret = stack_map_get_build_id_64(page_addr, build_id); out: + kunmap_atomic(page_addr); put_page(page); return ret; } From 001e465f09a18857443489a57e74314a3368c805 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Jan 2019 12:32:42 -0500 Subject: [PATCH 66/87] bonding: update nest level on unlink A network device stack with multiple layers of bonding devices can trigger a false positive lockdep warning. Adding lockdep nest levels fixes this. Update the level on both enslave and unlink, to avoid the following series of events .. ip netns add test ip netns exec test bash ip link set dev lo addr 00:11:22:33:44:55 ip link set dev lo down ip link add dev bond1 type bond ip link add dev bond2 type bond ip link set dev lo master bond1 ip link set dev bond1 master bond2 ip link set dev bond1 nomaster ip link set dev bond2 master bond1 .. from still generating a splat: [ 193.652127] ====================================================== [ 193.658231] WARNING: possible circular locking dependency detected [ 193.664350] 4.20.0 #8 Not tainted [ 193.668310] ------------------------------------------------------ [ 193.674417] ip/15577 is trying to acquire lock: [ 193.678897] 00000000a40e3b69 (&(&bond->stats_lock)->rlock#3/3){+.+.}, at: bond_get_stats+0x58/0x290 [ 193.687851] but task is already holding lock: [ 193.693625] 00000000807b9d9f (&(&bond->stats_lock)->rlock#2/2){+.+.}, at: bond_get_stats+0x58/0x290 [..] [ 193.851092] lock_acquire+0xa7/0x190 [ 193.855138] _raw_spin_lock_nested+0x2d/0x40 [ 193.859878] bond_get_stats+0x58/0x290 [ 193.864093] dev_get_stats+0x5a/0xc0 [ 193.868140] bond_get_stats+0x105/0x290 [ 193.872444] dev_get_stats+0x5a/0xc0 [ 193.876493] rtnl_fill_stats+0x40/0x130 [ 193.880797] rtnl_fill_ifinfo+0x6c5/0xdc0 [ 193.885271] rtmsg_ifinfo_build_skb+0x86/0xe0 [ 193.890091] rtnetlink_event+0x5b/0xa0 [ 193.894320] raw_notifier_call_chain+0x43/0x60 [ 193.899225] netdev_change_features+0x50/0xa0 [ 193.904044] bond_compute_features.isra.46+0x1ab/0x270 [ 193.909640] bond_enslave+0x141d/0x15b0 [ 193.913946] do_set_master+0x89/0xa0 [ 193.918016] do_setlink+0x37c/0xda0 [ 193.921980] __rtnl_newlink+0x499/0x890 [ 193.926281] rtnl_newlink+0x48/0x70 [ 193.930238] rtnetlink_rcv_msg+0x171/0x4b0 [ 193.934801] netlink_rcv_skb+0xd1/0x110 [ 193.939103] rtnetlink_rcv+0x15/0x20 [ 193.943151] netlink_unicast+0x3b5/0x520 [ 193.947544] netlink_sendmsg+0x2fd/0x3f0 [ 193.951942] sock_sendmsg+0x38/0x50 [ 193.955899] ___sys_sendmsg+0x2ba/0x2d0 [ 193.960205] __x64_sys_sendmsg+0xad/0x100 [ 193.964687] do_syscall_64+0x5a/0x460 [ 193.968823] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 7e2556e40026 ("bonding: avoid lockdep confusion in bond_get_stats()") Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a9d597f28023..485462d3087f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1963,6 +1963,9 @@ static int __bond_release_one(struct net_device *bond_dev, if (!bond_has_slaves(bond)) { bond_set_carrier(bond); eth_hw_addr_random(bond_dev); + bond->nest_level = SINGLE_DEPTH_NESTING; + } else { + bond->nest_level = dev_get_nest_level(bond_dev) + 1; } unblock_netpoll_tx(); From ea89098ef9a574bceca00d3b5df14aaf0b3f9ccf Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 9 Jan 2019 00:24:03 +0100 Subject: [PATCH 67/87] net: dsa: mv88x6xxx: mv88e6390 errata The 6390 copper ports have an errata which require poking magic values into undocumented magic registers and then performing a software reset. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 113 +++++++++++++++++++++++++++++++ drivers/net/dsa/mv88e6xxx/chip.h | 5 ++ drivers/net/dsa/mv88e6xxx/port.h | 10 +++ 3 files changed, 128 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 8a517d8fb9d1..8dca2c949e73 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2403,6 +2403,107 @@ static int mv88e6xxx_stats_setup(struct mv88e6xxx_chip *chip) return mv88e6xxx_g1_stats_clear(chip); } +/* The mv88e6390 has some hidden registers used for debug and + * development. The errata also makes use of them. + */ +static int mv88e6390_hidden_write(struct mv88e6xxx_chip *chip, int port, + int reg, u16 val) +{ + u16 ctrl; + int err; + + err = mv88e6xxx_port_write(chip, PORT_RESERVED_1A_DATA_PORT, + PORT_RESERVED_1A, val); + if (err) + return err; + + ctrl = PORT_RESERVED_1A_BUSY | PORT_RESERVED_1A_WRITE | + PORT_RESERVED_1A_BLOCK | port << PORT_RESERVED_1A_PORT_SHIFT | + reg; + + return mv88e6xxx_port_write(chip, PORT_RESERVED_1A_CTRL_PORT, + PORT_RESERVED_1A, ctrl); +} + +static int mv88e6390_hidden_wait(struct mv88e6xxx_chip *chip) +{ + return mv88e6xxx_wait(chip, PORT_RESERVED_1A_CTRL_PORT, + PORT_RESERVED_1A, PORT_RESERVED_1A_BUSY); +} + + +static int mv88e6390_hidden_read(struct mv88e6xxx_chip *chip, int port, + int reg, u16 *val) +{ + u16 ctrl; + int err; + + ctrl = PORT_RESERVED_1A_BUSY | PORT_RESERVED_1A_READ | + PORT_RESERVED_1A_BLOCK | port << PORT_RESERVED_1A_PORT_SHIFT | + reg; + + err = mv88e6xxx_port_write(chip, PORT_RESERVED_1A_CTRL_PORT, + PORT_RESERVED_1A, ctrl); + if (err) + return err; + + err = mv88e6390_hidden_wait(chip); + if (err) + return err; + + return mv88e6xxx_port_read(chip, PORT_RESERVED_1A_DATA_PORT, + PORT_RESERVED_1A, val); +} + +/* Check if the errata has already been applied. */ +static bool mv88e6390_setup_errata_applied(struct mv88e6xxx_chip *chip) +{ + int port; + int err; + u16 val; + + for (port = 0; port < mv88e6xxx_num_ports(chip); port++) { + err = mv88e6390_hidden_read(chip, port, 0, &val); + if (err) { + dev_err(chip->dev, + "Error reading hidden register: %d\n", err); + return false; + } + if (val != 0x01c0) + return false; + } + + return true; +} + +/* The 6390 copper ports have an errata which require poking magic + * values into undocumented hidden registers and then performing a + * software reset. + */ +static int mv88e6390_setup_errata(struct mv88e6xxx_chip *chip) +{ + int port; + int err; + + if (mv88e6390_setup_errata_applied(chip)) + return 0; + + /* Set the ports into blocking mode */ + for (port = 0; port < mv88e6xxx_num_ports(chip); port++) { + err = mv88e6xxx_port_set_state(chip, port, BR_STATE_DISABLED); + if (err) + return err; + } + + for (port = 0; port < mv88e6xxx_num_ports(chip); port++) { + err = mv88e6390_hidden_write(chip, port, 0, 0x01c0); + if (err) + return err; + } + + return mv88e6xxx_software_reset(chip); +} + static int mv88e6xxx_setup(struct dsa_switch *ds) { struct mv88e6xxx_chip *chip = ds->priv; @@ -2415,6 +2516,12 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) mutex_lock(&chip->reg_lock); + if (chip->info->ops->setup_errata) { + err = chip->info->ops->setup_errata(chip); + if (err) + goto unlock; + } + /* Cache the cmode of each port. */ for (i = 0; i < mv88e6xxx_num_ports(chip); i++) { if (chip->info->ops->port_get_cmode) { @@ -3226,6 +3333,7 @@ static const struct mv88e6xxx_ops mv88e6185_ops = { static const struct mv88e6xxx_ops mv88e6190_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, @@ -3269,6 +3377,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = { static const struct mv88e6xxx_ops mv88e6190x_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, @@ -3312,6 +3421,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = { static const struct mv88e6xxx_ops mv88e6191_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, @@ -3404,6 +3514,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = { static const struct mv88e6xxx_ops mv88e6290_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, @@ -3709,6 +3820,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = { static const struct mv88e6xxx_ops mv88e6390_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, @@ -3756,6 +3868,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = { static const struct mv88e6xxx_ops mv88e6390x_ops = { /* MV88E6XXX_FAMILY_6390 */ + .setup_errata = mv88e6390_setup_errata, .irl_init_all = mv88e6390_g2_irl_init_all, .get_eeprom = mv88e6xxx_g2_get_eeprom8, .set_eeprom = mv88e6xxx_g2_set_eeprom8, diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index f9ecb7872d32..546651d8c3e1 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -300,6 +300,11 @@ struct mv88e6xxx_mdio_bus { }; struct mv88e6xxx_ops { + /* Switch Setup Errata, called early in the switch setup to + * allow any errata actions to be performed + */ + int (*setup_errata)(struct mv88e6xxx_chip *chip); + int (*ieee_pri_map)(struct mv88e6xxx_chip *chip); int (*ip_pri_map)(struct mv88e6xxx_chip *chip); diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index 0d81866d0e4a..e583641de758 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -251,6 +251,16 @@ /* Offset 0x19: Port IEEE Priority Remapping Registers (4-7) */ #define MV88E6095_PORT_IEEE_PRIO_REMAP_4567 0x19 +/* Offset 0x1a: Magic undocumented errata register */ +#define PORT_RESERVED_1A 0x1a +#define PORT_RESERVED_1A_BUSY BIT(15) +#define PORT_RESERVED_1A_WRITE BIT(14) +#define PORT_RESERVED_1A_READ 0 +#define PORT_RESERVED_1A_PORT_SHIFT 5 +#define PORT_RESERVED_1A_BLOCK (0xf << 10) +#define PORT_RESERVED_1A_CTRL_PORT 4 +#define PORT_RESERVED_1A_DATA_PORT 5 + int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg, u16 *val); int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg, From c5715b8fabfca0ef85903f8bad2189940ed41cc8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 8 Jan 2019 18:14:28 -0800 Subject: [PATCH 68/87] tcp: change txhash on SYN-data timeout Previously upon SYN timeouts the sender recomputes the txhash to try a different path. However this does not apply on the initial timeout of SYN-data (active Fast Open). Therefore an active IPv6 Fast Open connection may incur one second RTO penalty to take on a new path after the second SYN retransmission uses a new flow label. This patch removes this undesirable behavior so Fast Open changes the flow label just like the regular connections. This also helps avoid falsely disabling Fast Open on the sender which triggers after two consecutive SYN timeouts on Fast Open. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f87dbc78b6bc..71a29e9c0620 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -226,7 +226,7 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - } else if (!tp->syn_data && !tp->syn_fastopen) { + } else { sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; From b19bce0335e25b9069ddb10d234e673bbd46d2f4 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Jan 2019 08:20:28 +0100 Subject: [PATCH 69/87] net: ethernet: mediatek: fix warning in phy_start_aneg linux 5.0-rc1 shows following warning on bpi-r2/mt7623 bootup: [ 5.170597] WARNING: CPU: 3 PID: 1 at drivers/net/phy/phy.c:548 phy_start_aneg+0x110/0x144 [ 5.178826] called from state READY .... [ 5.264111] [] (phy_start_aneg) from [] (mtk_init+0x414/0x47c) [ 5.271630] r7:df5f5eec r6:c0f08c48 r5:00000000 r4:dea67800 [ 5.277256] [] (mtk_init) from [] (register_netdevice+0x98/0x51c) [ 5.285035] r8:00000000 r7:00000000 r6:c0f97080 r5:c0f08c48 r4:dea67800 [ 5.291693] [] (register_netdevice) from [] (register_netdev+0x2c/0x44) [ 5.299989] r8:00000000 r7:dea2e608 r6:deacea00 r5:dea2e604 r4:dea67800 [ 5.306646] [] (register_netdev) from [] (mtk_probe+0x668/0x7ac) [ 5.314336] r5:dea2e604 r4:dea2e040 [ 5.317890] [] (mtk_probe) from [] (platform_drv_probe+0x58/0xa8) [ 5.325670] r10:c0f86bac r9:00000000 r8:c0fbe578 r7:00000000 r6:c0f86bac r5:00000000 [ 5.333445] r4:deacea10 [ 5.335963] [] (platform_drv_probe) from [] (really_probe+0x2d8/0x424) maybe other boards using this generic driver are affected v2: optimization: - phy_set_max_speed() is only needed if you want to reduce the max speed, typically if the PHY supports 1Gbps but the MAC supports 100Mbps only. - The pause parameters are autonegotiated. Except you have a specific need you normally don't need to manually fiddle with this. - phy_start_aneg() is called implicitly by the phylib state machine, you shouldn't call it manually except you have a good excuse. - netif_carrier_on/netif_carrier_off in mtk_phy_link_adjust() isn't needed. It's done by phy_link_change() in phylib. Signed-off-by: Frank Wunderlich Reviewed-by: Heiner Kallweit Acked-by: Sean Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 399f565dd85a..2968d29a992f 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -258,11 +258,6 @@ static void mtk_phy_link_adjust(struct net_device *dev) mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); - if (dev->phydev->link) - netif_carrier_on(dev); - else - netif_carrier_off(dev); - if (!of_phy_is_fixed_link(mac->of_node)) phy_print_status(dev->phydev); } @@ -347,17 +342,6 @@ static int mtk_phy_connect(struct net_device *dev) if (mtk_phy_connect_node(eth, mac, np)) goto err_phy; - dev->phydev->autoneg = AUTONEG_ENABLE; - dev->phydev->speed = 0; - dev->phydev->duplex = 0; - - phy_set_max_speed(dev->phydev, SPEED_1000); - phy_support_asym_pause(dev->phydev); - linkmode_copy(dev->phydev->advertising, dev->phydev->supported); - linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, - dev->phydev->advertising); - phy_start_aneg(dev->phydev); - of_node_put(np); return 0; From 17e3ac812541f73224299d8958ddb420c2d5bbd8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Jan 2019 11:14:00 -0800 Subject: [PATCH 70/87] bpf: fix bpffs bitfield pretty print Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") introduced kind_flag and used bitfield_size in the btf_member to directly pretty print member values. The commit contained a bug where the incorrect parameters could be passed to function btf_bitfield_seq_show(). The bits_offset parameter in the function expects a value less than 8. Instead, the member offset in the structure is passed. The below is btf_bitfield_seq_show() func signature: void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) both bits_offset and nr_bits are u8 type. If the bitfield member offset is greater than 256, incorrect value will be printed. This patch fixed the issue by calculating correct proper data offset and bits_offset similar to non kind_flag case. Fixes: 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 715f9fcf4712..a2f53642592b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1219,8 +1219,6 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_copy_bits; u64 print_num; - data += BITS_ROUNDDOWN_BYTES(bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1255,7 +1253,9 @@ static void btf_int_bits_seq_show(const struct btf *btf, * BTF_INT_OFFSET() cannot exceed 64 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_bitfield_seq_show(data, bits_offset, nr_bits, m); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -2001,12 +2001,12 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data, member_offset, + btf_bitfield_seq_show(data + bytes_offset, bits8_offset, bitfield_size, m); } else { - bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - bits8_offset = BITS_PER_BYTE_MASKED(member_offset); ops = btf_type_ops(member_type); ops->seq_show(btf, member_type, member->type, data + bytes_offset, bits8_offset, m); From e43207fa2e6130e39e3aca4c55e2ee21cfb46828 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Jan 2019 11:14:01 -0800 Subject: [PATCH 71/87] tools/bpf: test btf bitfield with >=256 struct member offset This patch modified test_btf pretty print test to cover the bitfield with struct member equal to or greater 256. Without the previous kernel patch fix, the modified test will fail: $ test_btf -p ...... BTF pretty print array(#1)......unexpected pprint output expected: 0: {0,0,0,0x3,0x0,0x3,{0|[0,0,0,0,0,0,0,0]},ENUM_ZERO,4,0x1} read: 0: {0,0,0,0x3,0x0,0x3,{0|[0,0,0,0,0,0,0,0]},ENUM_ZERO,4,0x0} BTF pretty print array(#2)......unexpected pprint output expected: 0: {0,0,0,0x3,0x0,0x3,{0|[0,0,0,0,0,0,0,0]},ENUM_ZERO,4,0x1} read: 0: {0,0,0,0x3,0x0,0x3,{0|[0,0,0,0,0,0,0,0]},ENUM_ZERO,4,0x0} PASS:6 SKIP:0 FAIL:2 With the kernel fix, the modified test will succeed: $ test_btf -p ...... BTF pretty print array(#1)......OK BTF pretty print array(#2)......OK PASS:8 SKIP:0 FAIL:0 Fixes: 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_btf.c | 29 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 8bcd38010582..a0bd04befe87 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -3526,6 +3526,8 @@ struct pprint_mapv { ENUM_TWO, ENUM_THREE, } aenum; + uint32_t ui32b; + uint32_t bits2c:2; }; static struct btf_raw_test pprint_test_template[] = { @@ -3568,7 +3570,7 @@ static struct btf_raw_test pprint_test_template[] = { BTF_ENUM_ENC(NAME_TBD, 2), BTF_ENUM_ENC(NAME_TBD, 3), /* struct pprint_mapv */ /* [16] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 8), 32), + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 10), 40), BTF_MEMBER_ENC(NAME_TBD, 11, 0), /* uint32_t ui32 */ BTF_MEMBER_ENC(NAME_TBD, 10, 32), /* uint16_t ui16 */ BTF_MEMBER_ENC(NAME_TBD, 12, 64), /* int32_t si32 */ @@ -3577,9 +3579,11 @@ static struct btf_raw_test pprint_test_template[] = { BTF_MEMBER_ENC(NAME_TBD, 6, 126), /* unused_bits2b */ BTF_MEMBER_ENC(0, 14, 128), /* union (anon) */ BTF_MEMBER_ENC(NAME_TBD, 15, 192), /* aenum */ + BTF_MEMBER_ENC(NAME_TBD, 11, 224), /* uint32_t ui32b */ + BTF_MEMBER_ENC(NAME_TBD, 6, 256), /* bits2c */ BTF_END_RAW, }, - BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), + BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c"), .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ @@ -3628,7 +3632,7 @@ static struct btf_raw_test pprint_test_template[] = { BTF_ENUM_ENC(NAME_TBD, 2), BTF_ENUM_ENC(NAME_TBD, 3), /* struct pprint_mapv */ /* [16] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 8), 32), + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 10), 40), BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 0)), /* uint32_t ui32 */ BTF_MEMBER_ENC(NAME_TBD, 10, BTF_MEMBER_OFFSET(0, 32)), /* uint16_t ui16 */ BTF_MEMBER_ENC(NAME_TBD, 12, BTF_MEMBER_OFFSET(0, 64)), /* int32_t si32 */ @@ -3637,9 +3641,11 @@ static struct btf_raw_test pprint_test_template[] = { BTF_MEMBER_ENC(NAME_TBD, 6, BTF_MEMBER_OFFSET(2, 126)), /* unused_bits2b */ BTF_MEMBER_ENC(0, 14, BTF_MEMBER_OFFSET(0, 128)), /* union (anon) */ BTF_MEMBER_ENC(NAME_TBD, 15, BTF_MEMBER_OFFSET(0, 192)), /* aenum */ + BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 224)), /* uint32_t ui32b */ + BTF_MEMBER_ENC(NAME_TBD, 6, BTF_MEMBER_OFFSET(2, 256)), /* bits2c */ BTF_END_RAW, }, - BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), + BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c"), .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ @@ -3690,7 +3696,7 @@ static struct btf_raw_test pprint_test_template[] = { BTF_ENUM_ENC(NAME_TBD, 2), BTF_ENUM_ENC(NAME_TBD, 3), /* struct pprint_mapv */ /* [16] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 8), 32), + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 10), 40), BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 0)), /* uint32_t ui32 */ BTF_MEMBER_ENC(NAME_TBD, 10, BTF_MEMBER_OFFSET(0, 32)), /* uint16_t ui16 */ BTF_MEMBER_ENC(NAME_TBD, 12, BTF_MEMBER_OFFSET(0, 64)), /* int32_t si32 */ @@ -3699,13 +3705,15 @@ static struct btf_raw_test pprint_test_template[] = { BTF_MEMBER_ENC(NAME_TBD, 19, BTF_MEMBER_OFFSET(2, 126)),/* unused_bits2b */ BTF_MEMBER_ENC(0, 14, BTF_MEMBER_OFFSET(0, 128)), /* union (anon) */ BTF_MEMBER_ENC(NAME_TBD, 15, BTF_MEMBER_OFFSET(0, 192)), /* aenum */ + BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 224)), /* uint32_t ui32b */ + BTF_MEMBER_ENC(NAME_TBD, 17, BTF_MEMBER_OFFSET(2, 256)), /* bits2c */ /* typedef unsigned int ___int */ /* [17] */ BTF_TYPEDEF_ENC(NAME_TBD, 18), BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 6), /* [18] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 15), /* [19] */ BTF_END_RAW, }, - BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0___int"), + BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c\0___int"), .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ @@ -3793,6 +3801,8 @@ static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i, v->unused_bits2b = 3; v->ui64 = i; v->aenum = i & 0x03; + v->ui32b = 4; + v->bits2c = 1; v = (void *)v + rounded_value_size; } } @@ -3955,7 +3965,8 @@ static int do_test_pprint(int test_num) nexpected_line = snprintf(expected_line, sizeof(expected_line), "%s%u: {%u,0,%d,0x%x,0x%x,0x%x," - "{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", + "{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s," + "%u,0x%x}\n", percpu_map ? "\tcpu" : "", percpu_map ? cpu : next_key, cmapv->ui32, cmapv->si32, @@ -3967,7 +3978,9 @@ static int do_test_pprint(int test_num) cmapv->ui8a[2], cmapv->ui8a[3], cmapv->ui8a[4], cmapv->ui8a[5], cmapv->ui8a[6], cmapv->ui8a[7], - pprint_enum_str[cmapv->aenum]); + pprint_enum_str[cmapv->aenum], + cmapv->ui32b, + cmapv->bits2c); err = check_line(expected_line, nexpected_line, sizeof(expected_line), line); From 298e59d322954e89ed2a556c601a04a4c007d1b3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Jan 2019 11:14:02 -0800 Subject: [PATCH 72/87] tools/bpf: fix bpftool map dump with bitfields Commit 8772c8bc093b ("tools: bpftool: support pretty print with kind_flag set") added bpftool map dump with kind_flag support. When bitfield_size can be retrieved directly from btf_member, function btf_dumper_bitfield() is called to dump the bitfield. The implementation passed the wrong parameter "bit_offset" to the function. The excepted value is the bit_offset within a byte while the passed-in value is the struct member offset. This commit fixed the bug with passing correct "bit_offset" with adjusted data pointer. Fixes: 8772c8bc093b ("tools: bpftool: support pretty print with kind_flag set") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/btf_dumper.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 3f0629edbca5..6ba5f567a9d8 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -82,8 +82,6 @@ static void btf_dumper_bitfield(__u32 nr_bits, __u8 bit_offset, int bits_to_copy; __u64 print_num; - data += BITS_ROUNDDOWN_BYTES(bit_offset); - bit_offset = BITS_PER_BYTE_MASKED(bit_offset); bits_to_copy = bit_offset + nr_bits; bytes_to_copy = BITS_ROUNDUP_BYTES(bits_to_copy); @@ -118,7 +116,9 @@ static void btf_dumper_int_bits(__u32 int_type, __u8 bit_offset, * BTF_INT_OFFSET() cannot exceed 64 bits. */ total_bits_offset = bit_offset + BTF_INT_OFFSET(int_type); - btf_dumper_bitfield(nr_bits, total_bits_offset, data, jw, + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bit_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_dumper_bitfield(nr_bits, bit_offset, data, jw, is_plain_text); } @@ -216,11 +216,12 @@ static int btf_dumper_struct(const struct btf_dumper *d, __u32 type_id, } jsonw_name(d->jw, btf__name_by_offset(d->btf, m[i].name_off)); + data_off = data + BITS_ROUNDDOWN_BYTES(bit_offset); if (bitfield_size) { - btf_dumper_bitfield(bitfield_size, bit_offset, - data, d->jw, d->is_plain_text); + btf_dumper_bitfield(bitfield_size, + BITS_PER_BYTE_MASKED(bit_offset), + data_off, d->jw, d->is_plain_text); } else { - data_off = data + BITS_ROUNDDOWN_BYTES(bit_offset); ret = btf_dumper_do_type(d, m[i].type, BITS_PER_BYTE_MASKED(bit_offset), data_off); From 6dea7e1881fd86b80da64e476ac398008daed857 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 9 Jan 2019 10:05:56 +0100 Subject: [PATCH 73/87] net: stmmac: Fix PCI module removal leak Since commit b7d0f08e9129, the enable / disable of PCI device is not managed which will result in IO regions not being automatically unmapped. As regions continue mapped it is currently not possible to remove and then probe again the PCI module of stmmac. Fix this by manually unmapping regions on remove callback. Changes from v1: - Fix build error Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Fixes: b7d0f08e9129 ("net: stmmac: Fix WoL for PCI-based setups") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index c54a50dbd5ac..d819e8eaba12 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -299,7 +299,17 @@ static int stmmac_pci_probe(struct pci_dev *pdev, */ static void stmmac_pci_remove(struct pci_dev *pdev) { + int i; + stmmac_dvr_remove(&pdev->dev); + + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + if (pci_resource_len(pdev, i) == 0) + continue; + pcim_iounmap_regions(pdev, BIT(i)); + break; + } + pci_disable_device(pdev); } From fcc509eb10ff4794641e6ad3082118287a750d0a Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 9 Jan 2019 10:05:57 +0100 Subject: [PATCH 74/87] net: stmmac: dwxgmac2: Only clear interrupts that are active In DMA interrupt handler we were clearing all interrupts status, even the ones that were not active. Fix this and only clear the active interrupts. Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 6c5092e7771c..c5e25580a43f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -263,6 +263,7 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan) { u32 intr_status = readl(ioaddr + XGMAC_DMA_CH_STATUS(chan)); + u32 intr_en = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); int ret = 0; /* ABNORMAL interrupts */ @@ -282,8 +283,7 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, x->normal_irq_n++; if (likely(intr_status & XGMAC_RI)) { - u32 value = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); - if (likely(value & XGMAC_RIE)) { + if (likely(intr_en & XGMAC_RIE)) { x->rx_normal_irq_n++; ret |= handle_rx; } @@ -295,7 +295,7 @@ static int dwxgmac2_dma_interrupt(void __iomem *ioaddr, } /* Clear interrupts */ - writel(~0x0, ioaddr + XGMAC_DMA_CH_STATUS(chan)); + writel(intr_en & intr_status, ioaddr + XGMAC_DMA_CH_STATUS(chan)); return ret; } From 0650d4017f4d2eee67230a02285a7ae5204240c2 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 9 Jan 2019 10:05:58 +0100 Subject: [PATCH 75/87] net: stmmac: Check if CBS is supported before configuring Check if CBS is currently supported before trying to configure it in HW. Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 531294f4978b..58ea18af9813 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -301,6 +301,8 @@ static int tc_setup_cbs(struct stmmac_priv *priv, /* Queue 0 is not AVB capable */ if (queue <= 0 || queue >= tx_queues_count) return -EINVAL; + if (!priv->dma_cap.av) + return -EOPNOTSUPP; if (priv->speed != SPEED_100 && priv->speed != SPEED_1000) return -EOPNOTSUPP; From 3b5094665e273c4a2a99f7f5f16977c0f1e19095 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 9 Jan 2019 10:05:59 +0100 Subject: [PATCH 76/87] net: stmmac: Fix the logic of checking if RX Watchdog must be enabled RX Watchdog can be disabled by platform definitions but currently we are initializing the descriptors before checking if Watchdog must be disabled or not. Fix this by checking earlier if user wants Watchdog disabled or not. Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0e0a0789c2ed..83ceb1a12e77 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4168,6 +4168,18 @@ static int stmmac_hw_init(struct stmmac_priv *priv) return ret; } + /* Rx Watchdog is available in the COREs newer than the 3.40. + * In some case, for example on bugged HW this feature + * has to be disable and this can be done by passing the + * riwt_off field from the platform. + */ + if (((priv->synopsys_id >= DWMAC_CORE_3_50) || + (priv->plat->has_xgmac)) && (!priv->plat->riwt_off)) { + priv->use_riwt = 1; + dev_info(priv->device, + "Enable RX Mitigation via HW Watchdog Timer\n"); + } + return 0; } @@ -4300,18 +4312,6 @@ int stmmac_dvr_probe(struct device *device, if (flow_ctrl) priv->flow_ctrl = FLOW_AUTO; /* RX/TX pause on */ - /* Rx Watchdog is available in the COREs newer than the 3.40. - * In some case, for example on bugged HW this feature - * has to be disable and this can be done by passing the - * riwt_off field from the platform. - */ - if (((priv->synopsys_id >= DWMAC_CORE_3_50) || - (priv->plat->has_xgmac)) && (!priv->plat->riwt_off)) { - priv->use_riwt = 1; - dev_info(priv->device, - "Enable RX Mitigation via HW Watchdog Timer\n"); - } - /* Setup channels NAPI */ maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); From fa0be0a43f101888ac677dba31b590963eafeaa1 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 9 Jan 2019 10:06:00 +0100 Subject: [PATCH 77/87] net: stmmac: Prevent RX starvation in stmmac_napi_poll() Currently, TX is given a budget which is consumed by stmmac_tx_clean() and stmmac_rx() is given the remaining non-consumed budget. This is wrong and in case we are sending a large number of packets this can starve RX because remaining budget will be low. Let's give always the same budget for RX and TX clean. While at it, check if we missed any interrupts while we were in NAPI callback by looking at DMA interrupt status. Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 83ceb1a12e77..3f23e14891df 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3525,27 +3525,28 @@ static int stmmac_napi_poll(struct napi_struct *napi, int budget) struct stmmac_channel *ch = container_of(napi, struct stmmac_channel, napi); struct stmmac_priv *priv = ch->priv_data; - int work_done = 0, work_rem = budget; + int work_done, rx_done = 0, tx_done = 0; u32 chan = ch->index; priv->xstats.napi_poll++; - if (ch->has_tx) { - int done = stmmac_tx_clean(priv, work_rem, chan); + if (ch->has_tx) + tx_done = stmmac_tx_clean(priv, budget, chan); + if (ch->has_rx) + rx_done = stmmac_rx(priv, budget, chan); - work_done += done; - work_rem -= done; - } + work_done = max(rx_done, tx_done); + work_done = min(work_done, budget); - if (ch->has_rx) { - int done = stmmac_rx(priv, work_rem, chan); + if (work_done < budget && napi_complete_done(napi, work_done)) { + int stat; - work_done += done; - work_rem -= done; - } - - if (work_done < budget && napi_complete_done(napi, work_done)) stmmac_enable_dma_irq(priv, priv->ioaddr, chan); + stat = stmmac_dma_interrupt_status(priv, priv->ioaddr, + &priv->xstats, chan); + if (stat && napi_reschedule(napi)) + stmmac_disable_dma_irq(priv, priv->ioaddr, chan); + } return work_done; } From 7fbe078c37aba3088359c9256c1a1d0c3e39ee81 Mon Sep 17 00:00:00 2001 From: Zha Bin Date: Tue, 8 Jan 2019 16:07:03 +0800 Subject: [PATCH 78/87] vhost/vsock: fix vhost vsock cid hashing inconsistent The vsock core only supports 32bit CID, but the Virtio-vsock spec define CID (dst_cid and src_cid) as u64 and the upper 32bits is reserved as zero. This inconsistency causes one bug in vhost vsock driver. The scenarios is: 0. A hash table (vhost_vsock_hash) is used to map an CID to a vsock object. And hash_min() is used to compute the hash key. hash_min() is defined as: (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)). That means the hash algorithm has dependency on the size of macro argument 'val'. 0. In function vhost_vsock_set_cid(), a 64bit CID is passed to hash_min() to compute the hash key when inserting a vsock object into the hash table. 0. In function vhost_vsock_get(), a 32bit CID is passed to hash_min() to compute the hash key when looking up a vsock for an CID. Because the different size of the CID, hash_min() returns different hash key, thus fails to look up the vsock object for an CID. To fix this bug, we keep CID as u64 in the IOCTLs and virtio message headers, but explicitly convert u64 to u32 when deal with the hash table and vsock core. Fixes: 834e772c8db0 ("vhost/vsock: fix use-after-free in network stack callers") Link: https://github.com/stefanha/virtio/blob/vsock/trunk/content.tex Signed-off-by: Zha Bin Reviewed-by: Liu Jiang Reviewed-by: Stefan Hajnoczi Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index bc42d38ae031..3fbc068eaa9b 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -642,7 +642,7 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) hash_del_rcu(&vsock->hash); vsock->guest_cid = guest_cid; - hash_add_rcu(vhost_vsock_hash, &vsock->hash, guest_cid); + hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid); mutex_unlock(&vhost_vsock_mutex); return 0; From 2ff33d6637393fe9348357285931811b76e1402f Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 8 Jan 2019 21:04:48 +0800 Subject: [PATCH 79/87] isdn: i4l: isdn_tty: Fix some concurrency double-free bugs The functions isdn_tty_tiocmset() and isdn_tty_set_termios() may be concurrently executed. isdn_tty_tiocmset isdn_tty_modem_hup line 719: kfree(info->dtmf_state); line 721: kfree(info->silence_state); line 723: kfree(info->adpcms); line 725: kfree(info->adpcmr); isdn_tty_set_termios isdn_tty_modem_hup line 719: kfree(info->dtmf_state); line 721: kfree(info->silence_state); line 723: kfree(info->adpcms); line 725: kfree(info->adpcmr); Thus, some concurrency double-free bugs may occur. These possible bugs are found by a static tool written by myself and my manual code review. To fix these possible bugs, the mutex lock "modem_info_mutex" used in isdn_tty_tiocmset() is added in isdn_tty_set_termios(). Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_tty.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 1b2239c1d569..dc1cded716c1 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1437,15 +1437,19 @@ isdn_tty_set_termios(struct tty_struct *tty, struct ktermios *old_termios) { modem_info *info = (modem_info *) tty->driver_data; + mutex_lock(&modem_info_mutex); if (!old_termios) isdn_tty_change_speed(info); else { if (tty->termios.c_cflag == old_termios->c_cflag && tty->termios.c_ispeed == old_termios->c_ispeed && - tty->termios.c_ospeed == old_termios->c_ospeed) + tty->termios.c_ospeed == old_termios->c_ospeed) { + mutex_unlock(&modem_info_mutex); return; + } isdn_tty_change_speed(info); } + mutex_unlock(&modem_info_mutex); } /* From 73ab1cb2de9e3efe7f818d5453de271e5371df1d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:23:56 +0900 Subject: [PATCH 80/87] umh: add exit routine for UMH process A UMH process which is created by the fork_usermode_blob() such as bpfilter needs to release members of the umh_info when process is terminated. But the do_exit() does not release members of the umh_info. hence module which uses UMH needs own code to detect whether UMH process is terminated or not. But this implementation needs extra code for checking the status of UMH process. it eventually makes the code more complex. The new PF_UMH flag is added and it is used to identify UMH processes. The exit_umh() does not release members of the umh_info. Hence umh_info->cleanup callback should release both members of the umh_info and the private data. Suggested-by: David S. Miller Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/sched.h | 9 +++++++++ include/linux/umh.h | 2 ++ kernel/exit.c | 1 + kernel/umh.c | 33 +++++++++++++++++++++++++++++++-- 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 89541d248893..e35e35b9fc48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1406,6 +1406,7 @@ extern struct pid *cad_pid; #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ +#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ @@ -1904,6 +1905,14 @@ static inline void rseq_execve(struct task_struct *t) #endif +void __exit_umh(struct task_struct *tsk); + +static inline void exit_umh(struct task_struct *tsk) +{ + if (unlikely(tsk->flags & PF_UMH)) + __exit_umh(tsk); +} + #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/include/linux/umh.h b/include/linux/umh.h index 235f51b62c71..0c08de356d0d 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -47,6 +47,8 @@ struct umh_info { const char *cmdline; struct file *pipe_to_umh; struct file *pipe_from_umh; + struct list_head list; + void (*cleanup)(struct umh_info *info); pid_t pid; }; int fork_usermode_blob(void *data, size_t len, struct umh_info *info); diff --git a/kernel/exit.c b/kernel/exit.c index 8a01b671dc1f..dad70419195c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -866,6 +866,7 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/umh.c b/kernel/umh.c index 0baa672e023c..d937cbad903a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -37,6 +37,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -100,10 +102,12 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); sub_info->pid = task_pid_nr(current); - if (sub_info->file) + if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - else + if (!retval) + current->flags |= PF_UMH; + } else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -517,6 +521,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } out: fput(file); return err; @@ -679,6 +688,26 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + struct ctl_table usermodehelper_table[] = { { .procname = "bset", From 5b4cb650e569db2e6a09d2fa0ef8eb789a0ac5d8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:34 +0900 Subject: [PATCH 81/87] net: bpfilter: use cleanup callback to release umh_info Now, UMH process is killed, do_exit() calls the umh_info->cleanup callback to release members of the umh_info. This patch makes bpfilter_umh's cleanup routine to use the umh_info->cleanup callback. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 11 ++++++++--- net/bpfilter/bpfilter_kern.c | 23 ++++++++++------------- net/ipv4/bpfilter/sockopt.c | 33 ++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index f02cee0225d4..70ffeed280e9 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -3,13 +3,18 @@ #define _LINUX_BPFILTER_H #include +#include struct sock; int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); -extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); +struct bpfilter_umh_ops { + struct umh_info info; + int (*sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +}; +extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 7acfc83087d5..a68940b74c01 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,7 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); @@ -28,16 +27,13 @@ static void shutdown_umh(struct umh_info *info) force_sig(SIGKILL, tsk); put_task_struct(tsk); } - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - info->pid = 0; } static void __stop_umh(void) { if (IS_ENABLED(CONFIG_INET)) { - bpfilter_process_sockopt = NULL; - shutdown_umh(&info); + bpfilter_ops.sockopt = NULL; + shutdown_umh(&bpfilter_ops.info); } } @@ -64,9 +60,10 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); - if (!info.pid) + if (!bpfilter_ops.info.pid) goto out; - n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); + n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), + &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); __stop_umh(); @@ -74,7 +71,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, goto out; } pos = 0; - n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos); + n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), + &pos); if (n != sizeof(reply)) { pr_err("read fail %zd\n", n); __stop_umh(); @@ -92,13 +90,12 @@ static int __init load_umh(void) int err; /* fork usermode process */ - info.cmdline = "bpfilter_umh"; err = fork_usermode_blob(&bpfilter_umh_start, &bpfilter_umh_end - &bpfilter_umh_start, - &info); + &bpfilter_ops.info); if (err) return err; - pr_info("Loaded bpfilter_umh pid %d\n", info.pid); + pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { @@ -106,7 +103,7 @@ static int __init load_umh(void) return -EFAULT; } if (IS_ENABLED(CONFIG_INET)) - bpfilter_process_sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 5e04ed25bc0e..c326cfbc0f62 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -1,28 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include #include #include +#include +#include -int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); -EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); +struct bpfilter_umh_ops bpfilter_ops; +EXPORT_SYMBOL_GPL(bpfilter_ops); + +static void bpfilter_umh_cleanup(struct umh_info *info) +{ + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + info->pid = 0; +} static int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { - if (!bpfilter_process_sockopt) { + if (!bpfilter_ops.sockopt) { int err = request_module("bpfilter"); if (err) return err; - if (!bpfilter_process_sockopt) + if (!bpfilter_ops.sockopt) return -ECHILD; } - return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); + return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -41,3 +50,13 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, return bpfilter_mbox_request(sk, optname, optval, len, false); } + +static int __init bpfilter_sockopt_init(void) +{ + bpfilter_ops.info.cmdline = "bpfilter_umh"; + bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; + + return 0; +} + +module_init(bpfilter_sockopt_init); From 61fbf5933d42b02f552123af5a87a06335a3b4db Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:53 +0900 Subject: [PATCH 82/87] net: bpfilter: restart bpfilter_umh when error occurred The bpfilter_umh will be stopped via __stop_umh() when the bpfilter error occurred. The bpfilter_umh() couldn't start again because there is no restart routine. The section of the bpfilter_umh_{start/end} is no longer .init.rodata because these area should be reused in the restart routine. hence the section name is changed to .bpfilter_umh. The bpfilter_ops->start() is restart callback. it will be called when bpfilter_umh is stopped. The stop bit means bpfilter_umh is stopped. this bit is set by both start and stop routine. Before this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL [ 480.045136] bpfilter: write fail -32 $ iptables -vnL All iptables commands will fail. After this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL $ iptables -vnL Now, all iptables commands will work. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 37 +++++++++++++++++++++++--------- net/bpfilter/bpfilter_umh_blob.S | 2 +- net/ipv4/bpfilter/sockopt.c | 11 +++++++++- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 70ffeed280e9..8ebcbdd70bdc 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -15,6 +15,8 @@ struct bpfilter_umh_ops { int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); + int (*start)(void); + bool stop; }; extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index a68940b74c01..c0fcde910a7a 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -16,13 +16,14 @@ extern char bpfilter_umh_end; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); -static void shutdown_umh(struct umh_info *info) +static void shutdown_umh(void) { struct task_struct *tsk; - if (!info->pid) + if (bpfilter_ops.stop) return; - tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + + tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID); if (tsk) { force_sig(SIGKILL, tsk); put_task_struct(tsk); @@ -31,10 +32,8 @@ static void shutdown_umh(struct umh_info *info) static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = NULL; - shutdown_umh(&bpfilter_ops.info); - } + if (IS_ENABLED(CONFIG_INET)) + shutdown_umh(); } static void stop_umh(void) @@ -85,7 +84,7 @@ out: return ret; } -static int __init load_umh(void) +static int start_umh(void) { int err; @@ -95,6 +94,7 @@ static int __init load_umh(void) &bpfilter_ops.info); if (err) return err; + bpfilter_ops.stop = false; pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ @@ -102,14 +102,31 @@ static int __init load_umh(void) stop_umh(); return -EFAULT; } - if (IS_ENABLED(CONFIG_INET)) - bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } +static int __init load_umh(void) +{ + int err; + + if (!bpfilter_ops.stop) + return -EFAULT; + err = start_umh(); + if (!err && IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.start = &start_umh; + } + + return err; +} + static void __exit fini_umh(void) { + if (IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.start = NULL; + bpfilter_ops.sockopt = NULL; + } stop_umh(); } module_init(load_umh); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S index 40311d10d2f2..7f1c521dcc2f 100644 --- a/net/bpfilter/bpfilter_umh_blob.S +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" + .section .bpfilter_umh, "a" .global bpfilter_umh_start bpfilter_umh_start: .incbin "net/bpfilter/bpfilter_umh" diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index c326cfbc0f62..de84ede4e765 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,6 +14,7 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; @@ -23,14 +24,21 @@ static int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { + int err; + if (!bpfilter_ops.sockopt) { - int err = request_module("bpfilter"); + err = request_module("bpfilter"); if (err) return err; if (!bpfilter_ops.sockopt) return -ECHILD; } + if (bpfilter_ops.stop) { + err = bpfilter_ops.start(); + if (err) + return err; + } return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); } @@ -53,6 +61,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, static int __init bpfilter_sockopt_init(void) { + bpfilter_ops.stop = true; bpfilter_ops.info.cmdline = "bpfilter_umh"; bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; From 71a8508402b570127d6500c1ad456bbd33ccf187 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:25:10 +0900 Subject: [PATCH 83/87] net: bpfilter: disallow to remove bpfilter module while being used The bpfilter.ko module can be removed while functions of the bpfilter.ko are executing. so panic can occurred. in order to protect that, locks can be used. a bpfilter_lock protects routines in the __bpfilter_process_sockopt() but it's not enough because __exit routine can be executed concurrently. Now, the bpfilter_umh can not run in parallel. So, the module do not removed while it's being used and it do not double-create UMH process. The members of the umh_info and the bpfilter_umh_ops are protected by the bpfilter_umh_ops.lock. test commands: while : do iptables -I FORWARD -m string --string ap --algo kmp & modprobe -rv bpfilter & done splat looks like: [ 298.623435] BUG: unable to handle kernel paging request at fffffbfff807440b [ 298.628512] #PF error: [normal kernel read fault] [ 298.633018] PGD 124327067 P4D 124327067 PUD 11c1a3067 PMD 119eb2067 PTE 0 [ 298.638859] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 298.638859] CPU: 0 PID: 2997 Comm: iptables Not tainted 4.20.0+ #154 [ 298.638859] RIP: 0010:__mutex_lock+0x6b9/0x16a0 [ 298.638859] Code: c0 00 00 e8 89 82 ff ff 80 bd 8f fc ff ff 00 0f 85 d9 05 00 00 48 8b 85 80 fc ff ff 48 bf 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 38 00 0f 85 1d 0e 00 00 48 8b 85 c8 fc ff ff 49 39 47 58 c6 [ 298.638859] RSP: 0018:ffff88810e7777a0 EFLAGS: 00010202 [ 298.638859] RAX: 1ffffffff807440b RBX: ffff888111bd4d80 RCX: 0000000000000000 [ 298.638859] RDX: 1ffff110235ff806 RSI: ffff888111bd5538 RDI: dffffc0000000000 [ 298.638859] RBP: ffff88810e777b30 R08: 0000000080000002 R09: 0000000000000000 [ 298.638859] R10: 0000000000000000 R11: 0000000000000000 R12: fffffbfff168a42c [ 298.638859] R13: ffff888111bd4d80 R14: ffff8881040e9a05 R15: ffffffffc03a2000 [ 298.638859] FS: 00007f39e3758700(0000) GS:ffff88811ae00000(0000) knlGS:0000000000000000 [ 298.638859] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 298.638859] CR2: fffffbfff807440b CR3: 000000011243e000 CR4: 00000000001006f0 [ 298.638859] Call Trace: [ 298.638859] ? mutex_lock_io_nested+0x1560/0x1560 [ 298.638859] ? kasan_kmalloc+0xa0/0xd0 [ 298.638859] ? kmem_cache_alloc+0x1c2/0x260 [ 298.638859] ? __alloc_file+0x92/0x3c0 [ 298.638859] ? alloc_empty_file+0x43/0x120 [ 298.638859] ? alloc_file_pseudo+0x220/0x330 [ 298.638859] ? sock_alloc_file+0x39/0x160 [ 298.638859] ? __sys_socket+0x113/0x1d0 [ 298.638859] ? __x64_sys_socket+0x6f/0xb0 [ 298.638859] ? do_syscall_64+0x138/0x560 [ 298.638859] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 298.638859] ? __alloc_file+0x92/0x3c0 [ 298.638859] ? init_object+0x6b/0x80 [ 298.638859] ? cyc2ns_read_end+0x10/0x10 [ 298.638859] ? cyc2ns_read_end+0x10/0x10 [ 298.638859] ? hlock_class+0x140/0x140 [ 298.638859] ? sched_clock_local+0xd4/0x140 [ 298.638859] ? sched_clock_local+0xd4/0x140 [ 298.638859] ? check_flags.part.37+0x440/0x440 [ 298.638859] ? __lock_acquire+0x4f90/0x4f90 [ 298.638859] ? set_rq_offline.part.89+0x140/0x140 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 28 +++++++++++----------------- net/ipv4/bpfilter/sockopt.c | 22 ++++++++++++++++------ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 8ebcbdd70bdc..d815622cd31e 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -12,6 +12,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); struct bpfilter_umh_ops { struct umh_info info; + /* since ip_getsockopt() can run in parallel, serialize access to umh */ + struct mutex lock; int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index c0fcde910a7a..7ee4fea93637 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,9 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -/* since ip_getsockopt() can run in parallel, serialize access to umh */ -static DEFINE_MUTEX(bpfilter_lock); - static void shutdown_umh(void) { struct task_struct *tsk; @@ -36,13 +33,6 @@ static void __stop_umh(void) shutdown_umh(); } -static void stop_umh(void) -{ - mutex_lock(&bpfilter_lock); - __stop_umh(); - mutex_unlock(&bpfilter_lock); -} - static int __bpfilter_process_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) @@ -58,7 +48,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.cmd = optname; req.addr = (long __force __user)optval; req.len = optlen; - mutex_lock(&bpfilter_lock); if (!bpfilter_ops.info.pid) goto out; n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), @@ -80,7 +69,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, } ret = reply.status; out: - mutex_unlock(&bpfilter_lock); return ret; } @@ -99,7 +87,7 @@ static int start_umh(void) /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { - stop_umh(); + shutdown_umh(); return -EFAULT; } @@ -110,24 +98,30 @@ static int __init load_umh(void) { int err; - if (!bpfilter_ops.stop) - return -EFAULT; + mutex_lock(&bpfilter_ops.lock); + if (!bpfilter_ops.stop) { + err = -EFAULT; + goto out; + } err = start_umh(); if (!err && IS_ENABLED(CONFIG_INET)) { bpfilter_ops.sockopt = &__bpfilter_process_sockopt; bpfilter_ops.start = &start_umh; } - +out: + mutex_unlock(&bpfilter_ops.lock); return err; } static void __exit fini_umh(void) { + mutex_lock(&bpfilter_ops.lock); if (IS_ENABLED(CONFIG_INET)) { + shutdown_umh(); bpfilter_ops.start = NULL; bpfilter_ops.sockopt = NULL; } - stop_umh(); + mutex_unlock(&bpfilter_ops.lock); } module_init(load_umh); module_exit(fini_umh); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index de84ede4e765..1e976bb93d99 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,10 +14,12 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + mutex_lock(&bpfilter_ops.lock); bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; + mutex_unlock(&bpfilter_ops.lock); } static int bpfilter_mbox_request(struct sock *sk, int optname, @@ -25,21 +27,28 @@ static int bpfilter_mbox_request(struct sock *sk, int optname, unsigned int optlen, bool is_set) { int err; - + mutex_lock(&bpfilter_ops.lock); if (!bpfilter_ops.sockopt) { + mutex_unlock(&bpfilter_ops.lock); err = request_module("bpfilter"); + mutex_lock(&bpfilter_ops.lock); if (err) - return err; - if (!bpfilter_ops.sockopt) - return -ECHILD; + goto out; + if (!bpfilter_ops.sockopt) { + err = -ECHILD; + goto out; + } } if (bpfilter_ops.stop) { err = bpfilter_ops.start(); if (err) - return err; + goto out; } - return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); + err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); +out: + mutex_unlock(&bpfilter_ops.lock); + return err; } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -61,6 +70,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, static int __init bpfilter_sockopt_init(void) { + mutex_init(&bpfilter_ops.lock); bpfilter_ops.stop = true; bpfilter_ops.info.cmdline = "bpfilter_umh"; bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; From 41d1c8839e5f8cb781cc635f12791decee8271b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jan 2019 18:45:05 +0100 Subject: [PATCH 84/87] net: clear skb->tstamp in bridge forwarding path Matteo reported forwarding issues inside the linux bridge, if the enslaved interfaces use the fq qdisc. Similar to commit 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths"), we need to clear the tstamp field in the bridge forwarding path. Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Reported-and-tested-by: Matteo Croce Signed-off-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 5372e2042adf..2cb8da465b98 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,6 +65,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + skb->tstamp = 0; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); From 8d008e64a2ebe6567c3f5e048b05842a0297350b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Jan 2019 15:27:05 -0600 Subject: [PATCH 85/87] mISDN: hfcsusb: Use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/hfcsusb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c index 6d05946b445e..124ff530da82 100644 --- a/drivers/isdn/hardware/mISDN/hfcsusb.c +++ b/drivers/isdn/hardware/mISDN/hfcsusb.c @@ -262,8 +262,7 @@ hfcsusb_ph_info(struct hfcsusb *hw) struct dchannel *dch = &hw->dch; int i; - phi = kzalloc(sizeof(struct ph_info) + - dch->dev.nrbchan * sizeof(struct ph_info_ch), GFP_ATOMIC); + phi = kzalloc(struct_size(phi, bch, dch->dev.nrbchan), GFP_ATOMIC); phi->dch.ch.protocol = hw->protocol; phi->dch.ch.Flags = dch->Flags; phi->dch.state = dch->state; From 0b815023a1d479aa8f8851ee880d5388e53b7ae5 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 12 Jan 2019 00:13:04 -0500 Subject: [PATCH 86/87] bnxt_en: Fix ring checking logic on 57500 chips. In bnxt_hwrm_check_pf_rings(), add the proper flag to test the NQ resources. Without the proper flag, the firmware will change the NQ resource allocation and remap the IRQ, causing missing IRQs. This issue shows up when adding MQPRIO TX queues, for example. Fixes: 36d65be9a880 ("bnxt_en: Disable MSIX before re-reserving NQs/CMPL rings.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3aa80da973d7..a125fbe5183f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5601,7 +5601,8 @@ static int bnxt_hwrm_check_pf_rings(struct bnxt *bp, int tx_rings, int rx_rings, FUNC_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST | FUNC_CFG_REQ_FLAGS_VNIC_ASSETS_TEST; if (bp->flags & BNXT_FLAG_CHIP_P5) - flags |= FUNC_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST; + flags |= FUNC_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST | + FUNC_CFG_REQ_FLAGS_NQ_ASSETS_TEST; else flags |= FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index f1aaac8e6268..0a0995894ddb 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -386,8 +386,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 0 -#define HWRM_VERSION_RSVD 33 -#define HWRM_VERSION_STR "1.10.0.33" +#define HWRM_VERSION_RSVD 35 +#define HWRM_VERSION_STR "1.10.0.35" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -1184,6 +1184,7 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST 0x100000UL #define FUNC_CFG_REQ_FLAGS_TRUSTED_VF_ENABLE 0x200000UL #define FUNC_CFG_REQ_FLAGS_DYNAMIC_TX_RING_ALLOC 0x400000UL + #define FUNC_CFG_REQ_FLAGS_NQ_ASSETS_TEST 0x800000UL __le32 enables; #define FUNC_CFG_REQ_ENABLES_MTU 0x1UL #define FUNC_CFG_REQ_ENABLES_MRU 0x2UL From 6ef982dec7eda9affa81a2bb84f75441deb56d06 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 12 Jan 2019 00:13:05 -0500 Subject: [PATCH 87/87] bnxt_en: Fix context memory allocation. When allocating memory pages for context memory, if the last page table should be fully populated, the current code will set nr_pages to 0 when calling bnxt_alloc_ctx_mem_blk(). This will cause the last page table to be completely blank and causing some RDMA failures. Fix it by setting the last page table's nr_pages to the remainder only if it is non-zero. Fixes: 08fe9d181606 ("bnxt_en: Add Level 2 context memory paging support.") Reported-by: Eric Davis Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a125fbe5183f..9499d01632ff 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6222,9 +6222,12 @@ static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, rmem->pg_tbl_map = ctx_pg->ctx_dma_arr[i]; rmem->depth = 1; rmem->nr_pages = MAX_CTX_PAGES; - if (i == (nr_tbls - 1)) - rmem->nr_pages = ctx_pg->nr_pages % - MAX_CTX_PAGES; + if (i == (nr_tbls - 1)) { + int rem = ctx_pg->nr_pages % MAX_CTX_PAGES; + + if (rem) + rmem->nr_pages = rem; + } rc = bnxt_alloc_ctx_mem_blk(bp, pg_tbl); if (rc) break;