From cb853da3a368c40300a0e940f86be582037bb082 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 9 Dec 2016 11:34:13 +0000 Subject: [PATCH 01/30] brcmfmac: fix memory leak in brcmf_cfg80211_attach() In brcmf_cfg80211_attach() there was one error path not properly handled as it leaked memory allocated in brcmf_btcoex_attach(). Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- .../net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index ccae3bbe7db2..7ffc4aba5bab 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6868,7 +6868,7 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr, err = brcmf_p2p_attach(cfg, p2pdev_forced); if (err) { - brcmf_err("P2P initilisation failed (%d)\n", err); + brcmf_err("P2P initialisation failed (%d)\n", err); goto wiphy_unreg_out; } err = brcmf_btcoex_attach(cfg); @@ -6893,7 +6893,7 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr, err = brcmf_fweh_activate_events(ifp); if (err) { brcmf_err("FWEH activation failed (%d)\n", err); - goto wiphy_unreg_out; + goto detach; } /* Fill in some of the advertised nl80211 supported features */ @@ -6908,6 +6908,9 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr, return cfg; +detach: + brcmf_btcoex_detach(cfg); + brcmf_p2p_detach(&cfg->p2p); wiphy_unreg_out: wiphy_unregister(cfg->wiphy); priv_out: From 2b66325d5ea7c2a39ac69ed83b6979afe480d81a Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 9 Dec 2016 11:34:14 +0000 Subject: [PATCH 02/30] brcmfmac: fix uninitialized field in scheduled scan ssid configuration The scheduled scan ssid configuration in firmware has a flags field that was not initialized resulting in unexpected behaviour. Fixes: e3bdb7cc0300 ("brcmfmac: fix handling ssids in .sched_scan_start() callback") Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c index f273cab0da10..9a25e79a46cf 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c @@ -137,6 +137,7 @@ static int brcmf_pno_add_ssid(struct brcmf_if *ifp, struct cfg80211_ssid *ssid, pfn.wpa_auth = cpu_to_le32(BRCMF_PNO_WPA_AUTH_ANY); pfn.wsec = cpu_to_le32(0); pfn.infra = cpu_to_le32(1); + pfn.flags = 0; if (active) pfn.flags = cpu_to_le32(1 << BRCMF_PNO_HIDDEN_BIT); pfn.ssid.SSID_len = cpu_to_le32(ssid->ssid_len); From d4166b8b33650d9dc89715c9540ba0f261490d4d Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 15 Dec 2016 11:23:19 +0200 Subject: [PATCH 03/30] ath10k: free host-mem with DMA_BIRECTIONAL flag Hopefully this fixes the problem reported by Kalle: Noticed this in my log, but I don't have time to investigate this in detail right now: [ 413.795346] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready [ 414.158755] IPv6: ADDRCONF(NETDEV_CHANGE): wlan0: link becomes ready [ 477.439659] ath10k_pci 0000:02:00.0: could not get mac80211 beacon [ 481.666630] ------------[ cut here ]------------ [ 481.666669] WARNING: CPU: 0 PID: 1978 at lib/dma-debug.c:1155 check_unmap+0x320/0x8e0 [ 481.666688] ath10k_pci 0000:02:00.0: DMA-API: device driver frees DMA memory with different direction [device address=0x000000002d130000] [size=63800 bytes] [mapped with DMA_BIDIRECTIONAL] [unmapped with DMA_TO_DEVICE] [ 481.666703] Modules linked in: ctr ccm ath10k_pci(E-) ath10k_core(E) ath(E) mac80211(E) cfg80211(E) snd_hda_codec_hdmi snd_hda_codec_idt snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi arc4 snd_rawmidi snd_seq_midi_event snd_seq btusb btintel snd_seq_device joydev coret [ 481.671468] CPU: 0 PID: 1978 Comm: rmmod Tainted: G E 4.9.0-rc7-wt+ #54 [ 481.671478] Hardware name: Hewlett-Packard HP ProBook 6540b/1722, BIOS 68CDD Ver. F.04 01/27/2010 [ 481.671489] ef49dcec c842ee92 c8b5830e ef49dd34 ef49dd20 c80850f5 c8b5a13c ef49dd50 [ 481.671560] 000007ba c8b5830e 00000483 c8461830 c8461830 00000483 ef49ddcc f34e64b8 [ 481.671641] c8b58360 ef49dd3c c80851bb 00000009 00000000 ef49dd34 c8b5a13c ef49dd50 [ 481.671716] Call Trace: [ 481.671731] [] dump_stack+0x76/0xb4 [ 481.671745] [] __warn+0xe5/0x100 [ 481.671757] [] ? check_unmap+0x320/0x8e0 [ 481.671769] [] ? check_unmap+0x320/0x8e0 [ 481.671780] [] warn_slowpath_fmt+0x3b/0x40 [ 481.671791] [] check_unmap+0x320/0x8e0 [ 481.671804] [] debug_dma_unmap_page+0x84/0xa0 [ 481.671835] [] ath10k_wmi_free_host_mem+0x9a/0xe0 [ath10k_core] [ 481.671861] [] ath10k_core_destroy+0x50/0x60 [ath10k_core] [ 481.671875] [] ath10k_pci_remove+0x79/0xa0 [ath10k_pci] [ 481.671889] [] pci_device_remove+0x38/0xb0 [ 481.671901] [] __device_release_driver+0x7b/0x110 [ 481.671913] [] driver_detach+0x97/0xa0 [ 481.671923] [] bus_remove_driver+0x4b/0xb0 [ 481.671934] [] driver_unregister+0x2a/0x60 [ 481.671949] [] pci_unregister_driver+0x18/0x70 [ 481.671965] [] ath10k_pci_exit+0xd/0x25f [ath10k_pci] [ 481.671979] [] SyS_delete_module+0xf4/0x180 [ 481.671995] [] ? __might_fault+0x8b/0xa0 [ 481.672009] [] do_fast_syscall_32+0xa0/0x1e0 [ 481.672025] [] sysenter_past_esp+0x45/0x74 [ 481.672037] ---[ end trace 3fd23759e17e1622 ]--- [ 481.672049] Mapped at: [ 481.672060] [ 481.672072] [] debug_dma_map_page.part.25+0x1c/0xf0 [ 481.672083] [ 481.672095] [] debug_dma_map_page+0x99/0xc0 [ 481.672106] [ 481.672132] [] ath10k_wmi_alloc_chunk+0x12c/0x1f0 [ath10k_core] [ 481.672142] [ 481.672168] [] ath10k_wmi_event_service_ready_work+0x304/0x540 [ath10k_core] [ 481.672178] [ 481.672190] [] process_one_work+0x1c3/0x670 [ 482.137134] ath10k_pci 0000:02:00.0: pci irq msi oper_irq_mode 2 irq_mode 0 reset_mode 0 [ 482.313144] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/pre-cal-pci-0000:02:00.0.bin failed with error -2 [ 482.313274] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/cal-pci-0000:02:00.0.bin failed with error -2 [ 482.313768] ath10k_pci 0000:02:00.0: qca988x hw2.0 target 0x4100016c chip_id 0x043202ff sub 0000:0000 [ 482.313777] ath10k_pci 0000:02:00.0: kconfig debug 1 debugfs 1 tracing 1 dfs 0 testmode 1 [ 482.313974] ath10k_pci 0000:02:00.0: firmware ver 10.2.4.70.59-2 api 5 features no-p2p,raw-mode,mfp,allows-mesh-bcast crc32 4159f498 [ 482.369858] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/QCA988X/hw2.0/board-2.bin failed with error -2 [ 482.370011] ath10k_pci 0000:02:00.0: board_file api 1 bmi_id N/A crc32 bebc7c08 [ 483.596770] ath10k_pci 0000:02:00.0: htt-ver 2.1 wmi-op 5 htt-op 2 cal otp max-sta 128 raw 0 hwcrypto 1 [ 483.701686] ath: EEPROM regdomain: 0x0 [ 483.701706] ath: EEPROM indicates default country code should be used [ 483.701713] ath: doing EEPROM country->regdmn map search [ 483.701721] ath: country maps to regdmn code: 0x3a [ 483.701730] ath: Country alpha2 being used: US [ 483.701737] ath: Regpair used: 0x3a Reported-by: Kalle Valo Signed-off-by: Ben Greear Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index c893314a191f..50d6ee6afe26 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -8271,7 +8271,7 @@ void ath10k_wmi_free_host_mem(struct ath10k *ar) dma_unmap_single(ar->dev, ar->wmi.mem_chunks[i].paddr, ar->wmi.mem_chunks[i].len, - DMA_TO_DEVICE); + DMA_BIDIRECTIONAL); kfree(ar->wmi.mem_chunks[i].vaddr); } From d1f1c0e289e1bc46cd6873ba6dd6c627f459e7fa Mon Sep 17 00:00:00 2001 From: Tobias Klausmann Date: Tue, 13 Dec 2016 18:08:07 +0100 Subject: [PATCH 04/30] ath9k: do not return early to fix rcu unlocking Starting with commit d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb where possible") the driver uses rcu_read_lock() && rcu_read_unlock(), yet on returning early in ath_tx_edma_tasklet() the unlock is missing leading to stalls and suspicious RCU usage: =============================== [ INFO: suspicious RCU usage. ] 4.9.0-rc8 #11 Not tainted ------------------------------- kernel/rcu/tree.c:705 Illegal idle entry in RCU read-side critical section.! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 1 lock held by swapper/7/0: #0: ( rcu_read_lock ){......} , at: [] ath_tx_edma_tasklet+0x0/0x450 [ath9k] stack backtrace: CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.0-rc8 #11 Hardware name: Acer Aspire V3-571G/VA50_HC_CR, BIOS V2.21 12/16/2013 ffff88025efc3f38 ffffffff8132b1e5 ffff88017ede4540 0000000000000001 ffff88025efc3f68 ffffffff810a25f7 ffff88025efcee60 ffff88017edebdd8 ffff88025eeb5400 0000000000000091 ffff88025efc3f88 ffffffff810c3cd4 Call Trace: [] dump_stack+0x68/0x93 [] lockdep_rcu_suspicious+0xd7/0x110 [] rcu_eqs_enter_common.constprop.85+0x154/0x200 [] rcu_irq_exit+0x44/0xa0 [] irq_exit+0x61/0xd0 [] do_IRQ+0x65/0x110 [] common_interrupt+0x89/0x89 [] ? cpuidle_enter_state+0x151/0x200 [] cpuidle_enter+0x12/0x20 [] call_cpuidle+0x1e/0x40 [] cpu_startup_entry+0x146/0x220 [] start_secondary+0x148/0x170 Signed-off-by: Tobias Klausmann Fixes: d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb where possible") Cc: # v4.9 Acked-by: Felix Fietkau Acked-by: Paul E. McKenney Tested-by: Gabriel Craciunescu Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath9k/xmit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index 486afa98a5b8..4e2f3ac266c3 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -2713,7 +2713,7 @@ void ath_tx_edma_tasklet(struct ath_softc *sc) fifo_list = &txq->txq_fifo[txq->txq_tailidx]; if (list_empty(fifo_list)) { ath_txq_unlock(sc, txq); - return; + break; } bf = list_first_entry(fifo_list, struct ath_buf, list); From 22b68b93ae2506bd56ee3bf232a51bc8ab955b56 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 19 Dec 2016 20:38:12 -0600 Subject: [PATCH 05/30] rtlwifi: Fix kernel oops introduced with commit e49656147359 With commit e49656147359 {"rtlwifi: Use dev_kfree_skb_irq instead of kfree_skb"), the method used to free an skb was changed because the kfree_skb() was inside a spinlock. What was forgotten is that kfree_skb() guards against a NULL value for the argument. Routine dev_kfree_skb_irq() does not, and a test is needed to prevent kernel panics. Fixes: e49656147359 ("rtlwifi: Use dev_kfree_skb_irq instead of kfree_skb") Signed-off-by: Larry Finger Cc: Stable # 4.9+ Cc: Wei Yongjun Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 2caa4ad04dba..ded1493fee9c 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -1829,7 +1829,8 @@ bool rtl_cmd_send_packet(struct ieee80211_hw *hw, struct sk_buff *skb) spin_lock_irqsave(&rtlpriv->locks.irq_th_lock, flags); pskb = __skb_dequeue(&ring->queue); - dev_kfree_skb_irq(pskb); + if (pskb) + dev_kfree_skb_irq(pskb); /*this is wrong, fill_tx_cmddesc needs update*/ pdesc = &ring->desc[0]; From 76b15923b777aa2660029629179550124c1fc40e Mon Sep 17 00:00:00 2001 From: Kalesh A P Date: Tue, 20 Dec 2016 10:14:30 -0500 Subject: [PATCH 06/30] be2net: Increase skb headroom size to 256 bytes The driver currently allocates 128 bytes of skb headroom. This was found to be insufficient with some configurations like Geneve tunnels, which resulted in skb head reallocations. Increase the headroom to 256 bytes to fix this. Signed-off-by: Kalesh A P Signed-off-by: Suresh Reddy Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 6cfa63a5e9b4..4c30c44b242e 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -65,7 +65,7 @@ /* Number of bytes of an RX frame that are copied to skb->data */ #define BE_HDR_LEN ((u16) 64) /* allocate extra space to allow tunneling decapsulation without head reallocation */ -#define BE_RX_SKB_ALLOC_SIZE (BE_HDR_LEN + 64) +#define BE_RX_SKB_ALLOC_SIZE 256 #define BE_MAX_JUMBO_FRAME_SIZE 9018 #define BE_MIN_MTU 256 From f217bfde24077b6684e625dd618e3a522e6f272a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heiko=20St=C3=BCbner?= Date: Tue, 20 Dec 2016 17:17:06 +0100 Subject: [PATCH 07/30] net: ethernet: stmmac: dwmac-rk: make clk enablement first in powerup Right now the dwmac-rk tries to set up the GRF-specific speed and link options before enabling clocks, phys etc and on previous socs this works because the GRF is supplied on the whole by one clock. On the rk3399 however the GRF (General Register Files) clock-supply has been split into multiple clocks and while there is no specific grf-gmac clock like for other sub-blocks, it seems the mac-specific portions are actually supplied by the general mac clock. This results in hangs on rk3399 boards if the driver is build as module. When built in te problem of course doesn't surface, as the clocks are of course still on at the stage before clock_disable_unused. To solve this, simply move the clock enablement to the first position in the powerup callback. This is also a good idea in general to enable clocks before everything else. Tested on rk3288, rk3368 and rk3399 the dwmac still works on all of them. Signed-off-by: Heiko Stuebner Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 77ab0a85f067..fa6e9704c077 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -864,6 +864,10 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv) int ret; struct device *dev = &bsp_priv->pdev->dev; + ret = gmac_clk_enable(bsp_priv, true); + if (ret) + return ret; + /*rmii or rgmii*/ if (bsp_priv->phy_iface == PHY_INTERFACE_MODE_RGMII) { dev_info(dev, "init for RGMII\n"); @@ -880,10 +884,6 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv) if (ret) return ret; - ret = gmac_clk_enable(bsp_priv, true); - if (ret) - return ret; - pm_runtime_enable(dev); pm_runtime_get_sync(dev); From 8354491c9d5b06709384cea91d13019bf5e61449 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 21 Dec 2016 11:28:49 +0100 Subject: [PATCH 08/30] net: mvpp2: fix dma unmapping of TX buffers for fragments Since commit 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping"), we are not correctly DMA unmapping TX buffers for fragments. Indeed, the mvpp2_txq_inc_put() function only stores in the txq_cpu->tx_buffs[] array the physical address of the buffer to be DMA-unmapped when skb != NULL. In addition, when DMA-unmapping, we use skb_headlen(skb) to get the size to be unmapped. Both of this works fine for TX descriptors that are associated directly to a SKB, but not the ones that are used for fragments, with a NULL pointer as skb: - We have a NULL physical address when calling DMA unmap - skb_headlen(skb) crashes because skb is NULL This causes random crashes when fragments are used. To solve this problem, we need to: - Store the physical address of the buffer to be unmapped unconditionally, regardless of whether it is tied to a SKB or not. - Store the length of the buffer to be unmapped, which requires a new field. Instead of adding a third array to store the length of the buffer to be unmapped, and as suggested by David Miller, this commit refactors the tx_buffs[] and tx_skb[] arrays of 'struct mvpp2_txq_pcpu' into a separate structure 'mvpp2_txq_pcpu_buf', to which a 'size' field is added. Therefore, instead of having three arrays to allocate/free, we have a single one, which also improve data locality, reducing the impact on the CPU cache. Fixes: 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping") Reported-by: Raphael G Cc: Raphael G Cc: stable@vger.kernel.org Signed-off-by: Thomas Petazzoni Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 59 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index dabc5418efcc..cda04b3126bc 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -770,6 +770,17 @@ struct mvpp2_rx_desc { u32 reserved8; }; +struct mvpp2_txq_pcpu_buf { + /* Transmitted SKB */ + struct sk_buff *skb; + + /* Physical address of transmitted buffer */ + dma_addr_t phys; + + /* Size transmitted */ + size_t size; +}; + /* Per-CPU Tx queue control */ struct mvpp2_txq_pcpu { int cpu; @@ -785,11 +796,8 @@ struct mvpp2_txq_pcpu { /* Number of Tx DMA descriptors reserved for each CPU */ int reserved_num; - /* Array of transmitted skb */ - struct sk_buff **tx_skb; - - /* Array of transmitted buffers' physical addresses */ - dma_addr_t *tx_buffs; + /* Infos about transmitted buffers */ + struct mvpp2_txq_pcpu_buf *buffs; /* Index of last TX DMA descriptor that was inserted */ int txq_put_index; @@ -979,10 +987,11 @@ static void mvpp2_txq_inc_put(struct mvpp2_txq_pcpu *txq_pcpu, struct sk_buff *skb, struct mvpp2_tx_desc *tx_desc) { - txq_pcpu->tx_skb[txq_pcpu->txq_put_index] = skb; - if (skb) - txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] = - tx_desc->buf_phys_addr; + struct mvpp2_txq_pcpu_buf *tx_buf = + txq_pcpu->buffs + txq_pcpu->txq_put_index; + tx_buf->skb = skb; + tx_buf->size = tx_desc->data_size; + tx_buf->phys = tx_desc->buf_phys_addr; txq_pcpu->txq_put_index++; if (txq_pcpu->txq_put_index == txq_pcpu->size) txq_pcpu->txq_put_index = 0; @@ -4401,17 +4410,16 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, int i; for (i = 0; i < num; i++) { - dma_addr_t buf_phys_addr = - txq_pcpu->tx_buffs[txq_pcpu->txq_get_index]; - struct sk_buff *skb = txq_pcpu->tx_skb[txq_pcpu->txq_get_index]; + struct mvpp2_txq_pcpu_buf *tx_buf = + txq_pcpu->buffs + txq_pcpu->txq_get_index; mvpp2_txq_inc_get(txq_pcpu); - dma_unmap_single(port->dev->dev.parent, buf_phys_addr, - skb_headlen(skb), DMA_TO_DEVICE); - if (!skb) + dma_unmap_single(port->dev->dev.parent, tx_buf->phys, + tx_buf->size, DMA_TO_DEVICE); + if (!tx_buf->skb) continue; - dev_kfree_skb_any(skb); + dev_kfree_skb_any(tx_buf->skb); } } @@ -4651,15 +4659,10 @@ static int mvpp2_txq_init(struct mvpp2_port *port, for_each_present_cpu(cpu) { txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); txq_pcpu->size = txq->size; - txq_pcpu->tx_skb = kmalloc(txq_pcpu->size * - sizeof(*txq_pcpu->tx_skb), - GFP_KERNEL); - if (!txq_pcpu->tx_skb) - goto error; - - txq_pcpu->tx_buffs = kmalloc(txq_pcpu->size * - sizeof(dma_addr_t), GFP_KERNEL); - if (!txq_pcpu->tx_buffs) + txq_pcpu->buffs = kmalloc(txq_pcpu->size * + sizeof(struct mvpp2_txq_pcpu_buf), + GFP_KERNEL); + if (!txq_pcpu->buffs) goto error; txq_pcpu->count = 0; @@ -4673,8 +4676,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port, error: for_each_present_cpu(cpu) { txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); - kfree(txq_pcpu->tx_skb); - kfree(txq_pcpu->tx_buffs); + kfree(txq_pcpu->buffs); } dma_free_coherent(port->dev->dev.parent, @@ -4693,8 +4695,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port, for_each_present_cpu(cpu) { txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); - kfree(txq_pcpu->tx_skb); - kfree(txq_pcpu->tx_buffs); + kfree(txq_pcpu->buffs); } if (txq->descs) From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Dec 2016 05:42:43 -0800 Subject: [PATCH 09/30] tcp: add a missing barrier in tcp_tasklet_func() Madalin reported crashes happening in tcp_tasklet_func() on powerpc64 Before TSQ_QUEUED bit is cleared, we must ensure the changes done by list_del(&tp->tsq_node); are committed to memory, otherwise corruption might happen, as an other cpu could catch TSQ_QUEUED clearance too soon. We can notice that old kernels were immune to this bug, because TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk) section, but they could have missed a kick to write additional bytes, when NIC interrupts for a given flow are spread to multiple cpus. Affected TCP flows would need an incoming ACK or RTO timer to add more packets to the pipe. So overall situation should be better now. Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()") Signed-off-by: Eric Dumazet Reported-by: Madalin Bucur Tested-by: Madalin Bucur Tested-by: Xing Lei Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b45101f3d2bd..31a255b555ad 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned long data) list_del(&tp->tsq_node); sk = (struct sock *)tp; + smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); if (!sk->sk_lock.owned && From 551cde192343a10c8fbd57c4cc8f0c4b6cd307e4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Dec 2016 16:03:23 +0000 Subject: [PATCH 10/30] net: fddi: skfp: use %p format specifier for addresses rather than %x Trivial fix: Addresses should be printed using the %p format specifier rather than using %x. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/hwmtm.c | 12 ++++++------ drivers/net/fddi/skfp/pmf.c | 2 +- drivers/net/fddi/skfp/smt.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/fddi/skfp/hwmtm.c b/drivers/net/fddi/skfp/hwmtm.c index e26398b5a7dc..d0a68bdd5f63 100644 --- a/drivers/net/fddi/skfp/hwmtm.c +++ b/drivers/net/fddi/skfp/hwmtm.c @@ -1483,7 +1483,7 @@ void mac_drv_clear_rx_queue(struct s_smc *smc) r = queue->rx_curr_get ; while (queue->rx_used) { DRV_BUF_FLUSH(r,DDI_DMA_SYNC_FORCPU) ; - DB_RX("switch OWN bit of RxD 0x%x ",r,0,5) ; + DB_RX("switch OWN bit of RxD 0x%p ",r,0,5) ; r->rxd_rbctrl &= ~cpu_to_le32(BMU_OWN) ; frag_count = 1 ; DRV_BUF_FLUSH(r,DDI_DMA_SYNC_FORDEV) ; @@ -1645,7 +1645,7 @@ void hwm_tx_frag(struct s_smc *smc, char far *virt, u_long phys, int len, DB_TX("hwm_tx_frag: len = %d, frame_status = %x ",len,frame_status,2) ; if (frame_status & LAN_TX) { /* '*t' is already defined */ - DB_TX("LAN_TX: TxD = %x, virt = %x ",t,virt,3) ; + DB_TX("LAN_TX: TxD = %p, virt = %p ",t,virt,3) ; t->txd_virt = virt ; t->txd_txdscr = cpu_to_le32(smc->os.hwm.tx_descr) ; t->txd_tbadr = cpu_to_le32(phys) ; @@ -1819,7 +1819,7 @@ void smt_send_mbuf(struct s_smc *smc, SMbuf *mb, int fc) __le32 tbctrl; NDD_TRACE("THSB",mb,fc,0) ; - DB_TX("smt_send_mbuf: mb = 0x%x, fc = 0x%x",mb,fc,4) ; + DB_TX("smt_send_mbuf: mb = 0x%p, fc = 0x%x",mb,fc,4) ; mb->sm_off-- ; /* set to fc */ mb->sm_len++ ; /* + fc */ @@ -1960,7 +1960,7 @@ static void mac_drv_clear_txd(struct s_smc *smc) do { DRV_BUF_FLUSH(t1,DDI_DMA_SYNC_FORCPU) ; - DB_TX("check OWN/EOF bit of TxD 0x%x",t1,0,5) ; + DB_TX("check OWN/EOF bit of TxD 0x%p",t1,0,5) ; tbctrl = le32_to_cpu(CR_READ(t1->txd_tbctrl)); if (tbctrl & BMU_OWN || !queue->tx_used){ @@ -1988,7 +1988,7 @@ static void mac_drv_clear_txd(struct s_smc *smc) } else { #ifndef PASS_1ST_TXD_2_TX_COMP - DB_TX("mac_drv_tx_comp for TxD 0x%x",t2,0,4) ; + DB_TX("mac_drv_tx_comp for TxD 0x%p",t2,0,4) ; mac_drv_tx_complete(smc,t2) ; #else DB_TX("mac_drv_tx_comp for TxD 0x%x", @@ -2052,7 +2052,7 @@ void mac_drv_clear_tx_queue(struct s_smc *smc) tx_used = queue->tx_used ; while (tx_used) { DRV_BUF_FLUSH(t,DDI_DMA_SYNC_FORCPU) ; - DB_TX("switch OWN bit of TxD 0x%x ",t,0,5) ; + DB_TX("switch OWN bit of TxD 0x%p ",t,0,5) ; t->txd_tbctrl &= ~cpu_to_le32(BMU_OWN) ; DRV_BUF_FLUSH(t,DDI_DMA_SYNC_FORDEV) ; t = t->txd_next ; diff --git a/drivers/net/fddi/skfp/pmf.c b/drivers/net/fddi/skfp/pmf.c index 441b4dc79450..52fa162a31e0 100644 --- a/drivers/net/fddi/skfp/pmf.c +++ b/drivers/net/fddi/skfp/pmf.c @@ -284,7 +284,7 @@ void smt_pmf_received_pack(struct s_smc *smc, SMbuf *mb, int local) SMbuf *reply ; sm = smtod(mb,struct smt_header *) ; - DB_SMT("SMT: processing PMF frame at %x len %d\n",sm,mb->sm_len) ; + DB_SMT("SMT: processing PMF frame at %p len %d\n",sm,mb->sm_len) ; #ifdef DEBUG dump_smt(smc,sm,"PMF Received") ; #endif diff --git a/drivers/net/fddi/skfp/smt.c b/drivers/net/fddi/skfp/smt.c index cd78b7cacc75..e80a08903fcf 100644 --- a/drivers/net/fddi/skfp/smt.c +++ b/drivers/net/fddi/skfp/smt.c @@ -504,7 +504,7 @@ void smt_received_pack(struct s_smc *smc, SMbuf *mb, int fs) #endif smt_swap_para(sm,(int) mb->sm_len,1) ; - DB_SMT("SMT : received packet [%s] at 0x%x\n", + DB_SMT("SMT : received packet [%s] at 0x%p\n", smt_type_name[m_fc(mb) & 0xf],sm) ; DB_SMT("SMT : version %d, class %s\n",sm->smt_version, smt_class_name[(sm->smt_class>LAST_CLASS)?0 : sm->smt_class]) ; From 7d99569460eae28b187d574aec930a4cf8b90441 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 23 Dec 2016 00:33:57 +0900 Subject: [PATCH 11/30] net: ipv4: Don't crash if passing a null sk to ip_do_redirect. Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") made ip_do_redirect call sock_net(sk) to determine the network namespace of the passed-in socket. This crashes if sk is NULL. Fix this by getting the network namespace from the skb instead. Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fa5c037227cb..9eabf490133a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -798,6 +798,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; + struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; @@ -805,7 +806,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } From 567be786597ca24b13906a552ad2159316c6fe7d Mon Sep 17 00:00:00 2001 From: jpinto Date: Fri, 23 Dec 2016 10:15:59 +0000 Subject: [PATCH 12/30] stmmac: CSR clock configuration fix When testing stmmac with my QoS reference design I checked a problem in the CSR clock configuration that was impossibilitating the phy discovery, since every read operation returned 0x0000ffff. This patch fixes the issue. Signed-off-by: Joao Pinto Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index b21d03fe4f43..be3c91c7f211 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -539,7 +539,7 @@ struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins, mac->mii.reg_shift = 6; mac->mii.reg_mask = 0x000007C0; mac->mii.clk_csr_shift = 2; - mac->mii.clk_csr_mask = 0xF; + mac->mii.clk_csr_mask = GENMASK(5, 2); /* Get and dump the chip ID */ *synopsys_id = stmmac_get_synopsys_id(hwid); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index a1d582f47b1a..9dd2987e284d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -197,7 +197,7 @@ struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id) mac->mii.reg_shift = 6; mac->mii.reg_mask = 0x000007C0; mac->mii.clk_csr_shift = 2; - mac->mii.clk_csr_mask = 0xF; + mac->mii.clk_csr_mask = GENMASK(5, 2); /* Synopsys Id is not available on old chips */ *synopsys_id = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index 23322fd9e3ac..fda01f770eff 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -81,8 +81,8 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg) value |= (phyaddr << priv->hw->mii.addr_shift) & priv->hw->mii.addr_mask; value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask; - value |= (priv->clk_csr & priv->hw->mii.clk_csr_mask) - << priv->hw->mii.clk_csr_shift; + value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift) + & priv->hw->mii.clk_csr_mask; if (priv->plat->has_gmac4) value |= MII_GMAC4_READ; @@ -122,8 +122,8 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg, & priv->hw->mii.addr_mask; value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask; - value |= ((priv->clk_csr & priv->hw->mii.clk_csr_mask) - << priv->hw->mii.clk_csr_shift); + value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift) + & priv->hw->mii.clk_csr_mask; if (priv->plat->has_gmac4) value |= MII_GMAC4_WRITE; From dc594ecd4185831031d3fef2853ee76908428107 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 22 Dec 2016 14:28:14 +0200 Subject: [PATCH 13/30] net/sched: act_tunnel_key: Fix setting UDP dst port in metadata under IPv6 The UDP dst port was provided to the helper function which sets the IPv6 IP tunnel meta-data under a wrong param order, fix that. Fixes: 75bfbca01e48 ('net/sched: act_tunnel_key: Add UDP dst port option') Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 7af712526f01..e3a58e021198 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -134,8 +134,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); - metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, - dst_port, TUNNEL_KEY, + metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, + 0, TUNNEL_KEY, key_id, 0); } From d9724772e69cb8076231202292665ca74eec13e1 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 22 Dec 2016 14:28:15 +0200 Subject: [PATCH 14/30] net/sched: cls_flower: Mandate mask when matching on flags When matching on flags, we should require the user to provide the mask and avoid using an all-ones mask. Not doing so causes matching on flags provided w.o mask to hit on the value being unset for all flags, which may not what the user wanted to happen. Fixes: faa3ffce7829 ('net/sched: cls_flower: Add support for matching on flags') Signed-off-by: Or Gerlitz Reported-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 35ac28d0720c..333f8e268431 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -442,32 +442,32 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask, } } -static void fl_set_key_flags(struct nlattr **tb, - u32 *flags_key, u32 *flags_mask) +static int fl_set_key_flags(struct nlattr **tb, + u32 *flags_key, u32 *flags_mask) { u32 key, mask; - if (!tb[TCA_FLOWER_KEY_FLAGS]) - return; + /* mask is mandatory for flags */ + if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + return -EINVAL; key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); - - if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) - mask = ~0; - else - mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); *flags_key = 0; *flags_mask = 0; fl_set_key_flag(key, mask, flags_key, flags_mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + + return 0; } static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { __be16 ethertype; + int ret = 0; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); @@ -614,9 +614,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); - fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + if (tb[TCA_FLOWER_KEY_FLAGS]) + ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); - return 0; + return ret; } static bool fl_mask_eq(struct fl_flow_mask *mask1, From 39b2dd765e0711e1efd1d1df089473a8dd93ad48 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 22 Dec 2016 18:19:16 -0500 Subject: [PATCH 15/30] inet: fix IP(V6)_RECVORIGDSTADDR for udp sockets Socket cmsg IP(V6)_RECVORIGDSTADDR checks that port range lies within the packet. For sockets that have transport headers pulled, transport offset can be negative. Use signed comparison to avoid overflow. Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") Reported-by: Nisar Jagabar Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- net/ipv6/datagram.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8b13881ed064..976073417b03 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -148,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); __be16 *ports = (__be16 *)skb_transport_header(skb); - if (skb_transport_offset(skb) + 4 > skb->len) + if (skb_transport_offset(skb) + 4 > (int)skb->len) return; /* All current transport protocols have the port numbers in the diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 0489e19258ad..1407426bc862 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -701,7 +701,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sockaddr_in6 sin6; __be16 *ports = (__be16 *) skb_transport_header(skb); - if (skb_transport_offset(skb) + 4 <= skb->len) { + if (skb_transport_offset(skb) + 4 <= (int)skb->len) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. From a98f91758995cb59611e61318dddd8a6956b52c3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 Dec 2016 11:16:22 -0500 Subject: [PATCH 16/30] ipv6: handle -EFAULT from skb_copy_bits By setting certain socket options on ipv6 raw sockets, we can confuse the length calculation in rawv6_push_pending_frames triggering a BUG_ON. RIP: 0010:[] [] rawv6_sendmsg+0xc30/0xc40 RSP: 0018:ffff881f6c4a7c18 EFLAGS: 00010282 RAX: 00000000fffffff2 RBX: ffff881f6c681680 RCX: 0000000000000002 RDX: ffff881f6c4a7cf8 RSI: 0000000000000030 RDI: ffff881fed0f6a00 RBP: ffff881f6c4a7da8 R08: 0000000000000000 R09: 0000000000000009 R10: ffff881fed0f6a00 R11: 0000000000000009 R12: 0000000000000030 R13: ffff881fed0f6a00 R14: ffff881fee39ba00 R15: ffff881fefa93a80 Call Trace: [] ? unmap_page_range+0x693/0x830 [] inet_sendmsg+0x67/0xa0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0xef/0x170 [] SyS_sendto+0xe/0x10 [] do_syscall_64+0x50/0xa0 [] entry_SYSCALL64_slow_path+0x25/0x25 Handle by jumping to the failure path if skb_copy_bits gets an EFAULT. Reproducer: #include #include #include #include #include #include #include #define LEN 504 int main(int argc, char* argv[]) { int fd; int zero = 0; char buf[LEN]; memset(buf, 0, LEN); fd = socket(AF_INET6, SOCK_RAW, 7); setsockopt(fd, SOL_IPV6, IPV6_CHECKSUM, &zero, 4); setsockopt(fd, SOL_IPV6, IPV6_DSTOPTS, &buf, LEN); sendto(fd, buf, 1, 0, (struct sockaddr *) buf, 110); } Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/ipv6/raw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 291ebc260e70..ea89073c8247 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -591,7 +591,11 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } offset += skb_transport_offset(skb); - BUG_ON(skb_copy_bits(skb, offset, &csum, 2)); + err = skb_copy_bits(skb, offset, &csum, 2); + if (err < 0) { + ip6_flush_pending_frames(sk); + goto out; + } /* in case cksum was not initialized */ if (unlikely(csum)) From 53f800e3baf980827c197a9332f63effe80d4809 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 23 Dec 2016 09:32:48 +0100 Subject: [PATCH 17/30] neigh: Send netevent after marking neigh as dead neigh_cleanup_and_release() is always called after marking a neighbour as dead, but it only notifies user space and not in-kernel listeners of the netevent notification chain. This can cause multiple problems. In my specific use case, it causes the listener (a switch driver capable of L3 offloads) to believe a neighbour entry is still valid, and is thus erroneously kept in the device's table. Fix that by sending a netevent after marking the neighbour as dead. Fixes: a6bf9e933daf ("mlxsw: spectrum_router: Offload neighbours based on NUD state change") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 782dd8663665..7bb12e07ffef 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) neigh->parms->neigh_cleanup(neigh); __neigh_notify(neigh, RTM_DELNEIGH, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } From 93a87e5e794fb71a51f97fbde6c0010680b62d70 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 23 Dec 2016 09:32:49 +0100 Subject: [PATCH 18/30] mlxsw: spectrum_router: Don't reflect dead neighs When a neighbour is considered to be dead, we should remove it from the device's table regardless of its NUD state. Without this patch, after setting a port to be administratively down we get the following errors when we periodically try to update the kernel about neighbours activity: [ 461.947268] mlxsw_spectrum 0000:03:00.0 sw1p3: Failed to find matching neighbour for IP=192.168.100.2 Fixes: a6bf9e933daf ("mlxsw: spectrum_router: Offload neighbours based on NUD state change") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 53126bf68ea9..a0f9742f62df 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -942,7 +942,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) char rauht_pl[MLXSW_REG_RAUHT_LEN]; struct net_device *dev; bool entry_connected; - u8 nud_state; + u8 nud_state, dead; bool updating; bool removing; bool adding; @@ -953,10 +953,11 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) dip = ntohl(*((__be32 *) n->primary_key)); memcpy(neigh_entry->ha, n->ha, sizeof(neigh_entry->ha)); nud_state = n->nud_state; + dead = n->dead; dev = n->dev; read_unlock_bh(&n->lock); - entry_connected = nud_state & NUD_VALID; + entry_connected = nud_state & NUD_VALID && !dead; adding = (!neigh_entry->offloaded) && entry_connected; updating = neigh_entry->offloaded && entry_connected; removing = neigh_entry->offloaded && !entry_connected; @@ -1351,7 +1352,7 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_neigh_entry *neigh_entry; struct net_device *dev = fib_nh->nh_dev; struct neighbour *n; - u8 nud_state; + u8 nud_state, dead; /* Take a reference of neigh here ensuring that neigh would * not be detructed before the nexthop entry is finished. @@ -1383,8 +1384,9 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp, list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list); read_lock_bh(&n->lock); nud_state = n->nud_state; + dead = n->dead; read_unlock_bh(&n->lock); - __mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID)); + __mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID && !dead)); return 0; } From 58312125da5806308bd69e075fedae30f8cf7794 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 23 Dec 2016 09:32:50 +0100 Subject: [PATCH 19/30] mlxsw: spectrum_router: Correctly remove nexthop groups At the end of the nexthop initialization process we determine whether the nexthop should be offloaded or not based on the NUD state of the neighbour representing it. After all the nexthops were initialized we refresh the nexthop group and potentially offload it to the device, in case some of the nexthops were resolved. Make the destruction of a nexthop group symmetric with its creation by marking all nexthops as invalid and then refresh the nexthop group to make sure it was removed from the device's tables. Fixes: b2157149b0b0 ("mlxsw: spectrum_router: Add the nexthop neigh activity update") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index a0f9742f62df..01d0efa9c5c7 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1396,6 +1396,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; + __mlxsw_sp_nexthop_neigh_update(nh, true); list_del(&nh->neigh_list_node); /* If that is the last nexthop connected to that neigh, remove from @@ -1454,6 +1455,8 @@ mlxsw_sp_nexthop_group_destroy(struct mlxsw_sp *mlxsw_sp, nh = &nh_grp->nexthops[i]; mlxsw_sp_nexthop_fini(mlxsw_sp, nh); } + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + WARN_ON_ONCE(nh_grp->adj_index_valid); kfree(nh_grp); } From 73b62bd085f4737679ea9afc7867fa5f99ba7d1b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:24 +0800 Subject: [PATCH 20/30] virtio-net: remove the warning before XDP linearizing Since we use EWMA to estimate the size of rx buffer. When rx buffer size is underestimated, it's usual to have a packet with more than one buffers. Consider this is not a bug, remove the warning and correct the comment before XDP linearizing. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 08327e005ccc..1067253b98be 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -552,14 +552,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct page *xdp_page; u32 act; - /* No known backend devices should send packets with - * more than a single buffer when XDP conditions are - * met. However it is not strictly illegal so the case - * is handled as an exception and a warning is thrown. - */ + /* This happens when rx buffer size is underestimated */ if (unlikely(num_buf > 1)) { - bpf_warn_invalid_xdp_buffer(); - /* linearize data for XDP */ xdp_page = xdp_linearize_page(rq, num_buf, page, offset, &len); From 275be061b33b945cfb120ee8a570f78c3ccafe56 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:25 +0800 Subject: [PATCH 21/30] virtio-net: correctly xmit linearized page on XDP_TX After we linearize page, we should xmit this page instead of the page of first buffer which may lead unexpected result. With this patch, we can see correct packet during XDP_TX. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1067253b98be..fe4562d395e3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) goto err_xdp; - act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len); + act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len); switch (act) { case XDP_PASS: if (unlikely(xdp_page != page)) From 56a86f84b8332afe8c6fcb4b09d09d9bf094e2db Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:26 +0800 Subject: [PATCH 22/30] virtio-net: fix page miscount during XDP linearizing We don't put page during linearizing, the would cause leaking when xmit through XDP_TX or the packet exceeds PAGE_SIZE. Fix them by put page accordingly. Also decrease the number of buffers during linearizing to make sure caller can free buffers correctly when packet exceeds PAGE_SIZE. With this patch, we won't get OOM after linearize huge number of packets. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fe4562d395e3..58ad40e17a74 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -483,7 +483,7 @@ xdp_xmit: * anymore. */ static struct page *xdp_linearize_page(struct receive_queue *rq, - u16 num_buf, + u16 *num_buf, struct page *p, int offset, unsigned int *len) @@ -497,7 +497,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, memcpy(page_address(page) + page_off, page_address(p) + offset, *len); page_off += *len; - while (--num_buf) { + while (--*num_buf) { unsigned int buflen; unsigned long ctx; void *buf; @@ -507,19 +507,22 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, if (unlikely(!ctx)) goto err_buf; - /* guard against a misconfigured or uncooperative backend that - * is sending packet larger than the MTU. - */ - if ((page_off + buflen) > PAGE_SIZE) - goto err_buf; - buf = mergeable_ctx_to_buf_address(ctx); p = virt_to_head_page(buf); off = buf - page_address(p); + /* guard against a misconfigured or uncooperative backend that + * is sending packet larger than the MTU. + */ + if ((page_off + buflen) > PAGE_SIZE) { + put_page(p); + goto err_buf; + } + memcpy(page_address(page) + page_off, page_address(p) + off, buflen); page_off += buflen; + put_page(p); } *len = page_off; @@ -555,7 +558,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, /* This happens when rx buffer size is underestimated */ if (unlikely(num_buf > 1)) { /* linearize data for XDP */ - xdp_page = xdp_linearize_page(rq, num_buf, + xdp_page = xdp_linearize_page(rq, &num_buf, page, offset, &len); if (!xdp_page) goto err_xdp; From 1830f8935f3b173d229b86e9927b3b6d599aa1f6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:27 +0800 Subject: [PATCH 23/30] virtio-net: correctly handle XDP_PASS for linearized packets When XDP_PASS were determined for linearized packets, we try to get new buffers in the virtqueue and build skbs from them. This is wrong, we should create skbs based on existed buffers instead. Fixing them by creating skb based on xdp_page. With this patch "ping 192.168.100.4 -s 3900 -M do" works for XDP_PASS. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 58ad40e17a74..470293e2b84d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -578,8 +578,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len); switch (act) { case XDP_PASS: - if (unlikely(xdp_page != page)) - __free_pages(xdp_page, 0); + /* We can only create skb based on xdp_page. */ + if (unlikely(xdp_page != page)) { + rcu_read_unlock(); + put_page(page); + head_skb = page_to_skb(vi, rq, xdp_page, + 0, len, PAGE_SIZE); + return head_skb; + } break; case XDP_TX: if (unlikely(xdp_page != page)) From b00f70b0dacb3d2e009afce860ebc90219072f5c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:28 +0800 Subject: [PATCH 24/30] virtio-net: unbreak csumed packets for XDP_PASS We drop csumed packet when do XDP for packets. This breaks XDP_PASS when GUEST_CSUM is supported. Fix this by allowing csum flag to be set. With this patch, simple TCP works for XDP_PASS. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 470293e2b84d..0778dc88597e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -440,7 +440,7 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtio_net_hdr_mrg_rxbuf *hdr = buf; u32 act; - if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) + if (unlikely(hdr->hdr.gso_type)) goto err_xdp; act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len); switch (act) { @@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, * the receive path after XDP is loaded. In practice I * was not able to create this condition. */ - if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) + if (unlikely(hdr->hdr.gso_type)) goto err_xdp; act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len); From 5c33474d41af09f09c98f1df70f267920587bec0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:29 +0800 Subject: [PATCH 25/30] virtio-net: make rx buf size estimation works for XDP We don't update ewma rx buf size in the case of XDP. This will lead underestimation of rx buf size which causes host to produce more than one buffers. This will greatly increase the possibility of XDP page linearization. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0778dc88597e..77ae358ec630 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -584,10 +584,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, put_page(page); head_skb = page_to_skb(vi, rq, xdp_page, 0, len, PAGE_SIZE); + ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); return head_skb; } break; case XDP_TX: + ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); if (unlikely(xdp_page != page)) goto err_xdp; rcu_read_unlock(); @@ -596,6 +598,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, default: if (unlikely(xdp_page != page)) __free_pages(xdp_page, 0); + ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); goto err_xdp; } } From 92502fe86c7c9b3f8543f29641a3c71805e82757 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:30 +0800 Subject: [PATCH 26/30] virtio-net: forbid XDP when VIRTIO_NET_F_GUEST_UFO is support When VIRTIO_NET_F_GUEST_UFO is negotiated, host could still send UFO packet that exceeds a single page which could not be handled correctly by XDP. So this patch forbids setting XDP when GUEST_UFO is supported. While at it, forbid XDP for ECN (which comes only from GRO) too to prevent user from misconfiguration. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 77ae358ec630..c1f66d8bfb7b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1684,7 +1684,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog) int i, err; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || - virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) { + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) { netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n"); return -EOPNOTSUPP; } From c47a43d3004ad6ff2a94a670cb3274cd6338d41e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:31 +0800 Subject: [PATCH 27/30] virtio-net: remove big packet XDP codes Now we in fact don't allow XDP for big packets, remove its codes. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 44 +++------------------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c1f66d8bfb7b..e53365a86ca3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -344,11 +344,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, /* Free up any pending old buffers before queueing new ones. */ while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { struct page *sent_page = virt_to_head_page(xdp_sent); - - if (vi->mergeable_rx_bufs) - put_page(sent_page); - else - give_pages(rq, sent_page); + put_page(sent_page); } /* Zero header and leave csum up to XDP layers */ @@ -360,15 +356,8 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, xdp->data, GFP_ATOMIC); if (unlikely(err)) { - if (vi->mergeable_rx_bufs) - put_page(page); - else - give_pages(rq, page); + put_page(page); return; // On error abort to avoid unnecessary kick - } else if (!vi->mergeable_rx_bufs) { - /* If not mergeable bufs must be big packets so cleanup pages */ - give_pages(rq, (struct page *)page->private); - page->private = 0; } virtqueue_kick(sq->vq); @@ -430,44 +419,17 @@ static struct sk_buff *receive_big(struct net_device *dev, void *buf, unsigned int len) { - struct bpf_prog *xdp_prog; struct page *page = buf; - struct sk_buff *skb; + struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); - rcu_read_lock(); - xdp_prog = rcu_dereference(rq->xdp_prog); - if (xdp_prog) { - struct virtio_net_hdr_mrg_rxbuf *hdr = buf; - u32 act; - - if (unlikely(hdr->hdr.gso_type)) - goto err_xdp; - act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len); - switch (act) { - case XDP_PASS: - break; - case XDP_TX: - rcu_read_unlock(); - goto xdp_xmit; - case XDP_DROP: - default: - goto err_xdp; - } - } - rcu_read_unlock(); - - skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); if (unlikely(!skb)) goto err; return skb; -err_xdp: - rcu_read_unlock(); err: dev->stats.rx_dropped++; give_pages(rq, page); -xdp_xmit: return NULL; } From bb91accf27335c6dc460e202991ca140fa21e1b5 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Dec 2016 22:37:32 +0800 Subject: [PATCH 28/30] virtio-net: XDP support for small buffers Commit f600b6905015 ("virtio_net: Add XDP support") leaves the case of small receive buffer untouched. This will confuse the user who want to set XDP but use small buffers. Other than forbid XDP in small buffer mode, let's make it work. XDP then can only work at skb->data since virtio-net create skbs during refill, this is sub optimal which could be optimized in the future. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 112 ++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 25 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index e53365a86ca3..5deeda61d6d3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -333,9 +333,9 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, static void virtnet_xdp_xmit(struct virtnet_info *vi, struct receive_queue *rq, struct send_queue *sq, - struct xdp_buff *xdp) + struct xdp_buff *xdp, + void *data) { - struct page *page = virt_to_head_page(xdp->data); struct virtio_net_hdr_mrg_rxbuf *hdr; unsigned int num_sg, len; void *xdp_sent; @@ -343,20 +343,45 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, /* Free up any pending old buffers before queueing new ones. */ while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { - struct page *sent_page = virt_to_head_page(xdp_sent); - put_page(sent_page); + if (vi->mergeable_rx_bufs) { + struct page *sent_page = virt_to_head_page(xdp_sent); + + put_page(sent_page); + } else { /* small buffer */ + struct sk_buff *skb = xdp_sent; + + kfree_skb(skb); + } } - /* Zero header and leave csum up to XDP layers */ - hdr = xdp->data; - memset(hdr, 0, vi->hdr_len); + if (vi->mergeable_rx_bufs) { + /* Zero header and leave csum up to XDP layers */ + hdr = xdp->data; + memset(hdr, 0, vi->hdr_len); - num_sg = 1; - sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); + num_sg = 1; + sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); + } else { /* small buffer */ + struct sk_buff *skb = data; + + /* Zero header and leave csum up to XDP layers */ + hdr = skb_vnet_hdr(skb); + memset(hdr, 0, vi->hdr_len); + + num_sg = 2; + sg_init_table(sq->sg, 2); + sg_set_buf(sq->sg, hdr, vi->hdr_len); + skb_to_sgvec(skb, sq->sg + 1, 0, skb->len); + } err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, - xdp->data, GFP_ATOMIC); + data, GFP_ATOMIC); if (unlikely(err)) { - put_page(page); + if (vi->mergeable_rx_bufs) { + struct page *page = virt_to_head_page(xdp->data); + + put_page(page); + } else /* small buffer */ + kfree_skb(data); return; // On error abort to avoid unnecessary kick } @@ -366,23 +391,26 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, static u32 do_xdp_prog(struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, - struct page *page, int offset, int len) + void *data, int len) { int hdr_padded_len; struct xdp_buff xdp; + void *buf; unsigned int qp; u32 act; - u8 *buf; - buf = page_address(page) + offset; - - if (vi->mergeable_rx_bufs) + if (vi->mergeable_rx_bufs) { hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); - else - hdr_padded_len = sizeof(struct padded_vnet_hdr); + xdp.data = data + hdr_padded_len; + xdp.data_end = xdp.data + (len - vi->hdr_len); + buf = data; + } else { /* small buffers */ + struct sk_buff *skb = data; - xdp.data = buf + hdr_padded_len; - xdp.data_end = xdp.data + (len - vi->hdr_len); + xdp.data = skb->data; + xdp.data_end = xdp.data + len; + buf = skb->data; + } act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -392,8 +420,8 @@ static u32 do_xdp_prog(struct virtnet_info *vi, qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); - xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4); - virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp); + xdp.data = buf; + virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data); return XDP_TX; default: bpf_warn_invalid_xdp_action(act); @@ -403,14 +431,47 @@ static u32 do_xdp_prog(struct virtnet_info *vi, } } -static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len) +static struct sk_buff *receive_small(struct net_device *dev, + struct virtnet_info *vi, + struct receive_queue *rq, + void *buf, unsigned int len) { struct sk_buff * skb = buf; + struct bpf_prog *xdp_prog; len -= vi->hdr_len; skb_trim(skb, len); + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (xdp_prog) { + struct virtio_net_hdr_mrg_rxbuf *hdr = buf; + u32 act; + + if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) + goto err_xdp; + act = do_xdp_prog(vi, rq, xdp_prog, skb, len); + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + rcu_read_unlock(); + goto xdp_xmit; + case XDP_DROP: + default: + goto err_xdp; + } + } + rcu_read_unlock(); + return skb; + +err_xdp: + rcu_read_unlock(); + dev->stats.rx_dropped++; + kfree_skb(skb); +xdp_xmit: + return NULL; } static struct sk_buff *receive_big(struct net_device *dev, @@ -537,7 +598,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; - act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len); + act = do_xdp_prog(vi, rq, xdp_prog, + page_address(xdp_page) + offset, len); switch (act) { case XDP_PASS: /* We can only create skb based on xdp_page. */ @@ -672,7 +734,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len); else - skb = receive_small(vi, buf, len); + skb = receive_small(dev, vi, rq, buf, len); if (unlikely(!skb)) return; From 58b94d88deb9ac913740e7778f2005f3ce6d0443 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 23 Dec 2016 14:29:02 -0200 Subject: [PATCH 29/30] sctp: do not loose window information if in rwnd_over It's possible that we receive a packet that is larger than current window. If it's the first packet in this way, it will cause it to increase rwnd_over. Then, if we receive another data chunk (specially as SCTP allows you to have one data chunk in flight even during 0 window), rwnd_over will be overwritten instead of added to. In the long run, this could cause the window to grow bigger than its initial size, as rwnd_over would be charged only for the last received data chunk while the code will try open the window for all packets that were received and had its value in rwnd_over overwritten. This, then, can lead to the worsening of payload/buffer ratio and cause rwnd_press to kick in more often. The fix is to sum it too, same as is done for rwnd_press, so that if we receive 3 chunks after closing the window, we still have to release that same amount before re-opening it. Log snippet from sctp_test exhibiting the issue: [ 146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000 rwnd decreased by 1 to (0, 1, 114221) [ 146.209232] sctp: sctp_assoc_rwnd_decrease: association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1! [ 146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000 rwnd decreased by 1 to (0, 1, 114221) [ 146.209232] sctp: sctp_assoc_rwnd_decrease: association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1! [ 146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000 rwnd decreased by 1 to (0, 1, 114221) [ 146.209232] sctp: sctp_assoc_rwnd_decrease: association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1! [ 146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000 rwnd decreased by 1 to (0, 1, 114221) Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/associola.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 68428e1f7181..56ddcfaeb4f6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1539,7 +1539,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) asoc->rwnd = 0; } } else { - asoc->rwnd_over = len - asoc->rwnd; + asoc->rwnd_over += len - asoc->rwnd; asoc->rwnd = 0; } From 1636098c46ac52c7fec384fe629144e8e03487b1 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 23 Dec 2016 14:29:37 -0200 Subject: [PATCH 30/30] sctp: fix recovering from 0 win with small data chunks Currently if SCTP closes the receive window with window pressure, mostly caused by excessive skb overhead on payload/overheads ratio, SCTP will close the window abruptly while saving the delta on rwnd_press. It will start recovering rwnd as the chunks are consumed by the application and the rwnd_press will be only recovered after rwnd reach the same value as of rwnd_press, mostly to prevent silly window syndrome. Thing is, this is very inefficient with small data chunks, as with those it will never reach back that value, and thus it will never recover from such pressure. This means that we will not issue window updates when recovering from 0 window and will rely on a sender retransmit to notice it. The fix here is to remove such threshold, as no value is good enough: it depends on the (avg) chunk sizes being used. Test with netperf -t SCTP_STREAM -- -m 1, and trigger 0 window by sending SIGSTOP to netserver, sleep 1.2, and SIGCONT. Rate limited to 845kbps, for visibility. Capture done at netserver side. Previously: 01.500751 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632372996] [a_rwnd 99153] [ 01.500752 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632372997] [SID: 0] [SS 01.517471 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS 01.517483 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap 01.517485 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373083] [SID: 0] [SS 01.517488 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap 01.534168 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373096] [SID: 0] [SS 01.534180 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap 01.534181 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373169] [SID: 0] [SS 01.534185 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap 02.525978 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS 02.526021 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap (window update missed) 04.573807 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS 04.779370 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373082] [a_rwnd 859] [#g 04.789162 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373083] [SID: 0] [SS 04.789323 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373156] [SID: 0] [SS 04.789372 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373228] [a_rwnd 786] [#g After: 02.568957 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098728] [a_rwnd 99153] 02.568961 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098729] [SID: 0] [S 02.585631 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S 02.585666 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga 02.585671 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098815] [SID: 0] [S 02.585683 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga 02.602330 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098828] [SID: 0] [S 02.602359 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga 02.602363 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098901] [SID: 0] [S 02.602372 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga 03.600788 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S 03.600830 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga 03.619455 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 13508] 03.619479 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 27017] 03.619497 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 40526] 03.619516 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 54035] 03.619533 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 67544] 03.619552 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 81053] 03.619570 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 94562] (following data transmission triggered by window updates above) 03.633504 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S 03.836445 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098814] [a_rwnd 100000] 03.843125 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098815] [SID: 0] [S 03.843285 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098888] [SID: 0] [S 03.843345 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098960] [a_rwnd 99894] 03.856546 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098961] [SID: 0] [S 03.866450 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490099011] [SID: 0] [S Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/associola.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 56ddcfaeb4f6..d3cc30c25c41 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1471,7 +1471,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) * threshold. The idea is to recover slowly, but up * to the initial advertised window. */ - if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { + if (asoc->rwnd_press) { int change = min(asoc->pathmtu, asoc->rwnd_press); asoc->rwnd += change; asoc->rwnd_press -= change;