From cb853da3a368c40300a0e940f86be582037bb082 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Fri, 9 Dec 2016 11:34:13 +0000
Subject: [PATCH 01/30] brcmfmac: fix memory leak in brcmf_cfg80211_attach()

In brcmf_cfg80211_attach() there was one error path not properly
handled as it leaked memory allocated in brcmf_btcoex_attach().

Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index ccae3bbe7db2..7ffc4aba5bab 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -6868,7 +6868,7 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr,
 
 	err = brcmf_p2p_attach(cfg, p2pdev_forced);
 	if (err) {
-		brcmf_err("P2P initilisation failed (%d)\n", err);
+		brcmf_err("P2P initialisation failed (%d)\n", err);
 		goto wiphy_unreg_out;
 	}
 	err = brcmf_btcoex_attach(cfg);
@@ -6893,7 +6893,7 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr,
 	err = brcmf_fweh_activate_events(ifp);
 	if (err) {
 		brcmf_err("FWEH activation failed (%d)\n", err);
-		goto wiphy_unreg_out;
+		goto detach;
 	}
 
 	/* Fill in some of the advertised nl80211 supported features */
@@ -6908,6 +6908,9 @@ struct brcmf_cfg80211_info *brcmf_cfg80211_attach(struct brcmf_pub *drvr,
 
 	return cfg;
 
+detach:
+	brcmf_btcoex_detach(cfg);
+	brcmf_p2p_detach(&cfg->p2p);
 wiphy_unreg_out:
 	wiphy_unregister(cfg->wiphy);
 priv_out:

From 2b66325d5ea7c2a39ac69ed83b6979afe480d81a Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Fri, 9 Dec 2016 11:34:14 +0000
Subject: [PATCH 02/30] brcmfmac: fix uninitialized field in scheduled scan
 ssid configuration

The scheduled scan ssid configuration in firmware has a flags field that
was not initialized resulting in unexpected behaviour.

Fixes: e3bdb7cc0300 ("brcmfmac: fix handling ssids in .sched_scan_start() callback")
Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c
index f273cab0da10..9a25e79a46cf 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pno.c
@@ -137,6 +137,7 @@ static int brcmf_pno_add_ssid(struct brcmf_if *ifp, struct cfg80211_ssid *ssid,
 	pfn.wpa_auth = cpu_to_le32(BRCMF_PNO_WPA_AUTH_ANY);
 	pfn.wsec = cpu_to_le32(0);
 	pfn.infra = cpu_to_le32(1);
+	pfn.flags = 0;
 	if (active)
 		pfn.flags = cpu_to_le32(1 << BRCMF_PNO_HIDDEN_BIT);
 	pfn.ssid.SSID_len = cpu_to_le32(ssid->ssid_len);

From d4166b8b33650d9dc89715c9540ba0f261490d4d Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 15 Dec 2016 11:23:19 +0200
Subject: [PATCH 03/30] ath10k: free host-mem with DMA_BIRECTIONAL flag

Hopefully this fixes the problem reported by Kalle:

Noticed this in my log, but I don't have time to investigate this in
detail right now:

[  413.795346] IPv6: ADDRCONF(NETDEV_UP): wlan0: link is not ready
[  414.158755] IPv6: ADDRCONF(NETDEV_CHANGE): wlan0: link becomes ready
[  477.439659] ath10k_pci 0000:02:00.0: could not get mac80211 beacon
[  481.666630] ------------[ cut here ]------------
[  481.666669] WARNING: CPU: 0 PID: 1978 at lib/dma-debug.c:1155 check_unmap+0x320/0x8e0
[  481.666688] ath10k_pci 0000:02:00.0: DMA-API: device driver frees DMA memory with different direction [device address=0x000000002d130000] [size=63800 bytes] [mapped with DMA_BIDIRECTIONAL] [unmapped with DMA_TO_DEVICE]
[  481.666703] Modules linked in: ctr ccm ath10k_pci(E-) ath10k_core(E) ath(E) mac80211(E) cfg80211(E) snd_hda_codec_hdmi snd_hda_codec_idt snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi arc4 snd_rawmidi snd_seq_midi_event snd_seq btusb btintel snd_seq_device joydev coret
[  481.671468] CPU: 0 PID: 1978 Comm: rmmod Tainted: G            E   4.9.0-rc7-wt+ #54
[  481.671478] Hardware name: Hewlett-Packard HP ProBook 6540b/1722, BIOS 68CDD Ver. F.04 01/27/2010
[  481.671489]  ef49dcec c842ee92 c8b5830e ef49dd34 ef49dd20 c80850f5 c8b5a13c ef49dd50
[  481.671560]  000007ba c8b5830e 00000483 c8461830 c8461830 00000483 ef49ddcc f34e64b8
[  481.671641]  c8b58360 ef49dd3c c80851bb 00000009 00000000 ef49dd34 c8b5a13c ef49dd50
[  481.671716] Call Trace:
[  481.671731]  [<c842ee92>] dump_stack+0x76/0xb4
[  481.671745]  [<c80850f5>] __warn+0xe5/0x100
[  481.671757]  [<c8461830>] ? check_unmap+0x320/0x8e0
[  481.671769]  [<c8461830>] ? check_unmap+0x320/0x8e0
[  481.671780]  [<c80851bb>] warn_slowpath_fmt+0x3b/0x40
[  481.671791]  [<c8461830>] check_unmap+0x320/0x8e0
[  481.671804]  [<c8462054>] debug_dma_unmap_page+0x84/0xa0
[  481.671835]  [<f937cd7a>] ath10k_wmi_free_host_mem+0x9a/0xe0 [ath10k_core]
[  481.671861]  [<f9363400>] ath10k_core_destroy+0x50/0x60 [ath10k_core]
[  481.671875]  [<f8e13969>] ath10k_pci_remove+0x79/0xa0 [ath10k_pci]
[  481.671889]  [<c848d8d8>] pci_device_remove+0x38/0xb0
[  481.671901]  [<c859fe4b>] __device_release_driver+0x7b/0x110
[  481.671913]  [<c85a00e7>] driver_detach+0x97/0xa0
[  481.671923]  [<c859ef8b>] bus_remove_driver+0x4b/0xb0
[  481.671934]  [<c85a0cda>] driver_unregister+0x2a/0x60
[  481.671949]  [<c848c888>] pci_unregister_driver+0x18/0x70
[  481.671965]  [<f8e14dae>] ath10k_pci_exit+0xd/0x25f [ath10k_pci]
[  481.671979]  [<c812bb84>] SyS_delete_module+0xf4/0x180
[  481.671995]  [<c81f801b>] ? __might_fault+0x8b/0xa0
[  481.672009]  [<c80037d0>] do_fast_syscall_32+0xa0/0x1e0
[  481.672025]  [<c88d4c88>] sysenter_past_esp+0x45/0x74
[  481.672037] ---[ end trace 3fd23759e17e1622 ]---
[  481.672049] Mapped at:
[  481.672060]  [  481.672072] [<c846062c>] debug_dma_map_page.part.25+0x1c/0xf0
[  481.672083]  [  481.672095] [<c8460799>] debug_dma_map_page+0x99/0xc0
[  481.672106]  [  481.672132] [<f93745ec>] ath10k_wmi_alloc_chunk+0x12c/0x1f0 [ath10k_core]
[  481.672142]  [  481.672168] [<f937d0c4>] ath10k_wmi_event_service_ready_work+0x304/0x540 [ath10k_core]
[  481.672178]  [  481.672190] [<c80a3643>] process_one_work+0x1c3/0x670
[  482.137134] ath10k_pci 0000:02:00.0: pci irq msi oper_irq_mode 2 irq_mode 0 reset_mode 0
[  482.313144] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/pre-cal-pci-0000:02:00.0.bin failed with error -2
[  482.313274] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/cal-pci-0000:02:00.0.bin failed with error -2
[  482.313768] ath10k_pci 0000:02:00.0: qca988x hw2.0 target 0x4100016c chip_id 0x043202ff sub 0000:0000
[  482.313777] ath10k_pci 0000:02:00.0: kconfig debug 1 debugfs 1 tracing 1 dfs 0 testmode 1
[  482.313974] ath10k_pci 0000:02:00.0: firmware ver 10.2.4.70.59-2 api 5 features no-p2p,raw-mode,mfp,allows-mesh-bcast crc32 4159f498
[  482.369858] ath10k_pci 0000:02:00.0: Direct firmware load for ath10k/QCA988X/hw2.0/board-2.bin failed with error -2
[  482.370011] ath10k_pci 0000:02:00.0: board_file api 1 bmi_id N/A crc32 bebc7c08
[  483.596770] ath10k_pci 0000:02:00.0: htt-ver 2.1 wmi-op 5 htt-op 2 cal otp max-sta 128 raw 0 hwcrypto 1
[  483.701686] ath: EEPROM regdomain: 0x0
[  483.701706] ath: EEPROM indicates default country code should be used
[  483.701713] ath: doing EEPROM country->regdmn map search
[  483.701721] ath: country maps to regdmn code: 0x3a
[  483.701730] ath: Country alpha2 being used: US
[  483.701737] ath: Regpair used: 0x3a

Reported-by: Kalle Valo <kvalo@qca.qualcomm.com>
Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index c893314a191f..50d6ee6afe26 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -8271,7 +8271,7 @@ void ath10k_wmi_free_host_mem(struct ath10k *ar)
 		dma_unmap_single(ar->dev,
 				 ar->wmi.mem_chunks[i].paddr,
 				 ar->wmi.mem_chunks[i].len,
-				 DMA_TO_DEVICE);
+				 DMA_BIDIRECTIONAL);
 		kfree(ar->wmi.mem_chunks[i].vaddr);
 	}
 

From d1f1c0e289e1bc46cd6873ba6dd6c627f459e7fa Mon Sep 17 00:00:00 2001
From: Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de>
Date: Tue, 13 Dec 2016 18:08:07 +0100
Subject: [PATCH 04/30] ath9k: do not return early to fix rcu unlocking

Starting with commit d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb
where possible") the driver uses rcu_read_lock() && rcu_read_unlock(), yet on
returning early in ath_tx_edma_tasklet() the unlock is missing leading to stalls
and suspicious RCU usage:

 ===============================
 [ INFO: suspicious RCU usage. ]
 4.9.0-rc8 #11 Not tainted
 -------------------------------
 kernel/rcu/tree.c:705 Illegal idle entry in RCU read-side critical section.!

 other info that might help us debug this:

 RCU used illegally from idle CPU!
 rcu_scheduler_active = 1, debug_locks = 0
 RCU used illegally from extended quiescent state!
 1 lock held by swapper/7/0:
 #0:
  (
 rcu_read_lock
 ){......}
 , at:
 [<ffffffffa06ed110>] ath_tx_edma_tasklet+0x0/0x450 [ath9k]

 stack backtrace:
 CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.0-rc8 #11
 Hardware name: Acer Aspire V3-571G/VA50_HC_CR, BIOS V2.21 12/16/2013
  ffff88025efc3f38 ffffffff8132b1e5 ffff88017ede4540 0000000000000001
  ffff88025efc3f68 ffffffff810a25f7 ffff88025efcee60 ffff88017edebdd8
  ffff88025eeb5400 0000000000000091 ffff88025efc3f88 ffffffff810c3cd4
 Call Trace:
  <IRQ>
  [<ffffffff8132b1e5>] dump_stack+0x68/0x93
  [<ffffffff810a25f7>] lockdep_rcu_suspicious+0xd7/0x110
  [<ffffffff810c3cd4>] rcu_eqs_enter_common.constprop.85+0x154/0x200
  [<ffffffff810c5a54>] rcu_irq_exit+0x44/0xa0
  [<ffffffff81058631>] irq_exit+0x61/0xd0
  [<ffffffff81018d25>] do_IRQ+0x65/0x110
  [<ffffffff81672189>] common_interrupt+0x89/0x89
  <EOI>
  [<ffffffff814ffe11>] ? cpuidle_enter_state+0x151/0x200
  [<ffffffff814ffee2>] cpuidle_enter+0x12/0x20
  [<ffffffff8109a6ae>] call_cpuidle+0x1e/0x40
  [<ffffffff8109a8f6>] cpu_startup_entry+0x146/0x220
  [<ffffffff810336f8>] start_secondary+0x148/0x170

Signed-off-by: Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de>
Fixes: d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb where possible")
Cc: <stable@vger.kernel.org> # v4.9
Acked-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Gabriel Craciunescu <nix.or.die@gmail.com>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 drivers/net/wireless/ath/ath9k/xmit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 486afa98a5b8..4e2f3ac266c3 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -2713,7 +2713,7 @@ void ath_tx_edma_tasklet(struct ath_softc *sc)
 		fifo_list = &txq->txq_fifo[txq->txq_tailidx];
 		if (list_empty(fifo_list)) {
 			ath_txq_unlock(sc, txq);
-			return;
+			break;
 		}
 
 		bf = list_first_entry(fifo_list, struct ath_buf, list);

From 22b68b93ae2506bd56ee3bf232a51bc8ab955b56 Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Mon, 19 Dec 2016 20:38:12 -0600
Subject: [PATCH 05/30] rtlwifi: Fix kernel oops introduced with commit
 e49656147359

With commit e49656147359 {"rtlwifi: Use dev_kfree_skb_irq instead of
kfree_skb"), the method used to free an skb was changed because the
kfree_skb() was inside a spinlock. What was forgotten is that kfree_skb()
guards against a NULL value for the argument. Routine dev_kfree_skb_irq()
does not, and a test is needed to prevent kernel panics.

Fixes: e49656147359 ("rtlwifi: Use dev_kfree_skb_irq instead of kfree_skb")
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Stable <stable@vger.kernel.org> # 4.9+
Cc: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 2caa4ad04dba..ded1493fee9c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1829,7 +1829,8 @@ bool rtl_cmd_send_packet(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	spin_lock_irqsave(&rtlpriv->locks.irq_th_lock, flags);
 	pskb = __skb_dequeue(&ring->queue);
-	dev_kfree_skb_irq(pskb);
+	if (pskb)
+		dev_kfree_skb_irq(pskb);
 
 	/*this is wrong, fill_tx_cmddesc needs update*/
 	pdesc = &ring->desc[0];

From 76b15923b777aa2660029629179550124c1fc40e Mon Sep 17 00:00:00 2001
From: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
Date: Tue, 20 Dec 2016 10:14:30 -0500
Subject: [PATCH 06/30] be2net: Increase skb headroom size to 256 bytes

The driver currently allocates 128 bytes of skb headroom.
This was found to be insufficient with some configurations
like Geneve tunnels, which resulted in skb head reallocations.

Increase the headroom to 256 bytes to fix this.

Signed-off-by: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Suresh Reddy <suresh.reddy@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 6cfa63a5e9b4..4c30c44b242e 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -65,7 +65,7 @@
 /* Number of bytes of an RX frame that are copied to skb->data */
 #define BE_HDR_LEN		((u16) 64)
 /* allocate extra space to allow tunneling decapsulation without head reallocation */
-#define BE_RX_SKB_ALLOC_SIZE (BE_HDR_LEN + 64)
+#define BE_RX_SKB_ALLOC_SIZE	256
 
 #define BE_MAX_JUMBO_FRAME_SIZE	9018
 #define BE_MIN_MTU		256

From f217bfde24077b6684e625dd618e3a522e6f272a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Heiko=20St=C3=BCbner?= <heiko@sntech.de>
Date: Tue, 20 Dec 2016 17:17:06 +0100
Subject: [PATCH 07/30] net: ethernet: stmmac: dwmac-rk: make clk enablement
 first in powerup

Right now the dwmac-rk tries to set up the GRF-specific speed and link
options before enabling clocks, phys etc and on previous socs this works
because the GRF is supplied on the whole by one clock.

On the rk3399 however the GRF (General Register Files) clock-supply
has been split into multiple clocks and while there is no specific
grf-gmac clock like for other sub-blocks, it seems the mac-specific
portions are actually supplied by the general mac clock.

This results in hangs on rk3399 boards if the driver is build as module.
When built in te problem of course doesn't surface, as the clocks
are of course still on at the stage before clock_disable_unused.

To solve this, simply move the clock enablement to the first position
in the powerup callback. This is also a good idea in general to
enable clocks before everything else.

Tested on rk3288, rk3368 and rk3399 the dwmac still works on all of them.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 77ab0a85f067..fa6e9704c077 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -864,6 +864,10 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv)
 	int ret;
 	struct device *dev = &bsp_priv->pdev->dev;
 
+	ret = gmac_clk_enable(bsp_priv, true);
+	if (ret)
+		return ret;
+
 	/*rmii or rgmii*/
 	if (bsp_priv->phy_iface == PHY_INTERFACE_MODE_RGMII) {
 		dev_info(dev, "init for RGMII\n");
@@ -880,10 +884,6 @@ static int rk_gmac_powerup(struct rk_priv_data *bsp_priv)
 	if (ret)
 		return ret;
 
-	ret = gmac_clk_enable(bsp_priv, true);
-	if (ret)
-		return ret;
-
 	pm_runtime_enable(dev);
 	pm_runtime_get_sync(dev);
 

From 8354491c9d5b06709384cea91d13019bf5e61449 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Wed, 21 Dec 2016 11:28:49 +0100
Subject: [PATCH 08/30] net: mvpp2: fix dma unmapping of TX buffers for
 fragments

Since commit 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX
buffers unmapping"), we are not correctly DMA unmapping TX buffers for
fragments.

Indeed, the mvpp2_txq_inc_put() function only stores in the
txq_cpu->tx_buffs[] array the physical address of the buffer to be
DMA-unmapped when skb != NULL. In addition, when DMA-unmapping, we use
skb_headlen(skb) to get the size to be unmapped. Both of this works fine
for TX descriptors that are associated directly to a SKB, but not the
ones that are used for fragments, with a NULL pointer as skb:

 - We have a NULL physical address when calling DMA unmap
 - skb_headlen(skb) crashes because skb is NULL

This causes random crashes when fragments are used.

To solve this problem, we need to:

 - Store the physical address of the buffer to be unmapped
   unconditionally, regardless of whether it is tied to a SKB or not.

 - Store the length of the buffer to be unmapped, which requires a new
   field.

Instead of adding a third array to store the length of the buffer to be
unmapped, and as suggested by David Miller, this commit refactors the
tx_buffs[] and tx_skb[] arrays of 'struct mvpp2_txq_pcpu' into a
separate structure 'mvpp2_txq_pcpu_buf', to which a 'size' field is
added. Therefore, instead of having three arrays to allocate/free, we
have a single one, which also improve data locality, reducing the
impact on the CPU cache.

Fixes: 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping")
Reported-by: Raphael G <raphael.glon@corp.ovh.com>
Cc: Raphael G <raphael.glon@corp.ovh.com>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 59 ++++++++++++++--------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index dabc5418efcc..cda04b3126bc 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -770,6 +770,17 @@ struct mvpp2_rx_desc {
 	u32 reserved8;
 };
 
+struct mvpp2_txq_pcpu_buf {
+	/* Transmitted SKB */
+	struct sk_buff *skb;
+
+	/* Physical address of transmitted buffer */
+	dma_addr_t phys;
+
+	/* Size transmitted */
+	size_t size;
+};
+
 /* Per-CPU Tx queue control */
 struct mvpp2_txq_pcpu {
 	int cpu;
@@ -785,11 +796,8 @@ struct mvpp2_txq_pcpu {
 	/* Number of Tx DMA descriptors reserved for each CPU */
 	int reserved_num;
 
-	/* Array of transmitted skb */
-	struct sk_buff **tx_skb;
-
-	/* Array of transmitted buffers' physical addresses */
-	dma_addr_t *tx_buffs;
+	/* Infos about transmitted buffers */
+	struct mvpp2_txq_pcpu_buf *buffs;
 
 	/* Index of last TX DMA descriptor that was inserted */
 	int txq_put_index;
@@ -979,10 +987,11 @@ static void mvpp2_txq_inc_put(struct mvpp2_txq_pcpu *txq_pcpu,
 			      struct sk_buff *skb,
 			      struct mvpp2_tx_desc *tx_desc)
 {
-	txq_pcpu->tx_skb[txq_pcpu->txq_put_index] = skb;
-	if (skb)
-		txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] =
-							 tx_desc->buf_phys_addr;
+	struct mvpp2_txq_pcpu_buf *tx_buf =
+		txq_pcpu->buffs + txq_pcpu->txq_put_index;
+	tx_buf->skb = skb;
+	tx_buf->size = tx_desc->data_size;
+	tx_buf->phys = tx_desc->buf_phys_addr;
 	txq_pcpu->txq_put_index++;
 	if (txq_pcpu->txq_put_index == txq_pcpu->size)
 		txq_pcpu->txq_put_index = 0;
@@ -4401,17 +4410,16 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
 	int i;
 
 	for (i = 0; i < num; i++) {
-		dma_addr_t buf_phys_addr =
-				    txq_pcpu->tx_buffs[txq_pcpu->txq_get_index];
-		struct sk_buff *skb = txq_pcpu->tx_skb[txq_pcpu->txq_get_index];
+		struct mvpp2_txq_pcpu_buf *tx_buf =
+			txq_pcpu->buffs + txq_pcpu->txq_get_index;
 
 		mvpp2_txq_inc_get(txq_pcpu);
 
-		dma_unmap_single(port->dev->dev.parent, buf_phys_addr,
-				 skb_headlen(skb), DMA_TO_DEVICE);
-		if (!skb)
+		dma_unmap_single(port->dev->dev.parent, tx_buf->phys,
+				 tx_buf->size, DMA_TO_DEVICE);
+		if (!tx_buf->skb)
 			continue;
-		dev_kfree_skb_any(skb);
+		dev_kfree_skb_any(tx_buf->skb);
 	}
 }
 
@@ -4651,15 +4659,10 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 	for_each_present_cpu(cpu) {
 		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
 		txq_pcpu->size = txq->size;
-		txq_pcpu->tx_skb = kmalloc(txq_pcpu->size *
-					   sizeof(*txq_pcpu->tx_skb),
-					   GFP_KERNEL);
-		if (!txq_pcpu->tx_skb)
-			goto error;
-
-		txq_pcpu->tx_buffs = kmalloc(txq_pcpu->size *
-					     sizeof(dma_addr_t), GFP_KERNEL);
-		if (!txq_pcpu->tx_buffs)
+		txq_pcpu->buffs = kmalloc(txq_pcpu->size *
+					  sizeof(struct mvpp2_txq_pcpu_buf),
+					  GFP_KERNEL);
+		if (!txq_pcpu->buffs)
 			goto error;
 
 		txq_pcpu->count = 0;
@@ -4673,8 +4676,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 error:
 	for_each_present_cpu(cpu) {
 		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-		kfree(txq_pcpu->tx_skb);
-		kfree(txq_pcpu->tx_buffs);
+		kfree(txq_pcpu->buffs);
 	}
 
 	dma_free_coherent(port->dev->dev.parent,
@@ -4693,8 +4695,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
 
 	for_each_present_cpu(cpu) {
 		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-		kfree(txq_pcpu->tx_skb);
-		kfree(txq_pcpu->tx_buffs);
+		kfree(txq_pcpu->buffs);
 	}
 
 	if (txq->descs)

From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 21 Dec 2016 05:42:43 -0800
Subject: [PATCH 09/30] tcp: add a missing barrier in tcp_tasklet_func()

Madalin reported crashes happening in tcp_tasklet_func() on powerpc64

Before TSQ_QUEUED bit is cleared, we must ensure the changes done
by list_del(&tp->tsq_node); are committed to memory, otherwise
corruption might happen, as an other cpu could catch TSQ_QUEUED
clearance too soon.

We can notice that old kernels were immune to this bug, because
TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk)
section, but they could have missed a kick to write additional bytes,
when NIC interrupts for a given flow are spread to multiple cpus.

Affected TCP flows would need an incoming ACK or RTO timer to add more
packets to the pipe. So overall situation should be better now.

Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Madalin Bucur <madalin.bucur@nxp.com>
Tested-by: Xing Lei <xing.lei@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b45101f3d2bd..31a255b555ad 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned long data)
 		list_del(&tp->tsq_node);
 
 		sk = (struct sock *)tp;
+		smp_mb__before_atomic();
 		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 
 		if (!sk->sk_lock.owned &&

From 551cde192343a10c8fbd57c4cc8f0c4b6cd307e4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 21 Dec 2016 16:03:23 +0000
Subject: [PATCH 10/30] net: fddi: skfp: use %p format specifier for addresses
 rather than %x

Trivial fix: Addresses should be printed using the %p format specifier
rather than using %x.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/fddi/skfp/hwmtm.c | 12 ++++++------
 drivers/net/fddi/skfp/pmf.c   |  2 +-
 drivers/net/fddi/skfp/smt.c   |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/fddi/skfp/hwmtm.c b/drivers/net/fddi/skfp/hwmtm.c
index e26398b5a7dc..d0a68bdd5f63 100644
--- a/drivers/net/fddi/skfp/hwmtm.c
+++ b/drivers/net/fddi/skfp/hwmtm.c
@@ -1483,7 +1483,7 @@ void mac_drv_clear_rx_queue(struct s_smc *smc)
 	r = queue->rx_curr_get ;
 	while (queue->rx_used) {
 		DRV_BUF_FLUSH(r,DDI_DMA_SYNC_FORCPU) ;
-		DB_RX("switch OWN bit of RxD 0x%x ",r,0,5) ;
+		DB_RX("switch OWN bit of RxD 0x%p ",r,0,5) ;
 		r->rxd_rbctrl &= ~cpu_to_le32(BMU_OWN) ;
 		frag_count = 1 ;
 		DRV_BUF_FLUSH(r,DDI_DMA_SYNC_FORDEV) ;
@@ -1645,7 +1645,7 @@ void hwm_tx_frag(struct s_smc *smc, char far *virt, u_long phys, int len,
 	DB_TX("hwm_tx_frag: len = %d, frame_status = %x ",len,frame_status,2) ;
 	if (frame_status & LAN_TX) {
 		/* '*t' is already defined */
-		DB_TX("LAN_TX: TxD = %x, virt = %x ",t,virt,3) ;
+		DB_TX("LAN_TX: TxD = %p, virt = %p ",t,virt,3) ;
 		t->txd_virt = virt ;
 		t->txd_txdscr = cpu_to_le32(smc->os.hwm.tx_descr) ;
 		t->txd_tbadr = cpu_to_le32(phys) ;
@@ -1819,7 +1819,7 @@ void smt_send_mbuf(struct s_smc *smc, SMbuf *mb, int fc)
 	__le32	tbctrl;
 
 	NDD_TRACE("THSB",mb,fc,0) ;
-	DB_TX("smt_send_mbuf: mb = 0x%x, fc = 0x%x",mb,fc,4) ;
+	DB_TX("smt_send_mbuf: mb = 0x%p, fc = 0x%x",mb,fc,4) ;
 
 	mb->sm_off-- ;	/* set to fc */
 	mb->sm_len++ ;	/* + fc */
@@ -1960,7 +1960,7 @@ static void mac_drv_clear_txd(struct s_smc *smc)
 
 			do {
 				DRV_BUF_FLUSH(t1,DDI_DMA_SYNC_FORCPU) ;
-				DB_TX("check OWN/EOF bit of TxD 0x%x",t1,0,5) ;
+				DB_TX("check OWN/EOF bit of TxD 0x%p",t1,0,5) ;
 				tbctrl = le32_to_cpu(CR_READ(t1->txd_tbctrl));
 
 				if (tbctrl & BMU_OWN || !queue->tx_used){
@@ -1988,7 +1988,7 @@ static void mac_drv_clear_txd(struct s_smc *smc)
 			}
 			else {
 #ifndef PASS_1ST_TXD_2_TX_COMP
-				DB_TX("mac_drv_tx_comp for TxD 0x%x",t2,0,4) ;
+				DB_TX("mac_drv_tx_comp for TxD 0x%p",t2,0,4) ;
 				mac_drv_tx_complete(smc,t2) ;
 #else
 				DB_TX("mac_drv_tx_comp for TxD 0x%x",
@@ -2052,7 +2052,7 @@ void mac_drv_clear_tx_queue(struct s_smc *smc)
 		tx_used = queue->tx_used ;
 		while (tx_used) {
 			DRV_BUF_FLUSH(t,DDI_DMA_SYNC_FORCPU) ;
-			DB_TX("switch OWN bit of TxD 0x%x ",t,0,5) ;
+			DB_TX("switch OWN bit of TxD 0x%p ",t,0,5) ;
 			t->txd_tbctrl &= ~cpu_to_le32(BMU_OWN) ;
 			DRV_BUF_FLUSH(t,DDI_DMA_SYNC_FORDEV) ;
 			t = t->txd_next ;
diff --git a/drivers/net/fddi/skfp/pmf.c b/drivers/net/fddi/skfp/pmf.c
index 441b4dc79450..52fa162a31e0 100644
--- a/drivers/net/fddi/skfp/pmf.c
+++ b/drivers/net/fddi/skfp/pmf.c
@@ -284,7 +284,7 @@ void smt_pmf_received_pack(struct s_smc *smc, SMbuf *mb, int local)
 	SMbuf		*reply ;
 
 	sm = smtod(mb,struct smt_header *) ;
-	DB_SMT("SMT: processing PMF frame at %x len %d\n",sm,mb->sm_len) ;
+	DB_SMT("SMT: processing PMF frame at %p len %d\n",sm,mb->sm_len) ;
 #ifdef	DEBUG
 	dump_smt(smc,sm,"PMF Received") ;
 #endif
diff --git a/drivers/net/fddi/skfp/smt.c b/drivers/net/fddi/skfp/smt.c
index cd78b7cacc75..e80a08903fcf 100644
--- a/drivers/net/fddi/skfp/smt.c
+++ b/drivers/net/fddi/skfp/smt.c
@@ -504,7 +504,7 @@ void smt_received_pack(struct s_smc *smc, SMbuf *mb, int fs)
 #endif
 
 	smt_swap_para(sm,(int) mb->sm_len,1) ;
-	DB_SMT("SMT : received packet [%s] at 0x%x\n",
+	DB_SMT("SMT : received packet [%s] at 0x%p\n",
 		smt_type_name[m_fc(mb) & 0xf],sm) ;
 	DB_SMT("SMT : version %d, class %s\n",sm->smt_version,
 		smt_class_name[(sm->smt_class>LAST_CLASS)?0 : sm->smt_class]) ;

From 7d99569460eae28b187d574aec930a4cf8b90441 Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 23 Dec 2016 00:33:57 +0900
Subject: [PATCH 11/30] net: ipv4: Don't crash if passing a null sk to
 ip_do_redirect.

Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP
protocols.") made ip_do_redirect call sock_net(sk) to determine
the network namespace of the passed-in socket. This crashes if sk
is NULL.

Fix this by getting the network namespace from the skb instead.

Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.")
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fa5c037227cb..9eabf490133a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -798,6 +798,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 	struct rtable *rt;
 	struct flowi4 fl4;
 	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	struct net *net = dev_net(skb->dev);
 	int oif = skb->dev->ifindex;
 	u8 tos = RT_TOS(iph->tos);
 	u8 prot = iph->protocol;
@@ -805,7 +806,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 
 	rt = (struct rtable *) dst;
 
-	__build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0);
+	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 	__ip_do_redirect(rt, skb, &fl4, true);
 }
 

From 567be786597ca24b13906a552ad2159316c6fe7d Mon Sep 17 00:00:00 2001
From: jpinto <Joao.Pinto@synopsys.com>
Date: Fri, 23 Dec 2016 10:15:59 +0000
Subject: [PATCH 12/30] stmmac: CSR clock configuration fix

When testing stmmac with my QoS reference design I checked a problem in the
CSR clock configuration that was impossibilitating the phy discovery, since
every read operation returned 0x0000ffff. This patch fixes the issue.

Signed-off-by: Joao Pinto <jpinto@synopsys.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c  | 2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c    | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index b21d03fe4f43..be3c91c7f211 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -539,7 +539,7 @@ struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
 	mac->mii.reg_shift = 6;
 	mac->mii.reg_mask = 0x000007C0;
 	mac->mii.clk_csr_shift = 2;
-	mac->mii.clk_csr_mask = 0xF;
+	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
 	/* Get and dump the chip ID */
 	*synopsys_id = stmmac_get_synopsys_id(hwid);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
index a1d582f47b1a..9dd2987e284d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
@@ -197,7 +197,7 @@ struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id)
 	mac->mii.reg_shift = 6;
 	mac->mii.reg_mask = 0x000007C0;
 	mac->mii.clk_csr_shift = 2;
-	mac->mii.clk_csr_mask = 0xF;
+	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
 	/* Synopsys Id is not available on old chips */
 	*synopsys_id = 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 23322fd9e3ac..fda01f770eff 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -81,8 +81,8 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
-	value |= (priv->clk_csr & priv->hw->mii.clk_csr_mask)
-		<< priv->hw->mii.clk_csr_shift;
+	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+		& priv->hw->mii.clk_csr_mask;
 	if (priv->plat->has_gmac4)
 		value |= MII_GMAC4_READ;
 
@@ -122,8 +122,8 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
 
-	value |= ((priv->clk_csr & priv->hw->mii.clk_csr_mask)
-		<< priv->hw->mii.clk_csr_shift);
+	value |= (priv->clk_csr << priv->hw->mii.clk_csr_shift)
+		& priv->hw->mii.clk_csr_mask;
 	if (priv->plat->has_gmac4)
 		value |= MII_GMAC4_WRITE;
 

From dc594ecd4185831031d3fef2853ee76908428107 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 22 Dec 2016 14:28:14 +0200
Subject: [PATCH 13/30] net/sched: act_tunnel_key: Fix setting UDP dst port in
 metadata under IPv6

The UDP dst port was provided to the helper function which sets the
IPv6 IP tunnel meta-data under a wrong param order, fix that.

Fixes: 75bfbca01e48 ('net/sched: act_tunnel_key: Add UDP dst port option')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_tunnel_key.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 7af712526f01..e3a58e021198 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -134,8 +134,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
 			daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
 
-			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
-						      dst_port, TUNNEL_KEY,
+			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+						      0, TUNNEL_KEY,
 						      key_id, 0);
 		}
 

From d9724772e69cb8076231202292665ca74eec13e1 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 22 Dec 2016 14:28:15 +0200
Subject: [PATCH 14/30] net/sched: cls_flower: Mandate mask when matching on
 flags

When matching on flags, we should require the user to provide the
mask and avoid using an all-ones mask. Not doing so causes matching
on flags provided w.o mask to hit on the value being unset for all
flags, which may not what the user wanted to happen.

Fixes: faa3ffce7829 ('net/sched: cls_flower: Add support for matching on flags')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reported-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 35ac28d0720c..333f8e268431 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -442,32 +442,32 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
 	}
 }
 
-static void fl_set_key_flags(struct nlattr **tb,
-			     u32 *flags_key, u32 *flags_mask)
+static int fl_set_key_flags(struct nlattr **tb,
+			    u32 *flags_key, u32 *flags_mask)
 {
 	u32 key, mask;
 
-	if (!tb[TCA_FLOWER_KEY_FLAGS])
-		return;
+	/* mask is mandatory for flags */
+	if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+		return -EINVAL;
 
 	key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
-
-	if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
-		mask = ~0;
-	else
-		mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+	mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
 
 	*flags_key  = 0;
 	*flags_mask = 0;
 
 	fl_set_key_flag(key, mask, flags_key, flags_mask,
 			TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+	return 0;
 }
 
 static int fl_set_key(struct net *net, struct nlattr **tb,
 		      struct fl_flow_key *key, struct fl_flow_key *mask)
 {
 	__be16 ethertype;
+	int ret = 0;
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FLOWER_INDEV]) {
 		int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -614,9 +614,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		       &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
 		       sizeof(key->enc_tp.dst));
 
-	fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+	if (tb[TCA_FLOWER_KEY_FLAGS])
+		ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
 
-	return 0;
+	return ret;
 }
 
 static bool fl_mask_eq(struct fl_flow_mask *mask1,

From 39b2dd765e0711e1efd1d1df089473a8dd93ad48 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 22 Dec 2016 18:19:16 -0500
Subject: [PATCH 15/30] inet: fix IP(V6)_RECVORIGDSTADDR for udp sockets

Socket cmsg IP(V6)_RECVORIGDSTADDR checks that port range lies within
the packet. For sockets that have transport headers pulled, transport
offset can be negative. Use signed comparison to avoid overflow.

Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing")
Reported-by: Nisar Jagabar <njagabar@cloudmark.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 2 +-
 net/ipv6/datagram.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8b13881ed064..976073417b03 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -148,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	__be16 *ports = (__be16 *)skb_transport_header(skb);
 
-	if (skb_transport_offset(skb) + 4 > skb->len)
+	if (skb_transport_offset(skb) + 4 > (int)skb->len)
 		return;
 
 	/* All current transport protocols have the port numbers in the
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 0489e19258ad..1407426bc862 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -701,7 +701,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 		struct sockaddr_in6 sin6;
 		__be16 *ports = (__be16 *) skb_transport_header(skb);
 
-		if (skb_transport_offset(skb) + 4 <= skb->len) {
+		if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
 			/* All current transport protocols have the port numbers in the
 			 * first four bytes of the transport header and this function is
 			 * written with this assumption in mind.

From a98f91758995cb59611e61318dddd8a6956b52c3 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@codemonkey.org.uk>
Date: Thu, 22 Dec 2016 11:16:22 -0500
Subject: [PATCH 16/30] ipv6: handle -EFAULT from skb_copy_bits

By setting certain socket options on ipv6 raw sockets, we can confuse the
length calculation in rawv6_push_pending_frames triggering a BUG_ON.

RIP: 0010:[<ffffffff817c6390>] [<ffffffff817c6390>] rawv6_sendmsg+0xc30/0xc40
RSP: 0018:ffff881f6c4a7c18  EFLAGS: 00010282
RAX: 00000000fffffff2 RBX: ffff881f6c681680 RCX: 0000000000000002
RDX: ffff881f6c4a7cf8 RSI: 0000000000000030 RDI: ffff881fed0f6a00
RBP: ffff881f6c4a7da8 R08: 0000000000000000 R09: 0000000000000009
R10: ffff881fed0f6a00 R11: 0000000000000009 R12: 0000000000000030
R13: ffff881fed0f6a00 R14: ffff881fee39ba00 R15: ffff881fefa93a80

Call Trace:
 [<ffffffff8118ba23>] ? unmap_page_range+0x693/0x830
 [<ffffffff81772697>] inet_sendmsg+0x67/0xa0
 [<ffffffff816d93f8>] sock_sendmsg+0x38/0x50
 [<ffffffff816d982f>] SYSC_sendto+0xef/0x170
 [<ffffffff816da27e>] SyS_sendto+0xe/0x10
 [<ffffffff81002910>] do_syscall_64+0x50/0xa0
 [<ffffffff817f7cbc>] entry_SYSCALL64_slow_path+0x25/0x25

Handle by jumping to the failure path if skb_copy_bits gets an EFAULT.

Reproducer:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define LEN 504

int main(int argc, char* argv[])
{
	int fd;
	int zero = 0;
	char buf[LEN];

	memset(buf, 0, LEN);

	fd = socket(AF_INET6, SOCK_RAW, 7);

	setsockopt(fd, SOL_IPV6, IPV6_CHECKSUM, &zero, 4);
	setsockopt(fd, SOL_IPV6, IPV6_DSTOPTS, &buf, LEN);

	sendto(fd, buf, 1, 0, (struct sockaddr *) buf, 110);
}

Signed-off-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 291ebc260e70..ea89073c8247 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -591,7 +591,11 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
 	}
 
 	offset += skb_transport_offset(skb);
-	BUG_ON(skb_copy_bits(skb, offset, &csum, 2));
+	err = skb_copy_bits(skb, offset, &csum, 2);
+	if (err < 0) {
+		ip6_flush_pending_frames(sk);
+		goto out;
+	}
 
 	/* in case cksum was not initialized */
 	if (unlikely(csum))

From 53f800e3baf980827c197a9332f63effe80d4809 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 23 Dec 2016 09:32:48 +0100
Subject: [PATCH 17/30] neigh: Send netevent after marking neigh as dead

neigh_cleanup_and_release() is always called after marking a neighbour
as dead, but it only notifies user space and not in-kernel listeners of
the netevent notification chain.

This can cause multiple problems. In my specific use case, it causes the
listener (a switch driver capable of L3 offloads) to believe a neighbour
entry is still valid, and is thus erroneously kept in the device's
table.

Fix that by sending a netevent after marking the neighbour as dead.

Fixes: a6bf9e933daf ("mlxsw: spectrum_router: Offload neighbours based on NUD state change")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 782dd8663665..7bb12e07ffef 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -100,6 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
 		neigh->parms->neigh_cleanup(neigh);
 
 	__neigh_notify(neigh, RTM_DELNEIGH, 0);
+	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
 	neigh_release(neigh);
 }
 

From 93a87e5e794fb71a51f97fbde6c0010680b62d70 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 23 Dec 2016 09:32:49 +0100
Subject: [PATCH 18/30] mlxsw: spectrum_router: Don't reflect dead neighs

When a neighbour is considered to be dead, we should remove it from the
device's table regardless of its NUD state.

Without this patch, after setting a port to be administratively down we
get the following errors when we periodically try to update the kernel
about neighbours activity:

[  461.947268] mlxsw_spectrum 0000:03:00.0 sw1p3: Failed to find
matching neighbour for IP=192.168.100.2

Fixes: a6bf9e933daf ("mlxsw: spectrum_router: Offload neighbours based on NUD state change")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 53126bf68ea9..a0f9742f62df 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -942,7 +942,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 	char rauht_pl[MLXSW_REG_RAUHT_LEN];
 	struct net_device *dev;
 	bool entry_connected;
-	u8 nud_state;
+	u8 nud_state, dead;
 	bool updating;
 	bool removing;
 	bool adding;
@@ -953,10 +953,11 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
 	dip = ntohl(*((__be32 *) n->primary_key));
 	memcpy(neigh_entry->ha, n->ha, sizeof(neigh_entry->ha));
 	nud_state = n->nud_state;
+	dead = n->dead;
 	dev = n->dev;
 	read_unlock_bh(&n->lock);
 
-	entry_connected = nud_state & NUD_VALID;
+	entry_connected = nud_state & NUD_VALID && !dead;
 	adding = (!neigh_entry->offloaded) && entry_connected;
 	updating = neigh_entry->offloaded && entry_connected;
 	removing = neigh_entry->offloaded && !entry_connected;
@@ -1351,7 +1352,7 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 	struct mlxsw_sp_neigh_entry *neigh_entry;
 	struct net_device *dev = fib_nh->nh_dev;
 	struct neighbour *n;
-	u8 nud_state;
+	u8 nud_state, dead;
 
 	/* Take a reference of neigh here ensuring that neigh would
 	 * not be detructed before the nexthop entry is finished.
@@ -1383,8 +1384,9 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
 	list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list);
 	read_lock_bh(&n->lock);
 	nud_state = n->nud_state;
+	dead = n->dead;
 	read_unlock_bh(&n->lock);
-	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID));
+	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID && !dead));
 
 	return 0;
 }

From 58312125da5806308bd69e075fedae30f8cf7794 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 23 Dec 2016 09:32:50 +0100
Subject: [PATCH 19/30] mlxsw: spectrum_router: Correctly remove nexthop groups

At the end of the nexthop initialization process we determine whether
the nexthop should be offloaded or not based on the NUD state of the
neighbour representing it. After all the nexthops were initialized we
refresh the nexthop group and potentially offload it to the device, in
case some of the nexthops were resolved.

Make the destruction of a nexthop group symmetric with its creation by
marking all nexthops as invalid and then refresh the nexthop group to
make sure it was removed from the device's tables.

Fixes: b2157149b0b0 ("mlxsw: spectrum_router: Add the nexthop neigh activity update")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index a0f9742f62df..01d0efa9c5c7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1396,6 +1396,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp,
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
 
+	__mlxsw_sp_nexthop_neigh_update(nh, true);
 	list_del(&nh->neigh_list_node);
 
 	/* If that is the last nexthop connected to that neigh, remove from
@@ -1454,6 +1455,8 @@ mlxsw_sp_nexthop_group_destroy(struct mlxsw_sp *mlxsw_sp,
 		nh = &nh_grp->nexthops[i];
 		mlxsw_sp_nexthop_fini(mlxsw_sp, nh);
 	}
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	WARN_ON_ONCE(nh_grp->adj_index_valid);
 	kfree(nh_grp);
 }
 

From 73b62bd085f4737679ea9afc7867fa5f99ba7d1b Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:24 +0800
Subject: [PATCH 20/30] virtio-net: remove the warning before XDP linearizing

Since we use EWMA to estimate the size of rx buffer. When rx buffer
size is underestimated, it's usual to have a packet with more than one
buffers. Consider this is not a bug, remove the warning and correct
the comment before XDP linearizing.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 08327e005ccc..1067253b98be 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -552,14 +552,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		struct page *xdp_page;
 		u32 act;
 
-		/* No known backend devices should send packets with
-		 * more than a single buffer when XDP conditions are
-		 * met. However it is not strictly illegal so the case
-		 * is handled as an exception and a warning is thrown.
-		 */
+		/* This happens when rx buffer size is underestimated */
 		if (unlikely(num_buf > 1)) {
-			bpf_warn_invalid_xdp_buffer();
-
 			/* linearize data for XDP */
 			xdp_page = xdp_linearize_page(rq, num_buf,
 						      page, offset, &len);

From 275be061b33b945cfb120ee8a570f78c3ccafe56 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:25 +0800
Subject: [PATCH 21/30] virtio-net: correctly xmit linearized page on XDP_TX

After we linearize page, we should xmit this page instead of the page
of first buffer which may lead unexpected result. With this patch, we
can see correct packet during XDP_TX.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1067253b98be..fe4562d395e3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
 			goto err_xdp;
 
-		act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
+		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
 		switch (act) {
 		case XDP_PASS:
 			if (unlikely(xdp_page != page))

From 56a86f84b8332afe8c6fcb4b09d09d9bf094e2db Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:26 +0800
Subject: [PATCH 22/30] virtio-net: fix page miscount during XDP linearizing

We don't put page during linearizing, the would cause leaking when
xmit through XDP_TX or the packet exceeds PAGE_SIZE. Fix them by
put page accordingly. Also decrease the number of buffers during
linearizing to make sure caller can free buffers correctly when packet
exceeds PAGE_SIZE. With this patch, we won't get OOM after linearize
huge number of packets.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fe4562d395e3..58ad40e17a74 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -483,7 +483,7 @@ xdp_xmit:
  * anymore.
  */
 static struct page *xdp_linearize_page(struct receive_queue *rq,
-				       u16 num_buf,
+				       u16 *num_buf,
 				       struct page *p,
 				       int offset,
 				       unsigned int *len)
@@ -497,7 +497,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 	page_off += *len;
 
-	while (--num_buf) {
+	while (--*num_buf) {
 		unsigned int buflen;
 		unsigned long ctx;
 		void *buf;
@@ -507,19 +507,22 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 		if (unlikely(!ctx))
 			goto err_buf;
 
-		/* guard against a misconfigured or uncooperative backend that
-		 * is sending packet larger than the MTU.
-		 */
-		if ((page_off + buflen) > PAGE_SIZE)
-			goto err_buf;
-
 		buf = mergeable_ctx_to_buf_address(ctx);
 		p = virt_to_head_page(buf);
 		off = buf - page_address(p);
 
+		/* guard against a misconfigured or uncooperative backend that
+		 * is sending packet larger than the MTU.
+		 */
+		if ((page_off + buflen) > PAGE_SIZE) {
+			put_page(p);
+			goto err_buf;
+		}
+
 		memcpy(page_address(page) + page_off,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
+		put_page(p);
 	}
 
 	*len = page_off;
@@ -555,7 +558,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		/* This happens when rx buffer size is underestimated */
 		if (unlikely(num_buf > 1)) {
 			/* linearize data for XDP */
-			xdp_page = xdp_linearize_page(rq, num_buf,
+			xdp_page = xdp_linearize_page(rq, &num_buf,
 						      page, offset, &len);
 			if (!xdp_page)
 				goto err_xdp;

From 1830f8935f3b173d229b86e9927b3b6d599aa1f6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:27 +0800
Subject: [PATCH 23/30] virtio-net: correctly handle XDP_PASS for linearized
 packets

When XDP_PASS were determined for linearized packets, we try to get
new buffers in the virtqueue and build skbs from them. This is wrong,
we should create skbs based on existed buffers instead. Fixing them by
creating skb based on xdp_page.

With this patch "ping 192.168.100.4 -s 3900 -M do" works for XDP_PASS.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 58ad40e17a74..470293e2b84d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -578,8 +578,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
 		switch (act) {
 		case XDP_PASS:
-			if (unlikely(xdp_page != page))
-				__free_pages(xdp_page, 0);
+			/* We can only create skb based on xdp_page. */
+			if (unlikely(xdp_page != page)) {
+				rcu_read_unlock();
+				put_page(page);
+				head_skb = page_to_skb(vi, rq, xdp_page,
+						       0, len, PAGE_SIZE);
+				return head_skb;
+			}
 			break;
 		case XDP_TX:
 			if (unlikely(xdp_page != page))

From b00f70b0dacb3d2e009afce860ebc90219072f5c Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:28 +0800
Subject: [PATCH 24/30] virtio-net: unbreak csumed packets for XDP_PASS

We drop csumed packet when do XDP for packets. This breaks
XDP_PASS when GUEST_CSUM is supported. Fix this by allowing csum flag
to be set. With this patch, simple TCP works for XDP_PASS.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 470293e2b84d..0778dc88597e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -440,7 +440,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 		u32 act;
 
-		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
 		switch (act) {
@@ -572,7 +572,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		 * the receive path after XDP is loaded. In practice I
 		 * was not able to create this condition.
 		 */
-		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
 		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);

From 5c33474d41af09f09c98f1df70f267920587bec0 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:29 +0800
Subject: [PATCH 25/30] virtio-net: make rx buf size estimation works for XDP

We don't update ewma rx buf size in the case of XDP. This will lead
underestimation of rx buf size which causes host to produce more than
one buffers. This will greatly increase the possibility of XDP page
linearization.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0778dc88597e..77ae358ec630 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -584,10 +584,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page,
 						       0, len, PAGE_SIZE);
+				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 				return head_skb;
 			}
 			break;
 		case XDP_TX:
+			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			if (unlikely(xdp_page != page))
 				goto err_xdp;
 			rcu_read_unlock();
@@ -596,6 +598,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		default:
 			if (unlikely(xdp_page != page))
 				__free_pages(xdp_page, 0);
+			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			goto err_xdp;
 		}
 	}

From 92502fe86c7c9b3f8543f29641a3c71805e82757 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:30 +0800
Subject: [PATCH 26/30] virtio-net: forbid XDP when VIRTIO_NET_F_GUEST_UFO is
 support

When VIRTIO_NET_F_GUEST_UFO is negotiated, host could still send UFO
packet that exceeds a single page which could not be handled
correctly by XDP. So this patch forbids setting XDP when GUEST_UFO is
supported. While at it, forbid XDP for ECN (which comes only from GRO)
too to prevent user from misconfiguration.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 77ae358ec630..c1f66d8bfb7b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1684,7 +1684,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	int i, err;
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
-	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
 		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
 		return -EOPNOTSUPP;
 	}

From c47a43d3004ad6ff2a94a670cb3274cd6338d41e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:31 +0800
Subject: [PATCH 27/30] virtio-net: remove big packet XDP codes

Now we in fact don't allow XDP for big packets, remove its codes.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 44 +++-------------------------------------
 1 file changed, 3 insertions(+), 41 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c1f66d8bfb7b..e53365a86ca3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -344,11 +344,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		struct page *sent_page = virt_to_head_page(xdp_sent);
-
-		if (vi->mergeable_rx_bufs)
-			put_page(sent_page);
-		else
-			give_pages(rq, sent_page);
+		put_page(sent_page);
 	}
 
 	/* Zero header and leave csum up to XDP layers */
@@ -360,15 +356,8 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
 				   xdp->data, GFP_ATOMIC);
 	if (unlikely(err)) {
-		if (vi->mergeable_rx_bufs)
-			put_page(page);
-		else
-			give_pages(rq, page);
+		put_page(page);
 		return; // On error abort to avoid unnecessary kick
-	} else if (!vi->mergeable_rx_bufs) {
-		/* If not mergeable bufs must be big packets so cleanup pages */
-		give_pages(rq, (struct page *)page->private);
-		page->private = 0;
 	}
 
 	virtqueue_kick(sq->vq);
@@ -430,44 +419,17 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   void *buf,
 				   unsigned int len)
 {
-	struct bpf_prog *xdp_prog;
 	struct page *page = buf;
-	struct sk_buff *skb;
+	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 
-	rcu_read_lock();
-	xdp_prog = rcu_dereference(rq->xdp_prog);
-	if (xdp_prog) {
-		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
-		u32 act;
-
-		if (unlikely(hdr->hdr.gso_type))
-			goto err_xdp;
-		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
-		switch (act) {
-		case XDP_PASS:
-			break;
-		case XDP_TX:
-			rcu_read_unlock();
-			goto xdp_xmit;
-		case XDP_DROP:
-		default:
-			goto err_xdp;
-		}
-	}
-	rcu_read_unlock();
-
-	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
 	return skb;
 
-err_xdp:
-	rcu_read_unlock();
 err:
 	dev->stats.rx_dropped++;
 	give_pages(rq, page);
-xdp_xmit:
 	return NULL;
 }
 

From bb91accf27335c6dc460e202991ca140fa21e1b5 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 23 Dec 2016 22:37:32 +0800
Subject: [PATCH 28/30] virtio-net: XDP support for small buffers

Commit f600b6905015 ("virtio_net: Add XDP support") leaves the case of
small receive buffer untouched. This will confuse the user who want to
set XDP but use small buffers. Other than forbid XDP in small buffer
mode, let's make it work. XDP then can only work at skb->data since
virtio-net create skbs during refill, this is sub optimal which could
be optimized in the future.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 112 ++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53365a86ca3..5deeda61d6d3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -333,9 +333,9 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 static void virtnet_xdp_xmit(struct virtnet_info *vi,
 			     struct receive_queue *rq,
 			     struct send_queue *sq,
-			     struct xdp_buff *xdp)
+			     struct xdp_buff *xdp,
+			     void *data)
 {
-	struct page *page = virt_to_head_page(xdp->data);
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int num_sg, len;
 	void *xdp_sent;
@@ -343,20 +343,45 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
-		put_page(sent_page);
+		if (vi->mergeable_rx_bufs) {
+			struct page *sent_page = virt_to_head_page(xdp_sent);
+
+			put_page(sent_page);
+		} else { /* small buffer */
+			struct sk_buff *skb = xdp_sent;
+
+			kfree_skb(skb);
+		}
 	}
 
-	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
-	memset(hdr, 0, vi->hdr_len);
+	if (vi->mergeable_rx_bufs) {
+		/* Zero header and leave csum up to XDP layers */
+		hdr = xdp->data;
+		memset(hdr, 0, vi->hdr_len);
 
-	num_sg = 1;
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+		num_sg = 1;
+		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	} else { /* small buffer */
+		struct sk_buff *skb = data;
+
+		/* Zero header and leave csum up to XDP layers */
+		hdr = skb_vnet_hdr(skb);
+		memset(hdr, 0, vi->hdr_len);
+
+		num_sg = 2;
+		sg_init_table(sq->sg, 2);
+		sg_set_buf(sq->sg, hdr, vi->hdr_len);
+		skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+	}
 	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
-				   xdp->data, GFP_ATOMIC);
+				   data, GFP_ATOMIC);
 	if (unlikely(err)) {
-		put_page(page);
+		if (vi->mergeable_rx_bufs) {
+			struct page *page = virt_to_head_page(xdp->data);
+
+			put_page(page);
+		} else /* small buffer */
+			kfree_skb(data);
 		return; // On error abort to avoid unnecessary kick
 	}
 
@@ -366,23 +391,26 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
 static u32 do_xdp_prog(struct virtnet_info *vi,
 		       struct receive_queue *rq,
 		       struct bpf_prog *xdp_prog,
-		       struct page *page, int offset, int len)
+		       void *data, int len)
 {
 	int hdr_padded_len;
 	struct xdp_buff xdp;
+	void *buf;
 	unsigned int qp;
 	u32 act;
-	u8 *buf;
 
-	buf = page_address(page) + offset;
-
-	if (vi->mergeable_rx_bufs)
+	if (vi->mergeable_rx_bufs) {
 		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-	else
-		hdr_padded_len = sizeof(struct padded_vnet_hdr);
+		xdp.data = data + hdr_padded_len;
+		xdp.data_end = xdp.data + (len - vi->hdr_len);
+		buf = data;
+	} else { /* small buffers */
+		struct sk_buff *skb = data;
 
-	xdp.data = buf + hdr_padded_len;
-	xdp.data_end = xdp.data + (len - vi->hdr_len);
+		xdp.data = skb->data;
+		xdp.data_end = xdp.data + len;
+		buf = skb->data;
+	}
 
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 	switch (act) {
@@ -392,8 +420,8 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 		qp = vi->curr_queue_pairs -
 			vi->xdp_queue_pairs +
 			smp_processor_id();
-		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
-		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
+		xdp.data = buf;
+		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data);
 		return XDP_TX;
 	default:
 		bpf_warn_invalid_xdp_action(act);
@@ -403,14 +431,47 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
 	}
 }
 
-static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
+static struct sk_buff *receive_small(struct net_device *dev,
+				     struct virtnet_info *vi,
+				     struct receive_queue *rq,
+				     void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
+	struct bpf_prog *xdp_prog;
 
 	len -= vi->hdr_len;
 	skb_trim(skb, len);
 
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+		u32 act;
+
+		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+			goto err_xdp;
+		act = do_xdp_prog(vi, rq, xdp_prog, skb, len);
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_TX:
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
+			goto err_xdp;
+		}
+	}
+	rcu_read_unlock();
+
 	return skb;
+
+err_xdp:
+	rcu_read_unlock();
+	dev->stats.rx_dropped++;
+	kfree_skb(skb);
+xdp_xmit:
+	return NULL;
 }
 
 static struct sk_buff *receive_big(struct net_device *dev,
@@ -537,7 +598,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
-		act = do_xdp_prog(vi, rq, xdp_prog, xdp_page, offset, len);
+		act = do_xdp_prog(vi, rq, xdp_prog,
+				  page_address(xdp_page) + offset, len);
 		switch (act) {
 		case XDP_PASS:
 			/* We can only create skb based on xdp_page. */
@@ -672,7 +734,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	else if (vi->big_packets)
 		skb = receive_big(dev, vi, rq, buf, len);
 	else
-		skb = receive_small(vi, buf, len);
+		skb = receive_small(dev, vi, rq, buf, len);
 
 	if (unlikely(!skb))
 		return;

From 58b94d88deb9ac913740e7778f2005f3ce6d0443 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 23 Dec 2016 14:29:02 -0200
Subject: [PATCH 29/30] sctp: do not loose window information if in rwnd_over

It's possible that we receive a packet that is larger than current
window. If it's the first packet in this way, it will cause it to
increase rwnd_over. Then, if we receive another data chunk (specially as
SCTP allows you to have one data chunk in flight even during 0 window),
rwnd_over will be overwritten instead of added to.

In the long run, this could cause the window to grow bigger than its
initial size, as rwnd_over would be charged only for the last received
data chunk while the code will try open the window for all packets that
were received and had its value in rwnd_over overwritten. This, then,
can lead to the worsening of payload/buffer ratio and cause rwnd_press
to kick in more often.

The fix is to sum it too, same as is done for rwnd_press, so that if we
receive 3 chunks after closing the window, we still have to release that
same amount before re-opening it.

Log snippet from sctp_test exhibiting the issue:
[  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000
rwnd decreased by 1 to (0, 1, 114221)
[  146.209232] sctp: sctp_assoc_rwnd_decrease:
association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
[  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000
rwnd decreased by 1 to (0, 1, 114221)
[  146.209232] sctp: sctp_assoc_rwnd_decrease:
association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
[  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000
rwnd decreased by 1 to (0, 1, 114221)
[  146.209232] sctp: sctp_assoc_rwnd_decrease:
association:ffff88013928e000 has asoc->rwnd:0, asoc->rwnd_over:1!
[  146.209232] sctp: sctp_assoc_rwnd_decrease: asoc:ffff88013928e000
rwnd decreased by 1 to (0, 1, 114221)

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 68428e1f7181..56ddcfaeb4f6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1539,7 +1539,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
 			asoc->rwnd = 0;
 		}
 	} else {
-		asoc->rwnd_over = len - asoc->rwnd;
+		asoc->rwnd_over += len - asoc->rwnd;
 		asoc->rwnd = 0;
 	}
 

From 1636098c46ac52c7fec384fe629144e8e03487b1 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 23 Dec 2016 14:29:37 -0200
Subject: [PATCH 30/30] sctp: fix recovering from 0 win with small data chunks

Currently if SCTP closes the receive window with window pressure, mostly
caused by excessive skb overhead on payload/overheads ratio, SCTP will
close the window abruptly while saving the delta on rwnd_press. It will
start recovering rwnd as the chunks are consumed by the application and
the rwnd_press will be only recovered after rwnd reach the same value as
of rwnd_press, mostly to prevent silly window syndrome.

Thing is, this is very inefficient with small data chunks, as with those
it will never reach back that value, and thus it will never recover from
such pressure. This means that we will not issue window updates when
recovering from 0 window and will rely on a sender retransmit to notice
it.

The fix here is to remove such threshold, as no value is good enough: it
depends on the (avg) chunk sizes being used.

Test with netperf -t SCTP_STREAM -- -m 1, and trigger 0 window by
sending SIGSTOP to netserver, sleep 1.2, and SIGCONT.
Rate limited to 845kbps, for visibility. Capture done at netserver side.

Previously:
01.500751 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632372996] [a_rwnd 99153] [
01.500752 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632372997] [SID: 0] [SS
01.517471 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS
01.517483 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap
01.517485 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373083] [SID: 0] [SS
01.517488 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap
01.534168 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373096] [SID: 0] [SS
01.534180 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap
01.534181 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373169] [SID: 0] [SS
01.534185 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap
02.525978 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS
02.526021 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373009] [a_rwnd 0] [#gap
  (window update missed)
04.573807 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373010] [SID: 0] [SS
04.779370 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373082] [a_rwnd 859] [#g
04.789162 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373083] [SID: 0] [SS
04.789323 IP A.36925 > B.48277: sctp (1) [DATA] (B)(E) [TSN: 632373156] [SID: 0] [SS
04.789372 IP B.48277 > A.36925: sctp (1) [SACK] [cum ack 632373228] [a_rwnd 786] [#g

After:
02.568957 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098728] [a_rwnd 99153]
02.568961 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098729] [SID: 0] [S
02.585631 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S
02.585666 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga
02.585671 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098815] [SID: 0] [S
02.585683 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga
02.602330 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098828] [SID: 0] [S
02.602359 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga
02.602363 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098901] [SID: 0] [S
02.602372 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga
03.600788 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S
03.600830 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 0] [#ga
03.619455 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 13508]
03.619479 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 27017]
03.619497 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 40526]
03.619516 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 54035]
03.619533 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 67544]
03.619552 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 81053]
03.619570 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098741] [a_rwnd 94562]
  (following data transmission triggered by window updates above)
03.633504 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098742] [SID: 0] [S
03.836445 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098814] [a_rwnd 100000]
03.843125 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098815] [SID: 0] [S
03.843285 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098888] [SID: 0] [S
03.843345 IP B.50536 > A.55173: sctp (1) [SACK] [cum ack 2490098960] [a_rwnd 99894]
03.856546 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490098961] [SID: 0] [S
03.866450 IP A.55173 > B.50536: sctp (1) [DATA] (B)(E) [TSN: 2490099011] [SID: 0] [S

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 56ddcfaeb4f6..d3cc30c25c41 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1471,7 +1471,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
 	 * threshold.  The idea is to recover slowly, but up
 	 * to the initial advertised window.
 	 */
-	if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+	if (asoc->rwnd_press) {
 		int change = min(asoc->pathmtu, asoc->rwnd_press);
 		asoc->rwnd += change;
 		asoc->rwnd_press -= change;