From c20c6704bf2dafaba0d90c8310ef9e919fe4d2e2 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Thu, 16 Nov 2017 04:36:51 +0000
Subject: [PATCH 001/808] ASoC: rcar: revert IOMMU support so far

commit 4821d914fe74 ("ASoC: rsnd: use dma_sync_single_for_xxx() for
IOMMU") had supported IOMMU, but it breaks normal sound "recorde"
and both PulseAudio's "playback/recorde". The sound will be noisy.

That commit was using dma_sync_single_for_xxx(), and driver should
make sure memory is protected during CPU or Device are using it.
But if driver returns current "residue" data size correctly on pointer
function, player/recorder will access to protected memory.

IOMMU feature should be supported, but I don't know how to handle it
without memory cache problem at this point.
Thus, this patch simply revert it to avoid current noisy sound.

Tested-by: Hiroyuki Yokoyama <hiroyuki.yokoyama.vx@renesas.com>
Tested-by: Ryo Kodama <ryo.kodama.vz@renesas.com>
Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sh/rcar/core.c |  4 +-
 sound/soc/sh/rcar/dma.c  | 86 +++-------------------------------------
 2 files changed, 8 insertions(+), 82 deletions(-)

diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c
index c70eb2097816..f12a88a21dfa 100644
--- a/sound/soc/sh/rcar/core.c
+++ b/sound/soc/sh/rcar/core.c
@@ -1332,8 +1332,8 @@ static int rsnd_pcm_new(struct snd_soc_pcm_runtime *rtd)
 
 	return snd_pcm_lib_preallocate_pages_for_all(
 		rtd->pcm,
-		SNDRV_DMA_TYPE_CONTINUOUS,
-		snd_dma_continuous_data(GFP_KERNEL),
+		SNDRV_DMA_TYPE_DEV,
+		rtd->card->snd_card->dev,
 		PREALLOC_BUFFER, PREALLOC_BUFFER_MAX);
 }
 
diff --git a/sound/soc/sh/rcar/dma.c b/sound/soc/sh/rcar/dma.c
index fd557abfe390..4d750bdf8e24 100644
--- a/sound/soc/sh/rcar/dma.c
+++ b/sound/soc/sh/rcar/dma.c
@@ -26,10 +26,7 @@
 struct rsnd_dmaen {
 	struct dma_chan		*chan;
 	dma_cookie_t		cookie;
-	dma_addr_t		dma_buf;
 	unsigned int		dma_len;
-	unsigned int		dma_period;
-	unsigned int		dma_cnt;
 };
 
 struct rsnd_dmapp {
@@ -71,38 +68,10 @@ static struct rsnd_mod mem = {
 /*
  *		Audio DMAC
  */
-#define rsnd_dmaen_sync(dmaen, io, i)	__rsnd_dmaen_sync(dmaen, io, i, 1)
-#define rsnd_dmaen_unsync(dmaen, io, i)	__rsnd_dmaen_sync(dmaen, io, i, 0)
-static void __rsnd_dmaen_sync(struct rsnd_dmaen *dmaen, struct rsnd_dai_stream *io,
-			      int i, int sync)
-{
-	struct device *dev = dmaen->chan->device->dev;
-	enum dma_data_direction dir;
-	int is_play = rsnd_io_is_play(io);
-	dma_addr_t buf;
-	int len, max;
-	size_t period;
-
-	len	= dmaen->dma_len;
-	period	= dmaen->dma_period;
-	max	= len / period;
-	i	= i % max;
-	buf	= dmaen->dma_buf + (period * i);
-
-	dir = is_play ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
-
-	if (sync)
-		dma_sync_single_for_device(dev, buf, period, dir);
-	else
-		dma_sync_single_for_cpu(dev, buf, period, dir);
-}
-
 static void __rsnd_dmaen_complete(struct rsnd_mod *mod,
 				  struct rsnd_dai_stream *io)
 {
 	struct rsnd_priv *priv = rsnd_mod_to_priv(mod);
-	struct rsnd_dma *dma = rsnd_mod_to_dma(mod);
-	struct rsnd_dmaen *dmaen = rsnd_dma_to_dmaen(dma);
 	bool elapsed = false;
 	unsigned long flags;
 
@@ -115,22 +84,9 @@ static void __rsnd_dmaen_complete(struct rsnd_mod *mod,
 	 */
 	spin_lock_irqsave(&priv->lock, flags);
 
-	if (rsnd_io_is_working(io)) {
-		rsnd_dmaen_unsync(dmaen, io, dmaen->dma_cnt);
-
-		/*
-		 * Next period is already started.
-		 * Let's sync Next Next period
-		 * see
-		 *	rsnd_dmaen_start()
-		 */
-		rsnd_dmaen_sync(dmaen, io, dmaen->dma_cnt + 2);
-
+	if (rsnd_io_is_working(io))
 		elapsed = true;
 
-		dmaen->dma_cnt++;
-	}
-
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	if (elapsed)
@@ -165,14 +121,8 @@ static int rsnd_dmaen_stop(struct rsnd_mod *mod,
 	struct rsnd_dma *dma = rsnd_mod_to_dma(mod);
 	struct rsnd_dmaen *dmaen = rsnd_dma_to_dmaen(dma);
 
-	if (dmaen->chan) {
-		int is_play = rsnd_io_is_play(io);
-
+	if (dmaen->chan)
 		dmaengine_terminate_all(dmaen->chan);
-		dma_unmap_single(dmaen->chan->device->dev,
-				 dmaen->dma_buf, dmaen->dma_len,
-				 is_play ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	}
 
 	return 0;
 }
@@ -237,11 +187,7 @@ static int rsnd_dmaen_start(struct rsnd_mod *mod,
 	struct device *dev = rsnd_priv_to_dev(priv);
 	struct dma_async_tx_descriptor *desc;
 	struct dma_slave_config cfg = {};
-	dma_addr_t buf;
-	size_t len;
-	size_t period;
 	int is_play = rsnd_io_is_play(io);
-	int i;
 	int ret;
 
 	cfg.direction	= is_play ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
@@ -258,19 +204,10 @@ static int rsnd_dmaen_start(struct rsnd_mod *mod,
 	if (ret < 0)
 		return ret;
 
-	len	= snd_pcm_lib_buffer_bytes(substream);
-	period	= snd_pcm_lib_period_bytes(substream);
-	buf	= dma_map_single(dmaen->chan->device->dev,
-				 substream->runtime->dma_area,
-				 len,
-				 is_play ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	if (dma_mapping_error(dmaen->chan->device->dev, buf)) {
-		dev_err(dev, "dma map failed\n");
-		return -EIO;
-	}
-
 	desc = dmaengine_prep_dma_cyclic(dmaen->chan,
-					 buf, len, period,
+					 substream->runtime->dma_addr,
+					 snd_pcm_lib_buffer_bytes(substream),
+					 snd_pcm_lib_period_bytes(substream),
 					 is_play ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM,
 					 DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
@@ -282,18 +219,7 @@ static int rsnd_dmaen_start(struct rsnd_mod *mod,
 	desc->callback		= rsnd_dmaen_complete;
 	desc->callback_param	= rsnd_mod_get(dma);
 
-	dmaen->dma_buf		= buf;
-	dmaen->dma_len		= len;
-	dmaen->dma_period	= period;
-	dmaen->dma_cnt		= 0;
-
-	/*
-	 * synchronize this and next period
-	 * see
-	 *	__rsnd_dmaen_complete()
-	 */
-	for (i = 0; i < 2; i++)
-		rsnd_dmaen_sync(dmaen, io, i);
+	dmaen->dma_len		= snd_pcm_lib_buffer_bytes(substream);
 
 	dmaen->cookie = dmaengine_submit(desc);
 	if (dmaen->cookie < 0) {

From 8c059a4676038967dd6efe614538c329b61e68a1 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@impinj.com>
Date: Wed, 15 Nov 2017 11:52:32 -0800
Subject: [PATCH 002/808] spi: imx: Update device tree binding documentation

Update documentation for gpio-cs and num-cs to reflect the standard SPI
bindings.

The dma properties are optional.

Include a warning that native CS do not work in a commonly useful manner
with this hardware/driver, and therefor most users probably should use GPIO
based CS lines rather than native.

CC: Mark Brown <broonie@kernel.org>
CC: Shawn Guo <shawnguo@kernel.org>
CC: Sascha Hauer <kernel@pengutronix.de>
CC: Fabio Estevam <fabio.estevam@nxp.com>
CC: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Trent Piepho <tpiepho@impinj.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../devicetree/bindings/spi/fsl-imx-cspi.txt   | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
index 5bf13960f7f4..e3c48b20b1a6 100644
--- a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
+++ b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
@@ -12,24 +12,30 @@ Required properties:
   - "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
 - reg : Offset and length of the register set for the device
 - interrupts : Should contain CSPI/eCSPI interrupt
-- cs-gpios : Specifies the gpio pins to be used for chipselects.
 - clocks : Clock specifiers for both ipg and per clocks.
 - clock-names : Clock names should include both "ipg" and "per"
 See the clock consumer binding,
 	Documentation/devicetree/bindings/clock/clock-bindings.txt
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
-		Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "tx" and "rx" if present.
 
-Obsolete properties:
-- fsl,spi-num-chipselects : Contains the number of the chipselect
+Recommended properties:
+- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt.  While the native chip
+select lines can be used, they appear to always generate a pulse between each
+word of a transfer.  Most use cases will require GPIO based chip selects to
+generate a valid transaction.
 
 Optional properties:
+- num-cs :  Number of total chip selects, see spi-bus.txt.
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names, if present, should include "tx" and "rx".
 - fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
 controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
 the SPI_READY mode-flag needs to be set too.
 Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).
 
+Obsolete properties:
+- fsl,spi-num-chipselects : Contains the number of the chipselect
+
 Example:
 
 ecspi@70010000 {

From 4c761ebfcb2d04ee36783c4c8c45ae00caf59d36 Mon Sep 17 00:00:00 2001
From: Naveen Manohar <naveen.m@intel.com>
Date: Fri, 3 Nov 2017 19:15:02 +0530
Subject: [PATCH 003/808] ASoC: Intel: kbl: Modify map for Headset Playback to
 fix pop-noise

Patch fixes wrong path in commit 0b06122fc8d0 ("ASoC: Intel: kbl: Add
map for new DAIs for Multi-Playback & Echo Ref") which resulted in pop
noise.
Current topology for Headset results in unwanted pop noise, while
switching from spk->hs at the start of Headset Playback.
Hence re-introduced mixin-mixout dsp module in topology for headset
playback pipe to fix the regression.
And the corresponding modification for headset route is updated here.

Fixes: 0b06122fc8d0 ("ASoC: Intel: kbl: Add map for new DAIs for
Multi-Playback & Echo Ref")
Signed-off-by: Naveen Manohar <naveen.m@intel.com>
Signed-off-by: Sathya Prakash M R <sathya.prakash.m.r@intel.com>
Acked-By: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/intel/boards/kbl_rt5663_max98927.c        | 2 +-
 sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/soc/intel/boards/kbl_rt5663_max98927.c b/sound/soc/intel/boards/kbl_rt5663_max98927.c
index 6f9a8bcf20f3..6dcad0a8a0d0 100644
--- a/sound/soc/intel/boards/kbl_rt5663_max98927.c
+++ b/sound/soc/intel/boards/kbl_rt5663_max98927.c
@@ -101,7 +101,7 @@ static const struct snd_soc_dapm_route kabylake_map[] = {
 	{ "ssp0 Tx", NULL, "spk_out" },
 
 	{ "AIF Playback", NULL, "ssp1 Tx" },
-	{ "ssp1 Tx", NULL, "hs_out" },
+	{ "ssp1 Tx", NULL, "codec1_out" },
 
 	{ "hs_in", NULL, "ssp1 Rx" },
 	{ "ssp1 Rx", NULL, "AIF Capture" },
diff --git a/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c b/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c
index 6072164f2d43..271ae3c2c535 100644
--- a/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c
+++ b/sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c
@@ -109,7 +109,7 @@ static const struct snd_soc_dapm_route kabylake_map[] = {
 	{ "ssp0 Tx", NULL, "spk_out" },
 
 	{ "AIF Playback", NULL, "ssp1 Tx" },
-	{ "ssp1 Tx", NULL, "hs_out" },
+	{ "ssp1 Tx", NULL, "codec1_out" },
 
 	{ "hs_in", NULL, "ssp1 Rx" },
 	{ "ssp1 Rx", NULL, "AIF Capture" },

From bc6476d6c1edcb9b97621b5131bd169aa81f27db Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 13 Nov 2017 12:12:55 +0100
Subject: [PATCH 004/808] ASoC: da7218: fix fix child-node lookup

Fix child-node lookup during probe, which ended up searching the whole
device tree depth-first starting at the parent rather than just matching
on its children.

To make things worse, the parent codec node was also prematurely freed.

Fixes: 4d50934abd22 ("ASoC: da7218: Add da7218 codec driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Acked-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable <stable@vger.kernel.org>
---
 sound/soc/codecs/da7218.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/da7218.c b/sound/soc/codecs/da7218.c
index b2d42ec1dcd9..56564ce90cb6 100644
--- a/sound/soc/codecs/da7218.c
+++ b/sound/soc/codecs/da7218.c
@@ -2520,7 +2520,7 @@ static struct da7218_pdata *da7218_of_to_pdata(struct snd_soc_codec *codec)
 	}
 
 	if (da7218->dev_id == DA7218_DEV_ID) {
-		hpldet_np = of_find_node_by_name(np, "da7218_hpldet");
+		hpldet_np = of_get_child_by_name(np, "da7218_hpldet");
 		if (!hpldet_np)
 			return pdata;
 

From 15f8c5f2415bfac73f33a14bcd83422bcbfb5298 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 13 Nov 2017 12:12:56 +0100
Subject: [PATCH 005/808] ASoC: twl4030: fix child-node lookup

Fix child-node lookup during probe, which ended up searching the whole
device tree depth-first starting at the parent rather than just matching
on its children.

To make things worse, the parent codec node was also prematurely freed,
while the child node was leaked.

Fixes: 2d6d649a2e0f ("ASoC: twl4030: Support for DT booted kernel")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable <stable@vger.kernel.org>
---
 sound/soc/codecs/twl4030.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c
index c482b2e7a7d2..cfe72b9d4356 100644
--- a/sound/soc/codecs/twl4030.c
+++ b/sound/soc/codecs/twl4030.c
@@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
 	struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev);
 	struct device_node *twl4030_codec_node = NULL;
 
-	twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node,
+	twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node,
 						  "codec");
 
 	if (!pdata && twl4030_codec_node) {
@@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
 				     GFP_KERNEL);
 		if (!pdata) {
 			dev_err(codec->dev, "Can not allocate memory\n");
+			of_node_put(twl4030_codec_node);
 			return NULL;
 		}
 		twl4030_setup_pdata_of(pdata, twl4030_codec_node);
+		of_node_put(twl4030_codec_node);
 	}
 
 	return pdata;

From 542134c0375b5ca2b1d18490c02b8a20bfdd8d74 Mon Sep 17 00:00:00 2001
From: Eudean Sun <eudean@arista.com>
Date: Tue, 21 Nov 2017 10:43:24 -0800
Subject: [PATCH 006/808] HID: cp2112: Fix I2C_BLOCK_DATA transactions

The existing driver erroneously treats I2C_BLOCK_DATA and BLOCK_DATA
commands the same.

For I2C_BLOCK_DATA reads, the length of the read is provided in
data->block[0], but the length itself should not be sent to the slave. In
contrast, for BLOCK_DATA reads no length is specified since the length
will be the first byte returned from the slave. When copying data back
to the data buffer, for an I2C_BLOCK_DATA read we have to take care not to
overwrite data->block[0] to avoid overwriting the length. A BLOCK_DATA
read doesn't have this concern since the first byte returned by the device
is the length and belongs in data->block[0].

For I2C_BLOCK_DATA writes, the length is also provided in data->block[0],
but the length itself is not sent to the slave (in contrast to BLOCK_DATA
writes where the length prefixes the data sent to the slave).

This was tested on physical hardware using i2cdump with the i and s flags
to test the behavior of I2C_BLOCK_DATA reads and BLOCK_DATA reads,
respectively. Writes were not tested but the I2C_BLOCK_DATA write change
is pretty simple to verify by inspection.

Signed-off-by: Eudean Sun <eudean@arista.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-cp2112.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 68cdc962265b..271f31461da4 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -696,8 +696,16 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
 					      (u8 *)&word, 2);
 		break;
 	case I2C_SMBUS_I2C_BLOCK_DATA:
-		size = I2C_SMBUS_BLOCK_DATA;
-		/* fallthrough */
+		if (read_write == I2C_SMBUS_READ) {
+			read_length = data->block[0];
+			count = cp2112_write_read_req(buf, addr, read_length,
+						      command, NULL, 0);
+		} else {
+			count = cp2112_write_req(buf, addr, command,
+						 data->block + 1,
+						 data->block[0]);
+		}
+		break;
 	case I2C_SMBUS_BLOCK_DATA:
 		if (I2C_SMBUS_READ == read_write) {
 			count = cp2112_write_read_req(buf, addr,
@@ -785,6 +793,9 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
 	case I2C_SMBUS_WORD_DATA:
 		data->word = le16_to_cpup((__le16 *)buf);
 		break;
+	case I2C_SMBUS_I2C_BLOCK_DATA:
+		memcpy(data->block + 1, buf, read_length);
+		break;
 	case I2C_SMBUS_BLOCK_DATA:
 		if (read_length > I2C_SMBUS_BLOCK_MAX) {
 			ret = -EPROTO;

From 56986b07d17b4a19416e248aaca9367c241a824b Mon Sep 17 00:00:00 2001
From: Bard Liao <bardliao@realtek.com>
Date: Wed, 22 Nov 2017 13:59:19 +0800
Subject: [PATCH 007/808] ASoC: rt5645: reset RT5645_AD_DA_MIXER at probe

RT5645_AD_DA_MIXER (0x29) register will not be reset to default after
SW reset. So we have to write it to its default value in i2c_probe.

Signed-off-by: Bard Liao <bardliao@realtek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/rt5645.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c
index 5f24df4fae8e..fcd02c2c76f1 100644
--- a/sound/soc/codecs/rt5645.c
+++ b/sound/soc/codecs/rt5645.c
@@ -3823,6 +3823,8 @@ static int rt5645_i2c_probe(struct i2c_client *i2c,
 	regmap_read(regmap, RT5645_VENDOR_ID, &val);
 	rt5645->v_id = val & 0xff;
 
+	regmap_write(rt5645->regmap, RT5645_AD_DA_MIXER, 0x8080);
+
 	ret = regmap_register_patch(rt5645->regmap, init_list,
 				    ARRAY_SIZE(init_list));
 	if (ret != 0)

From 254beff97b4714bac4ec8add5a6888c1adc1ad8f Mon Sep 17 00:00:00 2001
From: "oder_chiou@realtek.com" <oder_chiou@realtek.com>
Date: Fri, 24 Nov 2017 16:11:22 +0800
Subject: [PATCH 008/808] ASoC: rt5514: Make sure the DMIC delay will be
 happened after normal SUPPLY widgets power on

The patch makes sure the DMIC delay will be happened after normal SUPPLY
widgets power on. If there are some platforms that provide the MCLK using
the SUPPLY widget, it will make sure the delay time is helpful.

Signed-off-by: Oder Chiou <oder_chiou@realtek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/rt5514.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/rt5514.c b/sound/soc/codecs/rt5514.c
index 2a5b5d74e697..2dd6e9f990a4 100644
--- a/sound/soc/codecs/rt5514.c
+++ b/sound/soc/codecs/rt5514.c
@@ -496,7 +496,7 @@ static const struct snd_soc_dapm_widget rt5514_dapm_widgets[] = {
 	SND_SOC_DAPM_PGA("DMIC1", SND_SOC_NOPM, 0, 0, NULL, 0),
 	SND_SOC_DAPM_PGA("DMIC2", SND_SOC_NOPM, 0, 0, NULL, 0),
 
-	SND_SOC_DAPM_SUPPLY("DMIC CLK", SND_SOC_NOPM, 0, 0,
+	SND_SOC_DAPM_SUPPLY_S("DMIC CLK", 1, SND_SOC_NOPM, 0, 0,
 		rt5514_set_dmic_clk, SND_SOC_DAPM_PRE_PMU),
 
 	SND_SOC_DAPM_SUPPLY("ADC CLK", RT5514_CLK_CTRL1,

From 5a1314fa697fc65cefaba64cd4699bfc3e6882a6 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ricardo.ribalda@gmail.com>
Date: Tue, 21 Nov 2017 10:09:02 +0100
Subject: [PATCH 009/808] spi: xilinx: Detect stall with Unknown commands

When the core is configured in C_SPI_MODE > 0, it integrates a
lookup table that automatically configures the core in dual or quad mode
based on the command (first byte on the tx fifo).

Unfortunately, that list mode_?_memoy_*.mif does not contain all the
supported commands by the flash.

Since 4.14 spi-nor automatically tries to probe the flash using SFDP
(command 0x5a), and that command is not part of the list_mode table.

Whit the right combination of C_SPI_MODE and C_SPI_MEMORY this leads
into a stall that can only be recovered with a soft rest.

This patch detects this kind of stall and returns -EIO to the caller on
those commands. spi-nor can handle this error properly:

m25p80 spi0.0: Detected stall. Check C_SPI_MODE and C_SPI_MEMORY. 0x21 0x2404
m25p80 spi0.0: SPI transfer failed: -5
spi_master spi0: failed to transfer one message from queue
m25p80 spi0.0: s25sl064p (8192 Kbytes)

Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/spi/spi-xilinx.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index bc7100b93dfc..e0b9fe1d0e37 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -271,6 +271,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
 	while (remaining_words) {
 		int n_words, tx_words, rx_words;
 		u32 sr;
+		int stalled;
 
 		n_words = min(remaining_words, xspi->buffer_size);
 
@@ -299,7 +300,17 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
 
 		/* Read out all the data from the Rx FIFO */
 		rx_words = n_words;
+		stalled = 10;
 		while (rx_words) {
+			if (rx_words == n_words && !(stalled--) &&
+			    !(sr & XSPI_SR_TX_EMPTY_MASK) &&
+			    (sr & XSPI_SR_RX_EMPTY_MASK)) {
+				dev_err(&spi->dev,
+					"Detected stall. Check C_SPI_MODE and C_SPI_MEMORY\n");
+				xspi_init_hw(xspi);
+				return -EIO;
+			}
+
 			if ((sr & XSPI_SR_TX_EMPTY_MASK) && (rx_words > 1)) {
 				xilinx_spi_rx(xspi);
 				rx_words--;

From 5ddc3c656bfb5c90d0196ff72b908d0343fef85e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 25 Nov 2017 15:48:32 -0800
Subject: [PATCH 010/808] Input: ims-pcu - fix typo in the error message

1. change "to" to "too".
2. move ")" to the front of "\n", which discovered by Joe Perches.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Joe Perches <joe@perches.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/misc/ims-pcu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c
index ae473123583b..3d51175c4d72 100644
--- a/drivers/input/misc/ims-pcu.c
+++ b/drivers/input/misc/ims-pcu.c
@@ -1651,7 +1651,7 @@ ims_pcu_get_cdc_union_desc(struct usb_interface *intf)
 				return union_desc;
 
 			dev_err(&intf->dev,
-				"Union descriptor to short (%d vs %zd\n)",
+				"Union descriptor too short (%d vs %zd)\n",
 				union_desc->bLength, sizeof(*union_desc));
 			return NULL;
 		}

From 10d900303f1c3a821eb0bef4e7b7ece16768fba4 Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Sat, 25 Nov 2017 16:48:41 -0800
Subject: [PATCH 011/808] Input: elantech - add new icbody type 15

The touchpad of Lenovo Thinkpad L480 reports it's version as 15.

Cc: stable@vger.kernel.org
Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/mouse/elantech.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index b84cd978fce2..a4aaa748e987 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -1613,7 +1613,7 @@ static int elantech_set_properties(struct elantech_data *etd)
 		case 5:
 			etd->hw_version = 3;
 			break;
-		case 6 ... 14:
+		case 6 ... 15:
 			etd->hw_version = 4;
 			break;
 		default:

From bdfe4cebea11476d278b1b98dd0f7cdac8269d62 Mon Sep 17 00:00:00 2001
From: Icenowy Zheng <icenowy@aosc.io>
Date: Fri, 10 Nov 2017 17:26:54 +0800
Subject: [PATCH 012/808] arm64: allwinner: a64: add Ethernet PHY regulator for
 several boards

On several A64 boards the Ethernet PHY is powered by the DC1SW regulator
on the AXP803 PMIC.

Add phy-handle property to these boards' emac node.

Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
Acked-by: Corentin LABBE <clabbe.montjoie@gmail.com>
Tested-by: Corentin LABBE <clabbe.montjoie@gmail.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts     | 1 +
 arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts           | 1 +
 arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
index 45bdbfb96126..4a8d3f83a36e 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
@@ -75,6 +75,7 @@
 	pinctrl-0 = <&rgmii_pins>;
 	phy-mode = "rgmii";
 	phy-handle = <&ext_rgmii_phy>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 };
 
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
index 806442d3e846..604cdaedac38 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
@@ -77,6 +77,7 @@
 	pinctrl-0 = <&rmii_pins>;
 	phy-mode = "rmii";
 	phy-handle = <&ext_rmii_phy1>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 
 };
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index 0eb2acedf8c3..a053a6ac5267 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -82,6 +82,7 @@
 	pinctrl-0 = <&rgmii_pins>;
 	phy-mode = "rgmii";
 	phy-handle = <&ext_rgmii_phy>;
+	phy-supply = <&reg_dc1sw>;
 	status = "okay";
 };
 

From 251c201bf4f8b5bf4f1ccb4f8920eed2e1f57580 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@smile.fr>
Date: Mon, 27 Nov 2017 15:16:32 +0100
Subject: [PATCH 013/808] spi: a3700: Fix clk prescaling for coefficient over
 15

The Armada 3700 SPI controller has 2 ranges of prescaler coefficients.
One ranging from 0 to 15 by steps of 1, and one ranging from 0 to 30 by
steps of 2.

This commit fixes the prescaler coefficients that are over 15 so that it
uses the correct range of values. The prescaling coefficient is rounded
to the upper value if it is odd.

This was tested on Espressobin with spidev and a locigal analyser.

Signed-off-by: Maxime Chevallier <maxime.chevallier@smile.fr>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/spi/spi-armada-3700.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c
index 77fe55ce790c..d65345312527 100644
--- a/drivers/spi/spi-armada-3700.c
+++ b/drivers/spi/spi-armada-3700.c
@@ -79,6 +79,7 @@
 #define A3700_SPI_BYTE_LEN		BIT(5)
 #define A3700_SPI_CLK_PRESCALE		BIT(0)
 #define A3700_SPI_CLK_PRESCALE_MASK	(0x1f)
+#define A3700_SPI_CLK_EVEN_OFFS		(0x10)
 
 #define A3700_SPI_WFIFO_THRS_BIT	28
 #define A3700_SPI_RFIFO_THRS_BIT	24
@@ -220,6 +221,13 @@ static void a3700_spi_clock_set(struct a3700_spi *a3700_spi,
 
 	prescale = DIV_ROUND_UP(clk_get_rate(a3700_spi->clk), speed_hz);
 
+	/* For prescaler values over 15, we can only set it by steps of 2.
+	 * Starting from A3700_SPI_CLK_EVEN_OFFS, we set values from 0 up to
+	 * 30. We only use this range from 16 to 30.
+	 */
+	if (prescale > 15)
+		prescale = A3700_SPI_CLK_EVEN_OFFS + DIV_ROUND_UP(prescale, 2);
+
 	val = spireg_read(a3700_spi, A3700_SPI_IF_CFG_REG);
 	val = val & ~A3700_SPI_CLK_PRESCALE_MASK;
 

From fdaa451107ce543d345a339b4d5e20e8e4bac396 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Mon, 20 Nov 2017 20:27:56 -0800
Subject: [PATCH 014/808] ASoC: amd: Add error checking to probe function

The acp_audio_dma does not perform sufficient error checking in its probe
function. This can result in crashes if a critical error path is
encountered.

Fixes: 7c31335a03b6a ("ASoC: AMD: add AMD ASoC ACP 2.x DMA driver")
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Dominik Behr <dbehr@chromium.org>
Cc: Daniel Kurtz <djkurtz@chromium.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/amd/acp-pcm-dma.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c
index 9f521a55d610..b5e41df6bb3a 100644
--- a/sound/soc/amd/acp-pcm-dma.c
+++ b/sound/soc/amd/acp-pcm-dma.c
@@ -1051,6 +1051,11 @@ static int acp_audio_probe(struct platform_device *pdev)
 	struct resource *res;
 	const u32 *pdata = pdev->dev.platform_data;
 
+	if (!pdata) {
+		dev_err(&pdev->dev, "Missing platform data\n");
+		return -ENODEV;
+	}
+
 	audio_drv_data = devm_kzalloc(&pdev->dev, sizeof(struct audio_drv_data),
 					GFP_KERNEL);
 	if (audio_drv_data == NULL)
@@ -1058,6 +1063,8 @@ static int acp_audio_probe(struct platform_device *pdev)
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	audio_drv_data->acp_mmio = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(audio_drv_data->acp_mmio))
+		return PTR_ERR(audio_drv_data->acp_mmio);
 
 	/* The following members gets populated in device 'open'
 	 * function. Till then interrupts are disabled in 'acp_init'

From 695b78b548d8a26288f041e907ff17758df9e1d5 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <mail@maciej.szmigiero.name>
Date: Mon, 20 Nov 2017 23:14:55 +0100
Subject: [PATCH 015/808] ASoC: fsl_ssi: AC'97 ops need regmap, clock and
 cleaning up on failure

AC'97 ops (register read / write) need SSI regmap and clock, so they have
to be set after them.

We also need to set these ops back to NULL if we fail the probe.

Signed-off-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 sound/soc/fsl/fsl_ssi.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
index f2f51e06e22c..c3a83ed0297e 100644
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -1458,12 +1458,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
 				sizeof(fsl_ssi_ac97_dai));
 
 		fsl_ac97_data = ssi_private;
-
-		ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
-		if (ret) {
-			dev_err(&pdev->dev, "could not set AC'97 ops\n");
-			return ret;
-		}
 	} else {
 		/* Initialize this copy of the CPU DAI driver structure */
 		memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template,
@@ -1574,6 +1568,14 @@ static int fsl_ssi_probe(struct platform_device *pdev)
 			return ret;
 	}
 
+	if (fsl_ssi_is_ac97(ssi_private)) {
+		ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
+		if (ret) {
+			dev_err(&pdev->dev, "could not set AC'97 ops\n");
+			goto error_ac97_ops;
+		}
+	}
+
 	ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component,
 					      &ssi_private->cpu_dai_drv, 1);
 	if (ret) {
@@ -1657,6 +1659,10 @@ error_sound_card:
 	fsl_ssi_debugfs_remove(&ssi_private->dbg_stats);
 
 error_asoc_register:
+	if (fsl_ssi_is_ac97(ssi_private))
+		snd_soc_set_ac97_ops(NULL);
+
+error_ac97_ops:
 	if (ssi_private->soc->imx)
 		fsl_ssi_imx_clean(pdev, ssi_private);
 

From b880b8056b31288323745a13930bc45cf4c86e9d Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <mail@maciej.szmigiero.name>
Date: Mon, 20 Nov 2017 23:16:07 +0100
Subject: [PATCH 016/808] ASoC: fsl_ssi: serialize AC'97 register access
 operations

AC'97 register access operations (both read and write) on SSI use a one,
shared set of SSI registers for AC'97 register address and data.
This means that only one such access is possible at a time and so all these
operations need to be serialized.

Since an AC'97 register access operation in this driver takes 100us+ let's
use a mutex for this.

Use this opportunity to also change a default value returned from AC'97
register read function from -1 to 0, since that's what AC'97 specs require
to be returned when unknown / undefined registers are read.

Signed-off-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/fsl/fsl_ssi.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
index c3a83ed0297e..424bafaf51ef 100644
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -38,6 +38,7 @@
 #include <linux/ctype.h>
 #include <linux/device.h>
 #include <linux/delay.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/of.h>
@@ -265,6 +266,8 @@ struct fsl_ssi_private {
 
 	u32 fifo_watermark;
 	u32 dma_maxburst;
+
+	struct mutex ac97_reg_lock;
 };
 
 /*
@@ -1260,11 +1263,13 @@ static void fsl_ssi_ac97_write(struct snd_ac97 *ac97, unsigned short reg,
 	if (reg > 0x7f)
 		return;
 
+	mutex_lock(&fsl_ac97_data->ac97_reg_lock);
+
 	ret = clk_prepare_enable(fsl_ac97_data->clk);
 	if (ret) {
 		pr_err("ac97 write clk_prepare_enable failed: %d\n",
 			ret);
-		return;
+		goto ret_unlock;
 	}
 
 	lreg = reg <<  12;
@@ -1278,6 +1283,9 @@ static void fsl_ssi_ac97_write(struct snd_ac97 *ac97, unsigned short reg,
 	udelay(100);
 
 	clk_disable_unprepare(fsl_ac97_data->clk);
+
+ret_unlock:
+	mutex_unlock(&fsl_ac97_data->ac97_reg_lock);
 }
 
 static unsigned short fsl_ssi_ac97_read(struct snd_ac97 *ac97,
@@ -1285,16 +1293,18 @@ static unsigned short fsl_ssi_ac97_read(struct snd_ac97 *ac97,
 {
 	struct regmap *regs = fsl_ac97_data->regs;
 
-	unsigned short val = -1;
+	unsigned short val = 0;
 	u32 reg_val;
 	unsigned int lreg;
 	int ret;
 
+	mutex_lock(&fsl_ac97_data->ac97_reg_lock);
+
 	ret = clk_prepare_enable(fsl_ac97_data->clk);
 	if (ret) {
 		pr_err("ac97 read clk_prepare_enable failed: %d\n",
 			ret);
-		return -1;
+		goto ret_unlock;
 	}
 
 	lreg = (reg & 0x7f) <<  12;
@@ -1309,6 +1319,8 @@ static unsigned short fsl_ssi_ac97_read(struct snd_ac97 *ac97,
 
 	clk_disable_unprepare(fsl_ac97_data->clk);
 
+ret_unlock:
+	mutex_unlock(&fsl_ac97_data->ac97_reg_lock);
 	return val;
 }
 
@@ -1569,6 +1581,7 @@ static int fsl_ssi_probe(struct platform_device *pdev)
 	}
 
 	if (fsl_ssi_is_ac97(ssi_private)) {
+		mutex_init(&ssi_private->ac97_reg_lock);
 		ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
 		if (ret) {
 			dev_err(&pdev->dev, "could not set AC'97 ops\n");
@@ -1663,6 +1676,9 @@ error_asoc_register:
 		snd_soc_set_ac97_ops(NULL);
 
 error_ac97_ops:
+	if (fsl_ssi_is_ac97(ssi_private))
+		mutex_destroy(&ssi_private->ac97_reg_lock);
+
 	if (ssi_private->soc->imx)
 		fsl_ssi_imx_clean(pdev, ssi_private);
 
@@ -1681,8 +1697,10 @@ static int fsl_ssi_remove(struct platform_device *pdev)
 	if (ssi_private->soc->imx)
 		fsl_ssi_imx_clean(pdev, ssi_private);
 
-	if (fsl_ssi_is_ac97(ssi_private))
+	if (fsl_ssi_is_ac97(ssi_private)) {
 		snd_soc_set_ac97_ops(NULL);
+		mutex_destroy(&ssi_private->ac97_reg_lock);
+	}
 
 	return 0;
 }

From 346cccf88319344c9f513bd85df6ae2258e8a8ea Mon Sep 17 00:00:00 2001
From: "oder_chiou@realtek.com" <oder_chiou@realtek.com>
Date: Mon, 20 Nov 2017 18:23:19 +0800
Subject: [PATCH 017/808] ASoC: rt5514: Add the sanity check for the
 driver_data in the resume function

If the rt5514 spi driver is loaded, but the snd_soc_platform_driver is not
loaded by the correct DAI settings, the NULL pointer will be gotten by
snd_soc_platform_get_drvdata in the resume function.

Signed-off-by: Oder Chiou <oder_chiou@realtek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/rt5514-spi.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/sound/soc/codecs/rt5514-spi.c b/sound/soc/codecs/rt5514-spi.c
index 2df91db765ac..ca6a90d8fc39 100644
--- a/sound/soc/codecs/rt5514-spi.c
+++ b/sound/soc/codecs/rt5514-spi.c
@@ -482,10 +482,13 @@ static int __maybe_unused rt5514_resume(struct device *dev)
 	if (device_may_wakeup(dev))
 		disable_irq_wake(irq);
 
-	if (rt5514_dsp->substream) {
-		rt5514_spi_burst_read(RT5514_IRQ_CTRL, (u8 *)&buf, sizeof(buf));
-		if (buf[0] & RT5514_IRQ_STATUS_BIT)
-			rt5514_schedule_copy(rt5514_dsp);
+	if (rt5514_dsp) {
+		if (rt5514_dsp->substream) {
+			rt5514_spi_burst_read(RT5514_IRQ_CTRL, (u8 *)&buf,
+				sizeof(buf));
+			if (buf[0] & RT5514_IRQ_STATUS_BIT)
+				rt5514_schedule_copy(rt5514_dsp);
+		}
 	}
 
 	return 0;

From d3b0535216f04e7e149eaebe8e967c46bdf88dc3 Mon Sep 17 00:00:00 2001
From: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Date: Fri, 17 Nov 2017 15:09:27 +0000
Subject: [PATCH 018/808] ASoC: da7219: Correct IRQ level in DT binding example

Current DT binding documentation shows an example where the IRQ
for the device is chosen to be ACTIVE_HIGH. This is incorrect as
the device only supports ACTIVE_LOW, so this commit fixes that
discrepancy.

Signed-off-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/sound/da7219.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/sound/da7219.txt b/Documentation/devicetree/bindings/sound/da7219.txt
index cf61681826b6..5b54d2d045c3 100644
--- a/Documentation/devicetree/bindings/sound/da7219.txt
+++ b/Documentation/devicetree/bindings/sound/da7219.txt
@@ -77,7 +77,7 @@ Example:
 		reg = <0x1a>;
 
 		interrupt-parent = <&gpio6>;
-		interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+		interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
 
 		VDD-supply = <&reg_audio>;
 		VDDMIC-supply = <&reg_audio>;

From b7926c464d6479fc62a4297ca4f48a5da5fb0988 Mon Sep 17 00:00:00 2001
From: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Date: Fri, 17 Nov 2017 15:09:28 +0000
Subject: [PATCH 019/808] ASoC: da7218: Correct IRQ level in DT binding example

Current DT binding documentation shows an example where the IRQ
for the device is chosen to be ACTIVE_HIGH. This is incorrect as
the device only supports ACTIVE_LOW, so this commit fixes that
discrepancy.

Signed-off-by: Adam Thomson <Adam.Thomson.Opensource@diasemi.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/sound/da7218.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/sound/da7218.txt b/Documentation/devicetree/bindings/sound/da7218.txt
index 5ca5a709b6aa..3ab9dfef38d1 100644
--- a/Documentation/devicetree/bindings/sound/da7218.txt
+++ b/Documentation/devicetree/bindings/sound/da7218.txt
@@ -73,7 +73,7 @@ Example:
 		compatible = "dlg,da7218";
 		reg = <0x1a>;
 		interrupt-parent = <&gpio6>;
-		interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+		interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
 		wakeup-source;
 
 		VDD-supply = <&reg_audio>;

From a91d7fb97092d6b840af5899ded3b389603fd7f1 Mon Sep 17 00:00:00 2001
From: Jiada Wang <jiada_wang@mentor.com>
Date: Tue, 28 Nov 2017 16:05:13 +0900
Subject: [PATCH 020/808] ASoC: rsnd: ssiu: clear SSI_MODE for non TDM Extended
 modes

register SSI_MODE is set when SSI works in TDM Extended,
but it isn't reset when SSI starts to work in other modes,
thus causes issues.

This patch clearss SSI_MODE register when SSI works in modes
other than TDM Extended.

Fixes: 186fadc132f0 ("ASoC: rsnd: add TDM Extend Mode support")
Signed-off-by: Jiada Wang <jiada_wang@mentor.com>
Acked-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sh/rcar/ssiu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sound/soc/sh/rcar/ssiu.c b/sound/soc/sh/rcar/ssiu.c
index 4d948757d300..6ff8a36c2c82 100644
--- a/sound/soc/sh/rcar/ssiu.c
+++ b/sound/soc/sh/rcar/ssiu.c
@@ -125,6 +125,7 @@ static int rsnd_ssiu_init_gen2(struct rsnd_mod *mod,
 {
 	int hdmi = rsnd_ssi_hdmi_port(io);
 	int ret;
+	u32 mode = 0;
 
 	ret = rsnd_ssiu_init(mod, io, priv);
 	if (ret < 0)
@@ -136,9 +137,11 @@ static int rsnd_ssiu_init_gen2(struct rsnd_mod *mod,
 		 * see
 		 *	rsnd_ssi_config_init()
 		 */
-		rsnd_mod_write(mod, SSI_MODE, 0x1);
+		mode = 0x1;
 	}
 
+	rsnd_mod_write(mod, SSI_MODE, mode);
+
 	if (rsnd_ssi_use_busif(io)) {
 		rsnd_mod_write(mod, SSI_BUSIF_ADINR,
 			       rsnd_get_adinr_bit(mod, io) |

From 329b4130bc5eb2a1b123a652b985dbdb08d6b9a8 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Thu, 23 Nov 2017 13:21:55 +0300
Subject: [PATCH 021/808] ARC: Fix detection of dual-issue enabled

As per PRM bit #0 ("D") in EXEC_CTRL enables dual-issue if set to 0,
otherwise if set to 1 all instructions are executed one at a time,
i.e. dual-issue is disabled.

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index 7ef7d9a8ff89..9d27331fe69a 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -199,7 +199,7 @@ static void read_arc_build_cfg_regs(void)
 			unsigned int exec_ctrl;
 
 			READ_BCR(AUX_EXEC_CTRL, exec_ctrl);
-			cpu->extn.dual_enb = exec_ctrl & 1;
+			cpu->extn.dual_enb = !(exec_ctrl & 1);
 
 			/* dual issue always present for this core */
 			cpu->extn.dual = 1;

From 6a53b7593233ab9e4f96873ebacc0f653a55c3e1 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 27 Nov 2017 11:15:16 -0800
Subject: [PATCH 022/808] xfrm: check id proto in validate_tmpl()

syzbot reported a kernel warning in xfrm_state_fini(), which
indicates that we have entries left in the list
net->xfrm.state_all whose proto is zero. And
xfrm_id_proto_match() doesn't consider them as a match with
IPSEC_PROTO_ANY in this case.

Proto with value 0 is probably not a valid value, at least
verify_newsa_info() doesn't consider it valid either.

This patch fixes it by checking the proto value in
validate_tmpl() and rejecting invalid ones, like what iproute2
does in xfrm_xfrmproto_getbyname().

Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 983b0233767b..c2cfcc6fdb34 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1445,6 +1445,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 		default:
 			return -EINVAL;
 		}
+
+		switch (ut[i].id.proto) {
+		case IPPROTO_AH:
+		case IPPROTO_ESP:
+		case IPPROTO_COMP:
+#if IS_ENABLED(CONFIG_IPV6)
+		case IPPROTO_ROUTING:
+		case IPPROTO_DSTOPTS:
+#endif
+		case IPSEC_PROTO_ANY:
+			break;
+		default:
+			return -EINVAL;
+		}
+
 	}
 
 	return 0;

From b89b6925bb9d48926d7ba713d3f13b14fc35c544 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicoleotsuka@gmail.com>
Date: Thu, 16 Nov 2017 11:55:18 -0800
Subject: [PATCH 023/808] ASoC: fsl_asrc: Fix typo in a field define

ASRFSTi_IAEi has an 11-bit offset as its _SHIFT macro defines.

So this patch just fixes that.

Reported-by: Laurent Charpentier <laurent.charpentier@nxp.com>
Signed-off-by: Nicolin Chen <nicoleotsuka@gmail.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/fsl/fsl_asrc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/soc/fsl/fsl_asrc.h b/sound/soc/fsl/fsl_asrc.h
index 0f163abe4ba3..52c27a358933 100644
--- a/sound/soc/fsl/fsl_asrc.h
+++ b/sound/soc/fsl/fsl_asrc.h
@@ -260,8 +260,8 @@
 #define ASRFSTi_OUTPUT_FIFO_SHIFT	12
 #define ASRFSTi_OUTPUT_FIFO_MASK	(((1 << ASRFSTi_OUTPUT_FIFO_WIDTH) - 1) << ASRFSTi_OUTPUT_FIFO_SHIFT)
 #define ASRFSTi_IAEi_SHIFT		11
-#define ASRFSTi_IAEi_MASK		(1 << ASRFSTi_OAFi_SHIFT)
-#define ASRFSTi_IAEi			(1 << ASRFSTi_OAFi_SHIFT)
+#define ASRFSTi_IAEi_MASK		(1 << ASRFSTi_IAEi_SHIFT)
+#define ASRFSTi_IAEi			(1 << ASRFSTi_IAEi_SHIFT)
 #define ASRFSTi_INPUT_FIFO_WIDTH	7
 #define ASRFSTi_INPUT_FIFO_SHIFT	0
 #define ASRFSTi_INPUT_FIFO_MASK		((1 << ASRFSTi_INPUT_FIFO_WIDTH) - 1)

From 15d8374874ded0bec37ef27f8301a6d54032c0e5 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 14 Nov 2017 14:43:27 +0000
Subject: [PATCH 024/808] mfd: cros ec: spi: Don't send first message too soon

On the Tegra124 Nyan-Big chromebook the very first SPI message sent to
the EC is failing.

The Tegra SPI driver configures the SPI chip-selects to be active-high
by default (and always has for many years). The EC SPI requires an
active-low chip-select and so the Tegra chip-select is reconfigured to
be active-low when the EC SPI driver calls spi_setup(). The problem is
that if the first SPI message to the EC is sent too soon after
reconfiguring the SPI chip-select, it fails.

The EC SPI driver prevents back-to-back SPI messages being sent too
soon by keeping track of the time the last transfer was sent via the
variable 'last_transfer_ns'. To prevent the very first transfer being
sent too soon, initialise the 'last_transfer_ns' variable after calling
spi_setup() and before sending the first SPI message.

Cc: <stable@vger.kernel.org>
Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_spi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index c9714072e224..a14196e95e9b 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -667,6 +667,7 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 			   sizeof(struct ec_response_get_protocol_info);
 	ec_dev->dout_size = sizeof(struct ec_host_request);
 
+	ec_spi->last_transfer_ns = ktime_get_ns();
 
 	err = cros_ec_register(ec_dev);
 	if (err) {

From 0a423772de2f3d7b00899987884f62f63ae00dcb Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Sat, 11 Nov 2017 16:38:43 +0100
Subject: [PATCH 025/808] mfd: twl4030-audio: Fix sibling-node lookup

A helper purported to look up a child node based on its name was using
the wrong of-helper and ended up prematurely freeing the parent of-node
while leaking any matching node.

To make things worse, any matching node would not even necessarily be a
child node as the whole device tree was searched depth-first starting at
the parent.

Fixes: 019a7e6b7b31 ("mfd: twl4030-audio: Add DT support")
Cc: stable <stable@vger.kernel.org>     # 3.7
Signed-off-by: Johan Hovold <johan@kernel.org>
Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/twl4030-audio.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/mfd/twl4030-audio.c b/drivers/mfd/twl4030-audio.c
index da16bf45fab4..dc94ffc6321a 100644
--- a/drivers/mfd/twl4030-audio.c
+++ b/drivers/mfd/twl4030-audio.c
@@ -159,13 +159,18 @@ unsigned int twl4030_audio_get_mclk(void)
 EXPORT_SYMBOL_GPL(twl4030_audio_get_mclk);
 
 static bool twl4030_audio_has_codec(struct twl4030_audio_data *pdata,
-			      struct device_node *node)
+			      struct device_node *parent)
 {
+	struct device_node *node;
+
 	if (pdata && pdata->codec)
 		return true;
 
-	if (of_find_node_by_name(node, "codec"))
+	node = of_get_child_by_name(parent, "codec");
+	if (node) {
+		of_node_put(node);
 		return true;
+	}
 
 	return false;
 }

From 85e9b13cbb130a3209f21bd7933933399c389ffe Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Sat, 11 Nov 2017 16:38:44 +0100
Subject: [PATCH 026/808] mfd: twl6040: Fix child-node lookup

Fix child-node lookup during probe, which ended up searching the whole
device tree depth-first starting at the parent rather than just matching
on its children.

To make things worse, the parent node was prematurely freed, while the
child node was leaked.

Note that the CONFIG_OF compile guard can be removed as
of_get_child_by_name() provides a !CONFIG_OF implementation which always
fails.

Cc: stable <stable@vger.kernel.org>     # 3.5
Fixes: 37e13cecaa14 ("mfd: Add support for Device Tree to twl6040")
Fixes: ca2cad6ae38e ("mfd: Fix twl6040 build failure")
Signed-off-by: Johan Hovold <johan@kernel.org>
Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/twl6040.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c
index d66502d36ba0..dd19f17a1b63 100644
--- a/drivers/mfd/twl6040.c
+++ b/drivers/mfd/twl6040.c
@@ -97,12 +97,16 @@ static struct reg_sequence twl6040_patch[] = {
 };
 
 
-static bool twl6040_has_vibra(struct device_node *node)
+static bool twl6040_has_vibra(struct device_node *parent)
 {
-#ifdef CONFIG_OF
-	if (of_find_node_by_name(node, "vibra"))
+	struct device_node *node;
+
+	node = of_get_child_by_name(parent, "vibra");
+	if (node) {
+		of_node_put(node);
 		return true;
-#endif
+	}
+
 	return false;
 }
 

From 001dde9400d5c3e9e2ce2abe06c1efa70a25dfde Mon Sep 17 00:00:00 2001
From: Shawn Nematbakhsh <shawnn@chromium.org>
Date: Wed, 27 Sep 2017 14:35:27 -0700
Subject: [PATCH 027/808] mfd: cros ec: spi: Fix "in progress" error signaling

For host commands that take a long time to process, cros ec can return
early by signaling a EC_RES_IN_PROGRESS result. The host must then poll
status with EC_CMD_GET_COMMS_STATUS until completion of the command.

None of the above applies when data link errors are encountered. When
errors such as EC_SPI_PAST_END are encountered during command
transmission, it usually means the command was not received by the EC.
Treating such errors as if they were 'EC_RES_IN_PROGRESS' results is
almost always the wrong decision, and can result in host commands
silently being lost.

Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Shawn Nematbakhsh <shawnn@chromium.org>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_spi.c | 52 ++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index a14196e95e9b..59c82cdcf48d 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -377,6 +377,7 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
 	u8 *ptr;
 	u8 *rx_buf;
 	u8 sum;
+	u8 rx_byte;
 	int ret = 0, final_ret;
 
 	len = cros_ec_prepare_tx(ec_dev, ec_msg);
@@ -421,25 +422,22 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
 	if (!ret) {
 		/* Verify that EC can process command */
 		for (i = 0; i < len; i++) {
-			switch (rx_buf[i]) {
-			case EC_SPI_PAST_END:
-			case EC_SPI_RX_BAD_DATA:
-			case EC_SPI_NOT_READY:
-				ret = -EAGAIN;
-				ec_msg->result = EC_RES_IN_PROGRESS;
-			default:
+			rx_byte = rx_buf[i];
+			if (rx_byte == EC_SPI_PAST_END  ||
+			    rx_byte == EC_SPI_RX_BAD_DATA ||
+			    rx_byte == EC_SPI_NOT_READY) {
+				ret = -EREMOTEIO;
 				break;
 			}
-			if (ret)
-				break;
 		}
-		if (!ret)
-			ret = cros_ec_spi_receive_packet(ec_dev,
-					ec_msg->insize + sizeof(*response));
-	} else {
-		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
 	}
 
+	if (!ret)
+		ret = cros_ec_spi_receive_packet(ec_dev,
+				ec_msg->insize + sizeof(*response));
+	else
+		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+
 	final_ret = terminate_request(ec_dev);
 
 	spi_bus_unlock(ec_spi->spi->master);
@@ -508,6 +506,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 	int i, len;
 	u8 *ptr;
 	u8 *rx_buf;
+	u8 rx_byte;
 	int sum;
 	int ret = 0, final_ret;
 
@@ -544,25 +543,22 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 	if (!ret) {
 		/* Verify that EC can process command */
 		for (i = 0; i < len; i++) {
-			switch (rx_buf[i]) {
-			case EC_SPI_PAST_END:
-			case EC_SPI_RX_BAD_DATA:
-			case EC_SPI_NOT_READY:
-				ret = -EAGAIN;
-				ec_msg->result = EC_RES_IN_PROGRESS;
-			default:
+			rx_byte = rx_buf[i];
+			if (rx_byte == EC_SPI_PAST_END  ||
+			    rx_byte == EC_SPI_RX_BAD_DATA ||
+			    rx_byte == EC_SPI_NOT_READY) {
+				ret = -EREMOTEIO;
 				break;
 			}
-			if (ret)
-				break;
 		}
-		if (!ret)
-			ret = cros_ec_spi_receive_response(ec_dev,
-					ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
-	} else {
-		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
 	}
 
+	if (!ret)
+		ret = cros_ec_spi_receive_response(ec_dev,
+				ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
+	else
+		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+
 	final_ret = terminate_request(ec_dev);
 
 	spi_bus_unlock(ec_spi->spi->master);

From da8df83957b179e5edc1029f637e5b69eff44967 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Wed, 29 Nov 2017 22:48:11 -0800
Subject: [PATCH 028/808] Input: joystick/analog - riscv has get_cycles()

Fixes:

drivers/input/joystick/analog.c:176:2: warning: #warning Precise timer not defined for this architecture. [-Wcpp]

Signed-off-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/joystick/analog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 3d8ff09eba57..c868a878c84f 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -163,7 +163,7 @@ static unsigned int get_time_pit(void)
 #define GET_TIME(x)	do { x = (unsigned int)rdtsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
-#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
+#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) || defined(CONFIG_TILE)
 #define GET_TIME(x)	do { x = get_cycles(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"get_cycles"

From 4c83c071b7849ca3e8072284a8587669d8ba6a3d Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 16 Nov 2017 16:09:29 -0800
Subject: [PATCH 029/808] Input: elants_i2c - do not clobber interrupt trigger
 on x86

This is similar to commit a4b0a58bb142 ("Input: elan_i2c - do not
clobber interrupt trigger on x86")

On x86 we historically used falling edge interrupts in the driver
because that's how first Chrome devices were configured. They also
did not use ACPI to enumerate I2C devices (because back then there
was no kernel support for that), so trigger was hard-coded in the
driver. However the controller behavior is much more reliable if
we use level triggers, and that is how we configured ARM devices,
and how want to configure newer x86 devices as well. All newer
x86 boxes have their I2C devices enumerated in ACPI.

Let's see if platform code (ACPI, DT) described interrupt and
specified particular trigger type, and if so, let's use it instead
of always clobbering trigger with IRQF_TRIGGER_FALLING. We will
still use this trigger type as a fallback if platform code left
interrupt trigger unconfigured.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/elants_i2c.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c
index e102d7764bc2..a458e5ec9e41 100644
--- a/drivers/input/touchscreen/elants_i2c.c
+++ b/drivers/input/touchscreen/elants_i2c.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/async.h>
 #include <linux/i2c.h>
@@ -1261,10 +1262,13 @@ static int elants_i2c_probe(struct i2c_client *client,
 	}
 
 	/*
-	 * Systems using device tree should set up interrupt via DTS,
-	 * the rest will use the default falling edge interrupts.
+	 * Platform code (ACPI, DTS) should normally set up interrupt
+	 * for us, but in case it did not let's fall back to using falling
+	 * edge to be compatible with older Chromebooks.
 	 */
-	irqflags = client->dev.of_node ? 0 : IRQF_TRIGGER_FALLING;
+	irqflags = irq_get_trigger_type(client->irq);
+	if (!irqflags)
+		irqflags = IRQF_TRIGGER_FALLING;
 
 	error = devm_request_threaded_irq(&client->dev, client->irq,
 					  NULL, elants_i2c_irq,

From 51f493ae71adc2c49a317a13c38e54e1cdf46005 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 30 Nov 2017 10:15:02 +0000
Subject: [PATCH 030/808] ASoC: codecs: msm8916-wcd: Fix supported formats

This codec is configurable for only 16 bit and 32 bit samples, so reflect
this in the supported formats also remove 24bit sample from supported list.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 sound/soc/codecs/msm8916-wcd-analog.c  | 2 +-
 sound/soc/codecs/msm8916-wcd-digital.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c
index 5f3c42c4f74a..066ea2f4ce7b 100644
--- a/sound/soc/codecs/msm8916-wcd-analog.c
+++ b/sound/soc/codecs/msm8916-wcd-analog.c
@@ -267,7 +267,7 @@
 #define MSM8916_WCD_ANALOG_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\
 			SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000)
 #define MSM8916_WCD_ANALOG_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
-				    SNDRV_PCM_FMTBIT_S24_LE)
+				    SNDRV_PCM_FMTBIT_S32_LE)
 
 static int btn_mask = SND_JACK_BTN_0 | SND_JACK_BTN_1 |
 	       SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_BTN_4;
diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c
index a10a724eb448..13354d6304a8 100644
--- a/sound/soc/codecs/msm8916-wcd-digital.c
+++ b/sound/soc/codecs/msm8916-wcd-digital.c
@@ -194,7 +194,7 @@
 				   SNDRV_PCM_RATE_32000 | \
 				   SNDRV_PCM_RATE_48000)
 #define MSM8916_WCD_DIGITAL_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
-				     SNDRV_PCM_FMTBIT_S24_LE)
+				     SNDRV_PCM_FMTBIT_S32_LE)
 
 struct msm8916_wcd_digital_priv {
 	struct clk *ahbclk, *mclk;
@@ -645,7 +645,7 @@ static int msm8916_wcd_digital_hw_params(struct snd_pcm_substream *substream,
 				    RX_I2S_CTL_RX_I2S_MODE_MASK,
 				    RX_I2S_CTL_RX_I2S_MODE_16);
 		break;
-	case SNDRV_PCM_FORMAT_S24_LE:
+	case SNDRV_PCM_FORMAT_S32_LE:
 		snd_soc_update_bits(dai->codec, LPASS_CDC_CLK_TX_I2S_CTL,
 				    TX_I2S_CTL_TX_I2S_MODE_MASK,
 				    TX_I2S_CTL_TX_I2S_MODE_32);

From 737e0b7b67bdfe24090fab2852044bb283282fc5 Mon Sep 17 00:00:00 2001
From: "Andrew F. Davis" <afd@ti.com>
Date: Wed, 29 Nov 2017 15:32:46 -0600
Subject: [PATCH 031/808] ASoC: tlv320aic31xx: Fix GPIO1 register definition

GPIO1 control register is number 51, fix this here.

Fixes: bafcbfe429eb ("ASoC: tlv320aic31xx: Make the register values human readable")
Signed-off-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 sound/soc/codecs/tlv320aic31xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h
index 730fb2058869..1ff3edb7bbb6 100644
--- a/sound/soc/codecs/tlv320aic31xx.h
+++ b/sound/soc/codecs/tlv320aic31xx.h
@@ -116,7 +116,7 @@ struct aic31xx_pdata {
 /* INT2 interrupt control */
 #define AIC31XX_INT2CTRL	AIC31XX_REG(0, 49)
 /* GPIO1 control */
-#define AIC31XX_GPIO1		AIC31XX_REG(0, 50)
+#define AIC31XX_GPIO1		AIC31XX_REG(0, 51)
 
 #define AIC31XX_DACPRB		AIC31XX_REG(0, 60)
 /* ADC Instruction Set Register */

From 8d26fdfcb45dc420115b267ac9d6b3ac13457f1b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 30 Nov 2017 14:35:08 +0100
Subject: [PATCH 032/808] spi: Fix double "when"

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 7b2170bfd6e7..bc6bb325d1bf 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -126,7 +126,7 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats,
  *	for that name.  This appears in the sysfs "modalias" attribute
  *	for driver coldplugging, and in uevents used for hotplugging
  * @cs_gpio: gpio number of the chipselect line (optional, -ENOENT when
- *	when not using a GPIO line)
+ *	not using a GPIO line)
  *
  * @statistics: statistics for the spi_device
  *

From e719135881f00c01ca400abb8a5dadaf297a24f9 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Wed, 29 Nov 2017 18:23:56 +0100
Subject: [PATCH 033/808] xfrm: fix XFRMA_OUTPUT_MARK policy entry

This seems to be an obvious typo, NLA_U32 is type of the attribute, not its
(minimal) length.

Fixes: 077fbac405bf ("net: xfrm: support setting an output mark.")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c2cfcc6fdb34..ff58c37469d6 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2485,7 +2485,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_PROTO]		= { .type = NLA_U8 },
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
 	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
-	[XFRMA_OUTPUT_MARK]	= { .len = NLA_U32 },
+	[XFRMA_OUTPUT_MARK]	= { .type = NLA_U32 },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {

From 4ce3dbe397d7b6b15f272ae757c78c35e9e4b61d Mon Sep 17 00:00:00 2001
From: Aviv Heller <avivh@mellanox.com>
Date: Tue, 28 Nov 2017 19:55:40 +0200
Subject: [PATCH 034/808] xfrm: Fix xfrm_input() to verify state is valid when
 (encap_type < 0)

Code path when (encap_type < 0) does not verify the state is valid
before progressing.

This will result in a crash if, for instance, x->km.state ==
XFRM_STATE_ACQ.

Fixes: 7785bba299a8 ("esp: Add a software GRO codepath")
Signed-off-by: Aviv Heller <avivh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 347ab31574d5..da6447389ffb 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -207,7 +207,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	xfrm_address_t *daddr;
 	struct xfrm_mode *inner_mode;
 	u32 mark = skb->mark;
-	unsigned int family;
+	unsigned int family = AF_UNSPEC;
 	int decaps = 0;
 	int async = 0;
 	bool xfrm_gro = false;
@@ -216,6 +216,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 	if (encap_type < 0) {
 		x = xfrm_input_state(skb);
+
+		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+			if (x->km.state == XFRM_STATE_ACQ)
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
+			else
+				XFRM_INC_STATS(net,
+					       LINUX_MIB_XFRMINSTATEINVALID);
+			goto drop;
+		}
+
 		family = x->outer_mode->afinfo->family;
 
 		/* An encap_type of -1 indicates async resumption. */

From ddc47e4404b58f03e98345398fb12d38fe291512 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 29 Nov 2017 06:53:55 +0100
Subject: [PATCH 035/808] xfrm: Fix stack-out-of-bounds read on socket policy
 lookup.

When we do tunnel or beet mode, we pass saddr and daddr from the
template to xfrm_state_find(), this is ok. On transport mode,
we pass the addresses from the flowi, assuming that the IP
addresses (and address family) don't change during transformation.
This assumption is wrong in the IPv4 mapped IPv6 case, packet
is IPv4 and template is IPv6.

Fix this by catching address family missmatches of the policy
and the flow already before we do the lookup.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..038ec68f6901 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1168,9 +1168,15 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
  again:
 	pol = rcu_dereference(sk->sk_policy[dir]);
 	if (pol != NULL) {
-		bool match = xfrm_selector_match(&pol->selector, fl, family);
+		bool match;
 		int err = 0;
 
+		if (pol->family != family) {
+			pol = NULL;
+			goto out;
+		}
+
+		match = xfrm_selector_match(&pol->selector, fl, family);
 		if (match) {
 			if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
 				pol = NULL;

From 56075f6072e7fdac302cff4e1b4c93b64ced99ab Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Sun, 26 Nov 2017 15:34:04 +1100
Subject: [PATCH 036/808] HID: holtekff: move MODULE_* parameters out of #ifdef
 block

If you compile with:
CONFIG_HID_HOLTEK=m
CONFIG_HOLTEK_FF is not set

You get the following warning:
WARNING: modpost: missing MODULE_LICENSE() in drivers/hid/hid-holtekff.o
see include/linux/module.h for more information

Fix this by moving the module info out of the #ifdef CONFIG_HOLTEK_FF
block and into the un-guarded part of the file.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Acked-by: Anssi Hannula <anssi.hannula@iki.fi>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-holtekff.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/hid/hid-holtekff.c b/drivers/hid/hid-holtekff.c
index 9325545fc3ae..edc0f64bb584 100644
--- a/drivers/hid/hid-holtekff.c
+++ b/drivers/hid/hid-holtekff.c
@@ -32,10 +32,6 @@
 
 #ifdef CONFIG_HOLTEK_FF
 
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
-MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");
-
 /*
  * These commands and parameters are currently known:
  *
@@ -223,3 +219,7 @@ static struct hid_driver holtek_driver = {
 	.probe = holtek_probe,
 };
 module_hid_driver(holtek_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
+MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");

From 741f5afbba70ff3cddcc5bba2595d9a44fa722e5 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Sat, 2 Dec 2017 17:36:45 +0100
Subject: [PATCH 037/808] ARM: dts: rockchip: add cpu0-regulator on
 rk3066a-marsboard

The rk3066 also has operating points now, but without adjusting
the cpu-regulator will break once higher voltages are needed for
a specific frequency, so add the needed cpu0-regulator.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm/boot/dts/rk3066a-marsboard.dts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/rk3066a-marsboard.dts b/arch/arm/boot/dts/rk3066a-marsboard.dts
index c6d92c25df42..d23ee6d911ac 100644
--- a/arch/arm/boot/dts/rk3066a-marsboard.dts
+++ b/arch/arm/boot/dts/rk3066a-marsboard.dts
@@ -83,6 +83,10 @@
 	};
 };
 
+&cpu0 {
+	cpu0-supply = <&vdd_arm>;
+};
+
 &i2c1 {
 	status = "okay";
 	clock-frequency = <400000>;

From 912d7985f3cef1b901a4fd9fede549b919fe7ac3 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 9 Nov 2017 16:35:35 -0600
Subject: [PATCH 038/808] ARM: dts: rockchip: fix rk3288 iep-IOMMU interrupts
 property cells

The interrupts property in the iep-IOMMU node for the rk3288 dts file has a
spurious extra cell causing a dtc warning:

Warning (interrupts_property): interrupts size is (16), expected multiple of 12 in /iommu@ff900800

Remove the extra cell.

Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm/boot/dts/rk3288.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index cd24894ee5c6..6102e4e7f35c 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -956,7 +956,7 @@
 	iep_mmu: iommu@ff900800 {
 		compatible = "rockchip,iommu";
 		reg = <0x0 0xff900800 0x0 0x40>;
-		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH 0>;
+		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
 		interrupt-names = "iep_mmu";
 		#iommu-cells = <0>;
 		status = "disabled";

From 3fa8c49f27c15df259b7b8f94eb126ae491893fd Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Mon, 4 Dec 2017 18:36:10 +0100
Subject: [PATCH 039/808] arm64: dts: rockchip: fix trailing 0 in rk3328 tsadc
 interrupts

Probably due to some copy-paste mistake, the tsadc of rk3328 ended up
with a 0 as 4th element that shouldn't be there, as interrupts on the
rk3328 only have multiples of 3, making dtc complain. So remove it.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
index 41d61840fb99..2426da631938 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
@@ -514,7 +514,7 @@
 	tsadc: tsadc@ff250000 {
 		compatible = "rockchip,rk3328-tsadc";
 		reg = <0x0 0xff250000 0x0 0x100>;
-		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH 0>;
+		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
 		assigned-clocks = <&cru SCLK_TSADC>;
 		assigned-clock-rates = <50000>;
 		clocks = <&cru SCLK_TSADC>, <&cru PCLK_TSADC>;

From adf6895754e2503d994a765535fd1813f8834674 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 30 Nov 2017 19:42:52 -0800
Subject: [PATCH 040/808] acpi, nfit: fix health event notification

Integration testing with a BIOS that generates injected health event
notifications fails to communicate those events to userspace. The nfit
driver neglects to link the ACPI DIMM device with the necessary driver
data so acpi_nvdimm_notify() fails this lookup:

        nfit_mem = dev_get_drvdata(dev);
        if (nfit_mem && nfit_mem->flags_attr)
                sysfs_notify_dirent(nfit_mem->flags_attr);

Add the necessary linkage when installing the notification handler and
clean it up when the nfit driver instance is torn down.

Cc: <stable@vger.kernel.org>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Fixes: ba9c8dd3c222 ("acpi, nfit: add dimm device notification support")
Reported-by: Daniel Osawa <daniel.k.osawa@intel.com>
Tested-by: Daniel Osawa <daniel.k.osawa@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ff2580e7611d..abeb4df4f22e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1670,6 +1670,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 				dev_name(&adev_dimm->dev));
 		return -ENXIO;
 	}
+	/*
+	 * Record nfit_mem for the notification path to track back to
+	 * the nfit sysfs attributes for this dimm device object.
+	 */
+	dev_set_drvdata(&adev_dimm->dev, nfit_mem);
 
 	/*
 	 * Until standardization materializes we need to consider 4
@@ -1752,9 +1757,11 @@ static void shutdown_dimm_notify(void *data)
 			sysfs_put(nfit_mem->flags_attr);
 			nfit_mem->flags_attr = NULL;
 		}
-		if (adev_dimm)
+		if (adev_dimm) {
 			acpi_remove_notify_handler(adev_dimm->handle,
 					ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify);
+			dev_set_drvdata(&adev_dimm->dev, NULL);
+		}
 	}
 	mutex_unlock(&acpi_desc->init_mutex);
 }

From bc53e3aa88e8240823c1c440e6bab3c3a5ba5f59 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Mon, 27 Nov 2017 17:31:01 +0100
Subject: [PATCH 041/808] ARM: dts: at91: disable the nxp,se97b SMBUS timeout
 on the TSE-850

The I2C adapter driver is sometimes slow, causing the SCL line to
be stuck low for more than the stipulated SMBUS timeout of 25-35 ms.
This causes the client device to give up which in turn causes silent
corruption of data. So, disable the SMBUS timeout in the client device.

Signed-off-by: Peter Rosin <peda@axentia.se>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 arch/arm/boot/dts/at91-tse850-3.dts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/at91-tse850-3.dts b/arch/arm/boot/dts/at91-tse850-3.dts
index 5f29010cdbd8..9b82cc8843e1 100644
--- a/arch/arm/boot/dts/at91-tse850-3.dts
+++ b/arch/arm/boot/dts/at91-tse850-3.dts
@@ -221,6 +221,7 @@
 	jc42@18 {
 		compatible = "nxp,se97b", "jedec,jc-42.4-temp";
 		reg = <0x18>;
+		smbus-timeout-disable;
 	};
 
 	dpot: mcp4651-104@28 {

From e2bf801ecd4e62222a46d1ba9e57e710171d29c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Br=C3=BCns?= <stefan.bruens@rwth-aachen.de>
Date: Mon, 27 Nov 2017 20:05:34 +0100
Subject: [PATCH 042/808] sunxi-rsb: Include OF based modalias in device uevent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include the OF-based modalias in the uevent sent when registering devices
on the sunxi RSB bus, so that user space has a chance to autoload the
kernel module for the device.

Fixes a regression caused by commit 3f241bfa60bd ("arm64: allwinner: a64:
pine64: Use dcdc1 regulator for mmc0"). When the axp20x-rsb module for
the AXP803 PMIC is built as a module, it is not loaded and the system
ends up with an disfunctional MMC controller.

Fixes: d787dcdb9c8f ("bus: sunxi-rsb: Add driver for Allwinner Reduced Serial Bus")
Cc: stable <stable@vger.kernel.org> # 4.4.x 7a3b7cd332db of: device: Export of_device_{get_modalias, uvent_modalias} to modules
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 drivers/bus/sunxi-rsb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c
index 328ca93781cf..1b76d9585902 100644
--- a/drivers/bus/sunxi-rsb.c
+++ b/drivers/bus/sunxi-rsb.c
@@ -178,6 +178,7 @@ static struct bus_type sunxi_rsb_bus = {
 	.match		= sunxi_rsb_device_match,
 	.probe		= sunxi_rsb_device_probe,
 	.remove		= sunxi_rsb_device_remove,
+	.uevent		= of_device_uevent_modalias,
 };
 
 static void sunxi_rsb_dev_release(struct device *dev)

From e17e237cd69f9f6ecaa0e875f889ad401a625148 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 4 Dec 2017 16:44:01 +0800
Subject: [PATCH 043/808] ARM: dts: sunxi: Convert to CCU index macros for HDMI
 controller

When the HDMI controller device node was added, the needed PLL clock
macros were not exported. A separate patch addresses that, but it is
merged through a different tree.

Now that both patches are in mainline proper, we can convert the raw
numbers to proper macros.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm/boot/dts/sun4i-a10.dtsi  | 4 ++--
 arch/arm/boot/dts/sun5i-a10s.dtsi | 4 ++--
 arch/arm/boot/dts/sun6i-a31.dtsi  | 4 ++--
 arch/arm/boot/dts/sun7i-a20.dtsi  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi
index b91300d49a31..5840f5c75c3b 100644
--- a/arch/arm/boot/dts/sun4i-a10.dtsi
+++ b/arch/arm/boot/dts/sun4i-a10.dtsi
@@ -502,8 +502,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <58>;
 			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 18>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,
diff --git a/arch/arm/boot/dts/sun5i-a10s.dtsi b/arch/arm/boot/dts/sun5i-a10s.dtsi
index 6ae4d95e230e..316cb8b2945b 100644
--- a/arch/arm/boot/dts/sun5i-a10s.dtsi
+++ b/arch/arm/boot/dts/sun5i-a10s.dtsi
@@ -82,8 +82,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <58>;
 			clocks = <&ccu CLK_AHB_HDMI>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 16>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,
diff --git a/arch/arm/boot/dts/sun6i-a31.dtsi b/arch/arm/boot/dts/sun6i-a31.dtsi
index 8bfa12b548e0..72d3fe44ecaf 100644
--- a/arch/arm/boot/dts/sun6i-a31.dtsi
+++ b/arch/arm/boot/dts/sun6i-a31.dtsi
@@ -429,8 +429,8 @@
 			interrupts = <GIC_SPI 88 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_AHB1_HDMI>, <&ccu CLK_HDMI>,
 				 <&ccu CLK_HDMI_DDC>,
-				 <&ccu 7>,
-				 <&ccu 13>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "ddc", "pll-0", "pll-1";
 			resets = <&ccu RST_AHB1_HDMI>;
 			reset-names = "ahb";
diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index 68dfa82544fc..59655e42e4b0 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -581,8 +581,8 @@
 			reg = <0x01c16000 0x1000>;
 			interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>,
-				 <&ccu 9>,
-				 <&ccu 18>;
+				 <&ccu CLK_PLL_VIDEO0_2X>,
+				 <&ccu CLK_PLL_VIDEO1_2X>;
 			clock-names = "ahb", "mod", "pll-0", "pll-1";
 			dmas = <&dma SUN4I_DMA_NORMAL 16>,
 			       <&dma SUN4I_DMA_NORMAL 16>,

From 7d556bfc49adddf2beb0d16c91945c3b8b783282 Mon Sep 17 00:00:00 2001
From: Jagan Teki <jagannadh.teki@gmail.com>
Date: Mon, 4 Dec 2017 10:23:07 +0530
Subject: [PATCH 044/808] arm64: allwinner: a64-sopine: Fix to use dcdc1
 regulator instead of vcc3v3

Since current tree support AXP803 regulators,
replace fixed regulator vcc3v3 with AXP803 dcdc1 regulator where ever
it need to replace.

Tested mmc0 on sopine baseboard.

Signed-off-by: Jagan Teki <jagan@amarulasolutions.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 .../dts/allwinner/sun50i-a64-sopine-baseboard.dts     |  2 +-
 arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi  | 11 +----------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index a053a6ac5267..abe179de35d7 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -96,7 +96,7 @@
 &mmc2 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc2_pins>;
-	vmmc-supply = <&reg_vcc3v3>;
+	vmmc-supply = <&reg_dcdc1>;
 	vqmmc-supply = <&reg_vcc1v8>;
 	bus-width = <8>;
 	non-removable;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
index a5da18a6f286..43418bd881d8 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi
@@ -45,19 +45,10 @@
 
 #include "sun50i-a64.dtsi"
 
-/ {
-	reg_vcc3v3: vcc3v3 {
-		compatible = "regulator-fixed";
-		regulator-name = "vcc3v3";
-		regulator-min-microvolt = <3300000>;
-		regulator-max-microvolt = <3300000>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins>;
-	vmmc-supply = <&reg_vcc3v3>;
+	vmmc-supply = <&reg_dcdc1>;
 	non-removable;
 	disable-wp;
 	bus-width = <4>;

From f88e9301948173dd35afad4a6939092c7f269aed Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Fri, 3 Nov 2017 22:58:54 +0300
Subject: [PATCH 045/808] arm64: dts: orange-pi-zero-plus2: fix sdcard detect

The sdcard detect pin on orange-pi-zero-plus2 is pulled up.
Fix cd-gpio description to enable sdcard detect.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
index b6b7a561df8c..a42fd79a62a3 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts
@@ -71,7 +71,7 @@
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
 	vmmc-supply = <&reg_vcc3v3>;
 	bus-width = <4>;
-	cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>;
+	cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>;
 	status = "okay";
 };
 

From 588fb54b0cc5be5fd2e12bb04810534ffc3d49cc Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 30 Nov 2017 13:14:51 +0100
Subject: [PATCH 046/808] clk: Manage proper runtime PM state in
 clk_change_rate()

clk_change_rate() propagates rate change down to all its children. Such
operation requires managing proper runtime PM state of each child, what
was missing. Add needed calls to clk_pm_runtime*() to ensure that
set_rate() clock callback is called on runtime active clock.

This fixes following issue found on Exynos5433 TM2 board with devfreq
enabled:

Synchronous External Abort: synchronous external abort (0x96000210) at 0xffffff80093f5600
Internal error: : 96000210 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 PID: 5 Comm: kworker/u16:0 Not tainted 4.15.0-rc1-next-20171129+ #4
Hardware name: Samsung TM2 board (DT)
Workqueue: devfreq_wq devfreq_monitor
task: ffffffc0ca96b600 task.stack: ffffff80093a8000
pstate: a0000085 (NzCv daIf -PAN -UAO)
pc : clk_divider_set_rate+0x54/0x118
lr : clk_divider_set_rate+0x44/0x118
...
Process kworker/u16:0 (pid: 5, stack limit = 0xffffff80093a8000)
Call trace:
 clk_divider_set_rate+0x54/0x118
 clk_change_rate+0xfc/0x4e0
 clk_change_rate+0x1f0/0x4e0
 clk_change_rate+0x1f0/0x4e0
 clk_change_rate+0x1f0/0x4e0
 clk_core_set_rate_nolock+0x138/0x148
 clk_set_rate+0x28/0x50
 exynos_bus_passive_target+0x6c/0x11c
 update_devfreq_passive+0x58/0xb4
 devfreq_passive_notifier_call+0x50/0x5c
 notifier_call_chain+0x4c/0x88
 __srcu_notifier_call_chain+0x54/0x80
 srcu_notifier_call_chain+0x14/0x1c
 update_devfreq+0x100/0x1b4
 devfreq_monitor+0x2c/0x88
 process_one_work+0x148/0x3d8
 worker_thread+0x13c/0x3f8
 kthread+0x100/0x12c
 ret_from_fork+0x10/0x18

Reported-by: Chanwoo Choi <cw00.choi@samsung.com>
Fixes: 9a34b45397e5 ("clk: Add support for runtime PM")
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 647d056df88c..8a1860a36c77 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -1564,6 +1564,9 @@ static void clk_change_rate(struct clk_core *core)
 		best_parent_rate = core->parent->rate;
 	}
 
+	if (clk_pm_runtime_get(core))
+		return;
+
 	if (core->flags & CLK_SET_RATE_UNGATE) {
 		unsigned long flags;
 
@@ -1634,6 +1637,8 @@ static void clk_change_rate(struct clk_core *core)
 	/* handle the new child who might not be in core->children yet */
 	if (core->new_child)
 		clk_change_rate(core->new_child);
+
+	clk_pm_runtime_put(core);
 }
 
 static int clk_core_set_rate_nolock(struct clk_core *core,

From 975b820b6836b6b6c42fb84cd2e772e2b41bca67 Mon Sep 17 00:00:00 2001
From: Cai Li <cai.li@spreadtrum.com>
Date: Tue, 21 Nov 2017 17:24:38 +0800
Subject: [PATCH 047/808] clk: fix a panic error caused by accessing NULL
 pointer

In some cases the clock parent would be set NULL when doing re-parent,
it will cause a NULL pointer accessing if clk_set trace event is
enabled.

This patch sets the parent as "none" if the input parameter is NULL.

Fixes: dfc202ead312 (clk: Add tracepoints for hardware operations)
Signed-off-by: Cai Li <cai.li@spreadtrum.com>
Signed-off-by: Chunyan Zhang <chunyan.zhang@spreadtrum.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 include/trace/events/clk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/clk.h b/include/trace/events/clk.h
index 758607226bfd..2cd449328aee 100644
--- a/include/trace/events/clk.h
+++ b/include/trace/events/clk.h
@@ -134,12 +134,12 @@ DECLARE_EVENT_CLASS(clk_parent,
 
 	TP_STRUCT__entry(
 		__string(        name,           core->name                )
-		__string(        pname,          parent->name              )
+		__string(        pname, parent ? parent->name : "none"     )
 	),
 
 	TP_fast_assign(
 		__assign_str(name, core->name);
-		__assign_str(pname, parent->name);
+		__assign_str(pname, parent ? parent->name : "none");
 	),
 
 	TP_printk("%s %s", __get_str(name), __get_str(pname))

From 87eba0716011e528f7841026f2cc65683219d0ad Mon Sep 17 00:00:00 2001
From: Klaus Goger <klaus.goger@theobroma-systems.com>
Date: Tue, 5 Dec 2017 08:11:58 +0100
Subject: [PATCH 048/808] arm64: dts: rockchip: remove vdd_log from rk3399-puma

vdd_log has no consumer and therefore will not be set to a specific
voltage. Still the PWM output pin gets configured and thence the vdd_log
output voltage will changed from it's default. Depending on the idle
state of the PWM this will slightly over or undervoltage the logic supply
of the RK3399 and cause instability with GbE (undervoltage) and PCIe
(overvoltage). Since the default value set by a voltage divider is the
correct supply voltage and we don't need to change it during runtime we
remove the rail from the devicetree completely so the PWM pin will not
be configured.

Signed-off-by: Klaus Goger <klaus.goger@theobroma-systems.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
index 910628d18add..1fc5060d7027 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi
@@ -155,17 +155,6 @@
 		regulator-min-microvolt = <5000000>;
 		regulator-max-microvolt = <5000000>;
 	};
-
-	vdd_log: vdd-log {
-		compatible = "pwm-regulator";
-		pwms = <&pwm2 0 25000 0>;
-		regulator-name = "vdd_log";
-		regulator-min-microvolt = <800000>;
-		regulator-max-microvolt = <1400000>;
-		regulator-always-on;
-		regulator-boot-on;
-		status = "okay";
-	};
 };
 
 &cpu_b0 {

From bc631943faba6fc3f755748091ada31798fb7d50 Mon Sep 17 00:00:00 2001
From: Heiko Stuebner <heiko@sntech.de>
Date: Wed, 6 Dec 2017 01:10:05 +0100
Subject: [PATCH 049/808] arm64: dts: rockchip: limit rk3328-rock64 gmac speed
 to 100MBit for now

It looks like either the current kernel or the hardware has reliability
issues when the gmac is actually running at 1GBit. In my test-case
it is not able to boot on a nfsroot at this speed, as the system
will always lose the connection to the nfs-server during boot, before
reaching any login prompt and not recover from this.

So until this is solved, limit the speed to 100MBit as with this the
nfsroot survives stress tests like an apt-get upgrade without problems.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 arch/arm64/boot/dts/rockchip/rk3328-rock64.dts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
index d4f80786e7c2..3890468678ce 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
@@ -132,6 +132,8 @@
 	assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>;
 	assigned-clock-parents = <&gmac_clkin>, <&gmac_clkin>;
 	clock_in_out = "input";
+	/* shows instability at 1GBit right now */
+	max-speed = <100>;
 	phy-supply = <&vcc_io>;
 	phy-mode = "rgmii";
 	pinctrl-names = "default";

From 3073774e638ef18d222465fe92bfc8fccb90d288 Mon Sep 17 00:00:00 2001
From: Serhii Popovych <spopovyc@redhat.com>
Date: Mon, 4 Dec 2017 09:36:41 -0500
Subject: [PATCH 050/808] KVM: PPC: Book3S HV: Drop prepare_done from struct
 kvm_resize_hpt

Currently the kvm_resize_hpt structure has two fields relevant to the
state of an ongoing resize: 'prepare_done', which indicates whether
the worker thread has completed or not, and 'error' which indicates
whether it was successful or not.

Since the success/failure isn't known until completion, this is
confusingly redundant.  This patch consolidates the information into
just the 'error' value: -EBUSY indicates the worked is still in
progress, other negative values indicate (completed) failure, 0
indicates successful completion.

As a bonus this reduces size of struct kvm_resize_hpt by
__alignof__(struct kvm_hpt_info) and saves few bytes of code.

While there correct comment in struct kvm_resize_hpt which references
a non-existent semaphore (leftover from an early draft).

Assert with WARN_ON() in case of HPT allocation thread work runs more
than once for resize request or resize_hpt_allocate() returns -EBUSY
that is treated specially.

Change comparison against zero to make checkpatch.pl happy.

Cc: stable@vger.kernel.org # v4.10+
Signed-off-by: Serhii Popovych <spopovyc@redhat.com>
[dwg: Changed BUG_ON()s to WARN_ON()s and altered commit message for
 clarity]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 46 ++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 966097232d21..f5f2c6bf5856 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -65,11 +65,17 @@ struct kvm_resize_hpt {
 	u32 order;
 
 	/* These fields protected by kvm->lock */
-	int error;
-	bool prepare_done;
 
-	/* Private to the work thread, until prepare_done is true,
-	 * then protected by kvm->resize_hpt_sem */
+	/* Possible values and their usage:
+	 *  <0     an error occurred during allocation,
+	 *  -EBUSY allocation is in the progress,
+	 *  0      allocation made successfuly.
+	 */
+	int error;
+
+	/* Private to the work thread, until error != -EBUSY,
+	 * then protected by kvm->lock.
+	 */
 	struct kvm_hpt_info hpt;
 };
 
@@ -1433,15 +1439,23 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 	struct kvm *kvm = resize->kvm;
 	int err;
 
+	if (WARN_ON(resize->error != -EBUSY))
+		return;
+
 	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
 			 resize->order);
 
 	err = resize_hpt_allocate(resize);
 
+	/* We have strict assumption about -EBUSY
+	 * when preparing for HPT resize.
+	 */
+	if (WARN_ON(err == -EBUSY))
+		err = -EINPROGRESS;
+
 	mutex_lock(&kvm->lock);
 
 	resize->error = err;
-	resize->prepare_done = true;
 
 	mutex_unlock(&kvm->lock);
 }
@@ -1466,14 +1480,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 
 	if (resize) {
 		if (resize->order == shift) {
-			/* Suitable resize in progress */
-			if (resize->prepare_done) {
-				ret = resize->error;
-				if (ret != 0)
-					resize_hpt_release(kvm, resize);
-			} else {
+			/* Suitable resize in progress? */
+			ret = resize->error;
+			if (ret == -EBUSY)
 				ret = 100; /* estimated time in ms */
-			}
+			else if (ret)
+				resize_hpt_release(kvm, resize);
 
 			goto out;
 		}
@@ -1493,6 +1505,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 		ret = -ENOMEM;
 		goto out;
 	}
+
+	resize->error = -EBUSY;
 	resize->order = shift;
 	resize->kvm = kvm;
 	INIT_WORK(&resize->work, resize_hpt_prepare_work);
@@ -1547,16 +1561,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 	if (!resize || (resize->order != shift))
 		goto out;
 
-	ret = -EBUSY;
-	if (!resize->prepare_done)
-		goto out;
-
 	ret = resize->error;
-	if (ret != 0)
+	if (ret)
 		goto out;
 
 	ret = resize_hpt_rehash(resize);
-	if (ret != 0)
+	if (ret)
 		goto out;
 
 	resize_hpt_pivot(resize);

From 4ed11aeefda439c76ddae3ceebcfa4fad111f149 Mon Sep 17 00:00:00 2001
From: Serhii Popovych <spopovyc@redhat.com>
Date: Mon, 4 Dec 2017 09:36:42 -0500
Subject: [PATCH 051/808] KVM: PPC: Book3S HV: Fix use after free in case of
 multiple resize requests

When serving multiple resize requests following could happen:

    CPU0                                    CPU1
    ----                                    ----
    kvm_vm_ioctl_resize_hpt_prepare(1);
      -> schedule_work()
                                            /* system_rq might be busy: delay */
    kvm_vm_ioctl_resize_hpt_prepare(2);
      mutex_lock();
      if (resize) {
         ...
         release_hpt_resize();
      }
      ...                                   resize_hpt_prepare_work()
      -> schedule_work()                    {
      mutex_unlock()                           /* resize->kvm could be wrong */
                                               struct kvm *kvm = resize->kvm;

                                               mutex_lock(&kvm->lock);   <<<< UAF
                                               ...
                                            }

i.e. a second resize request with different order could be started by
kvm_vm_ioctl_resize_hpt_prepare(), causing the previous request to be
free()d when there's still an active worker thread which will try to
access it.  This leads to a use after free in point marked with UAF on
the diagram above.

To prevent this from happening, instead of unconditionally releasing a
pre-existing resize structure from the prepare ioctl(), we check if
the existing structure has an in-progress worker.  We do that by
checking if the resize->error == -EBUSY, which is safe because the
resize->error field is protected by the kvm->lock.  If there is an
active worker, instead of releasing, we mark the structure as stale by
unlinking it from kvm_struct.

In the worker thread we check for a stale structure (with kvm->lock
held), and in that case abort, releasing the stale structure ourself.
We make the check both before and the actual allocation.  Strictly,
only the check afterwards is needed, the check before is an
optimization: if the structure happens to become stale before the
worker thread is dispatched, rather than during the allocation, it
means we can avoid allocating then immediately freeing a potentially
substantial amount of memory.

This fixes following or similar host kernel crash message:

[  635.277361] Unable to handle kernel paging request for data at address 0x00000000
[  635.277438] Faulting instruction address: 0xc00000000052f568
[  635.277446] Oops: Kernel access of bad area, sig: 11 [#1]
[  635.277451] SMP NR_CPUS=2048 NUMA PowerNV
[  635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE
nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4
nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc
ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs
lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi
scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp
ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T)
ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal
i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs
libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T)
tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt
pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod
[  635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G
------------ T 3.10.0.bz1510771+ #1
[  635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv]
[  635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000
[  635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0
[  635.278988] REGS: c0000007e91837f0 TRAP: 0300   Tainted: G
------------ T  (3.10.0.bz1510771+)
[  635.279077] MSR: 9000000100009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24002022  XER:
00000000
[  635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10
GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000
GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0
GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40
GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000
GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7
GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000
GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10
[  635.280149] NIP [c00000000052f568] __list_add+0x38/0x110
[  635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
[  635.280253] Call Trace:
[  635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
[  635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70
[  635.280426] [c0000007e9183ba0] [d00000000d24da04]
resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv]
[  635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680
[  635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520
[  635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100
[  635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4
[  635.280814] Instruction dump:
[  635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78
f8010010
[  635.281099] f821ff81 e8a50008 7fa52040 40de00b8 <e8be0000> 7fbd2840 40de008c
7fbff040
[  635.281324] ---[ end trace b628b73449719b9d ]---

Cc: stable@vger.kernel.org # v4.10+
Fixes: b5baa6877315 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation")
Signed-off-by: Serhii Popovych <spopovyc@redhat.com>
[dwg: Replaced BUG_ON()s with WARN_ONs() and reworded commit message
 for clarity]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 54 ++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index f5f2c6bf5856..8355398f0bb6 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1419,16 +1419,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
 
 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 {
-	BUG_ON(kvm->arch.resize_hpt != resize);
+	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+		return;
 
 	if (!resize)
 		return;
 
-	if (resize->hpt.virt)
-		kvmppc_free_hpt(&resize->hpt);
+	if (resize->error != -EBUSY) {
+		if (resize->hpt.virt)
+			kvmppc_free_hpt(&resize->hpt);
+		kfree(resize);
+	}
 
-	kvm->arch.resize_hpt = NULL;
-	kfree(resize);
+	if (kvm->arch.resize_hpt == resize)
+		kvm->arch.resize_hpt = NULL;
 }
 
 static void resize_hpt_prepare_work(struct work_struct *work)
@@ -1437,26 +1441,42 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 						     struct kvm_resize_hpt,
 						     work);
 	struct kvm *kvm = resize->kvm;
-	int err;
+	int err = 0;
 
 	if (WARN_ON(resize->error != -EBUSY))
 		return;
 
-	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
-			 resize->order);
-
-	err = resize_hpt_allocate(resize);
-
-	/* We have strict assumption about -EBUSY
-	 * when preparing for HPT resize.
-	 */
-	if (WARN_ON(err == -EBUSY))
-		err = -EINPROGRESS;
-
 	mutex_lock(&kvm->lock);
 
+	/* Request is still current? */
+	if (kvm->arch.resize_hpt == resize) {
+		/* We may request large allocations here:
+		 * do not sleep with kvm->lock held for a while.
+		 */
+		mutex_unlock(&kvm->lock);
+
+		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+				 resize->order);
+
+		err = resize_hpt_allocate(resize);
+
+		/* We have strict assumption about -EBUSY
+		 * when preparing for HPT resize.
+		 */
+		if (WARN_ON(err == -EBUSY))
+			err = -EINPROGRESS;
+
+		mutex_lock(&kvm->lock);
+		/* It is possible that kvm->arch.resize_hpt != resize
+		 * after we grab kvm->lock again.
+		 */
+	}
+
 	resize->error = err;
 
+	if (kvm->arch.resize_hpt != resize)
+		resize_hpt_release(kvm, resize);
+
 	mutex_unlock(&kvm->lock);
 }
 

From cfe17c9bbe6a673fdafdab179c32b355ed447f66 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 27 Nov 2017 21:15:13 +0900
Subject: [PATCH 052/808] kbuild: move cc-option and cc-disable-warning after
 incl. arch Makefile

Geert reported commit ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before
incl. arch Makefile") broke cross-compilation using a cross-compiler
that supports less compiler options than the host compiler.

For example,

  cc1: error: unrecognized command line option "-Wno-unused-but-set-variable"

This problem happens on architectures that setup CROSS_COMPILE in their
arch/*/Makefile.

Move the cc-option and cc-disable-warning back to the original position,
but keep the Clang target options untouched.

Fixes: ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before incl. arch Makefile")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 Makefile | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index c988e46a53cd..477c4cf01cae 100644
--- a/Makefile
+++ b/Makefile
@@ -484,26 +484,6 @@ CLANG_GCC_TC	:= --gcc-toolchain=$(GCC_TOOLCHAIN)
 endif
 KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
 KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
-KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
-KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
-KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
-# Quiet clang warning: comparison of unsigned expression < 0 is always false
-KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
-# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
-# source of a reference will be _MergedGlobals and not on of the whitelisted names.
-# See modpost pattern 2
-KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
-KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
-KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
-KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
-else
-
-# These warnings generated too much noise in a regular build.
-# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 endif
 
 ifeq ($(config-targets),1)
@@ -716,6 +696,29 @@ ifdef CONFIG_CC_STACKPROTECTOR
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
+ifeq ($(cc-name),clang)
+KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
+KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+# Quiet clang warning: comparison of unsigned expression < 0 is always false
+KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
+# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
+# source of a reference will be _MergedGlobals and not on of the whitelisted names.
+# See modpost pattern 2
+KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
+KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
+KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
+KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
+else
+
+# These warnings generated too much noise in a regular build.
+# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
+endif
+
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS	+= -fno-omit-frame-pointer -fno-optimize-sibling-calls
 else

From c7b92172a61b91936be985cb9bc499a4ebc6489b Mon Sep 17 00:00:00 2001
From: Stefan Potyra <Stefan.Potyra@elektrobit.com>
Date: Wed, 6 Dec 2017 16:03:24 +0100
Subject: [PATCH 053/808] ASoC: rockchip: disable clock on error

Disable the clocks in  rk_spdif_probe when an error occurs after one
of the clocks has been enabled previously.

Found by Linux Driver Verification project (linuxtesting.org).

Fixes: f874b80e1571 ASoC: rockchip: Add rockchip SPDIF transceiver driver
Signed-off-by: Stefan Potyra <Stefan.Potyra@elektrobit.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/rockchip/rockchip_spdif.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/sound/soc/rockchip/rockchip_spdif.c b/sound/soc/rockchip/rockchip_spdif.c
index ee5055d47d13..a89fe9b6463b 100644
--- a/sound/soc/rockchip/rockchip_spdif.c
+++ b/sound/soc/rockchip/rockchip_spdif.c
@@ -322,26 +322,30 @@ static int rk_spdif_probe(struct platform_device *pdev)
 	spdif->mclk = devm_clk_get(&pdev->dev, "mclk");
 	if (IS_ERR(spdif->mclk)) {
 		dev_err(&pdev->dev, "Can't retrieve rk_spdif master clock\n");
-		return PTR_ERR(spdif->mclk);
+		ret = PTR_ERR(spdif->mclk);
+		goto err_disable_hclk;
 	}
 
 	ret = clk_prepare_enable(spdif->mclk);
 	if (ret) {
 		dev_err(spdif->dev, "clock enable failed %d\n", ret);
-		return ret;
+		goto err_disable_clocks;
 	}
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	regs = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(regs))
-		return PTR_ERR(regs);
+	if (IS_ERR(regs)) {
+		ret = PTR_ERR(regs);
+		goto err_disable_clocks;
+	}
 
 	spdif->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "hclk", regs,
 						  &rk_spdif_regmap_config);
 	if (IS_ERR(spdif->regmap)) {
 		dev_err(&pdev->dev,
 			"Failed to initialise managed register map\n");
-		return PTR_ERR(spdif->regmap);
+		ret = PTR_ERR(spdif->regmap);
+		goto err_disable_clocks;
 	}
 
 	spdif->playback_dma_data.addr = res->start + SPDIF_SMPDR;
@@ -373,6 +377,10 @@ static int rk_spdif_probe(struct platform_device *pdev)
 
 err_pm_runtime:
 	pm_runtime_disable(&pdev->dev);
+err_disable_clocks:
+	clk_disable_unprepare(spdif->mclk);
+err_disable_hclk:
+	clk_disable_unprepare(spdif->hclk);
 
 	return ret;
 }

From e02b03303f13b6a571f01b4d84b69440696d2dde Mon Sep 17 00:00:00 2001
From: Guneshwor Singh <guneshwor.o.singh@intel.com>
Date: Wed, 6 Dec 2017 16:34:04 +0530
Subject: [PATCH 054/808] ASoC: Intel: Skylake: Do not check dev_type for dmic
 link type

Some BIOS have inconsistent dev_type value for DMIC link type.
Since there is only one device type for DMIC link type, remove device
type check if link type is NHLT_LINK_DMIC.

Signed-off-by: Guneshwor Singh <guneshwor.o.singh@intel.com>
Acked-By: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/intel/skylake/skl-nhlt.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/sound/soc/intel/skylake/skl-nhlt.c b/sound/soc/intel/skylake/skl-nhlt.c
index d14c50a60289..3eaac41090ca 100644
--- a/sound/soc/intel/skylake/skl-nhlt.c
+++ b/sound/soc/intel/skylake/skl-nhlt.c
@@ -119,11 +119,16 @@ static bool skl_check_ep_match(struct device *dev, struct nhlt_endpoint *epnt,
 
 	if ((epnt->virtual_bus_id == instance_id) &&
 			(epnt->linktype == link_type) &&
-			(epnt->direction == dirn) &&
-			(epnt->device_type == dev_type))
-		return true;
-	else
-		return false;
+			(epnt->direction == dirn)) {
+		/* do not check dev_type for DMIC link type */
+		if (epnt->linktype == NHLT_LINK_DMIC)
+			return true;
+
+		if (epnt->device_type == dev_type)
+			return true;
+	}
+
+	return false;
 }
 
 struct nhlt_specific_cfg

From fcf38cdf332a81b20a59e3ebaea81f6b316bbe0c Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 5 Dec 2017 22:57:43 -0800
Subject: [PATCH 055/808] kyber: fix another domain token wait queue hang

Commit 8cf466602028 ("kyber: fix hang on domain token wait queue") fixed
a hang caused by leaving wait entries on the domain token wait queue
after the __sbitmap_queue_get() retry succeeded, making that wait entry
a "dud" which won't in turn wake more entries up. However, we can also
get a dud entry if kyber_get_domain_token() fails once but is then
called again and succeeds. This can happen if the hardware queue is
rerun for some other reason, or, more likely, kyber_dispatch_request()
tries the same domain twice.

The fix is to remove our entry from the wait queue whenever we
successfully get a token. The only complication is that we might be on
one of many wait queues in the struct sbitmap_queue, but that's easily
fixed by remembering which wait queue we were put on.

While we're here, only initialize the wait queue entry once instead of
on every wait, and use spin_lock_irq() instead of spin_lock_irqsave(),
since this is always called from process context with irqs enabled.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/kyber-iosched.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b4df317c2916..f95c60774ce8 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
 	unsigned int cur_domain;
 	unsigned int batching;
 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
 
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+			     void *key);
+
 static int rq_sched_domain(const struct request *rq)
 {
 	unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		INIT_LIST_HEAD(&khd->rqs[i]);
+		init_waitqueue_func_entry(&khd->domain_wait[i],
+					  kyber_domain_wake);
+		khd->domain_wait[i].private = hctx;
 		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 		atomic_set(&khd->wait_index[i], 0);
 	}
@@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 	int nr;
 
 	nr = __sbitmap_queue_get(domain_tokens);
-	if (nr >= 0)
-		return nr;
 
 	/*
 	 * If we failed to get a domain token, make sure the hardware queue is
 	 * run when one becomes available. Note that this is serialized on
 	 * khd->lock, but we still need to be careful about the waker.
 	 */
-	if (list_empty_careful(&wait->entry)) {
-		init_waitqueue_func_entry(wait, kyber_domain_wake);
-		wait->private = hctx;
+	if (nr < 0 && list_empty_careful(&wait->entry)) {
 		ws = sbq_wait_ptr(domain_tokens,
 				  &khd->wait_index[sched_domain]);
+		khd->domain_ws[sched_domain] = ws;
 		add_wait_queue(&ws->wait, wait);
 
 		/*
 		 * Try again in case a token was freed before we got on the wait
-		 * queue. The waker may have already removed the entry from the
-		 * wait queue, but list_del_init() is okay with that.
+		 * queue.
 		 */
 		nr = __sbitmap_queue_get(domain_tokens);
-		if (nr >= 0) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&ws->wait.lock, flags);
-			list_del_init(&wait->entry);
-			spin_unlock_irqrestore(&ws->wait.lock, flags);
-		}
 	}
+
+	/*
+	 * If we got a token while we were on the wait queue, remove ourselves
+	 * from the wait queue to ensure that all wake ups make forward
+	 * progress. It's possible that the waker already deleted the entry
+	 * between the !list_empty_careful() check and us grabbing the lock, but
+	 * list_del_init() is okay with that.
+	 */
+	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+		ws = khd->domain_ws[sched_domain];
+		spin_lock_irq(&ws->wait.lock);
+		list_del_init(&wait->entry);
+		spin_unlock_irq(&ws->wait.lock);
+	}
+
 	return nr;
 }
 

From b638823a7bbd251d442042b0e9522100bdaa5b66 Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Tue, 5 Dec 2017 12:34:56 +0000
Subject: [PATCH 056/808] ARM: davinci: Use platform_device_register_full() to
 create pdev for dm365's eDMA

Convert the DM365 EDMA platform device creation to use
struct platform_device_info XXXXXX __initconst and
platform_device_register_full()

This will allow us to specify the dma_mask for the device
in an upcoming patch. Without this, EDMA on DM365 refuses
to probe.

Fixes: 7ab388e85faa ("ARM: davinci: Use platform_device_register_full() to create pdev for eDMA")
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 8be04ec95adf..9bd17bc77b5c 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -925,12 +925,13 @@ static struct resource edma_resources[] = {
 	/* not using TC*_ERR */
 };
 
-static struct platform_device dm365_edma_device = {
-	.name			= "edma",
-	.id			= 0,
-	.dev.platform_data	= &dm365_edma_pdata,
-	.num_resources		= ARRAY_SIZE(edma_resources),
-	.resource		= edma_resources,
+static const struct platform_device_info dm365_edma_device __initconst = {
+	.name		= "edma",
+	.id		= 0,
+	.res		= edma_resources,
+	.num_res	= ARRAY_SIZE(edma_resources),
+	.data		= &dm365_edma_pdata,
+	.size_data	= sizeof(dm365_edma_pdata),
 };
 
 static struct resource dm365_asp_resources[] = {
@@ -1428,13 +1429,18 @@ int __init dm365_init_video(struct vpfe_config *vpfe_cfg,
 
 static int __init dm365_init_devices(void)
 {
+	struct platform_device *edma_pdev;
 	int ret = 0;
 
 	if (!cpu_is_davinci_dm365())
 		return 0;
 
 	davinci_cfg_reg(DM365_INT_EDMA_CC);
-	platform_device_register(&dm365_edma_device);
+	edma_pdev = platform_device_register_full(&dm365_edma_device);
+	if (IS_ERR(edma_pdev)) {
+		pr_warn("%s: Failed to register eDMA\n", __func__);
+		return PTR_ERR(edma_pdev);
+	}
 
 	platform_device_register(&dm365_mdio_device);
 	platform_device_register(&dm365_emac_device);

From 621f96bcb49412010876a1e6e006f748b91d9e75 Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Tue, 5 Dec 2017 12:34:57 +0000
Subject: [PATCH 057/808] ARM: davinci: Add dma_mask to dm365's eDMA device

Add dma_mask to dm365's EDMA device.

Without a valid dma_mask, EDMA on DM365 refuses to
probe.

Fixes: cef5b0da4019 ("ARM: davinci: Add dma_mask to eDMA devices")
Reviewed-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 9bd17bc77b5c..103316f01a22 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -928,6 +928,7 @@ static struct resource edma_resources[] = {
 static const struct platform_device_info dm365_edma_device __initconst = {
 	.name		= "edma",
 	.id		= 0,
+	.dma_mask	= DMA_BIT_MASK(32),
 	.res		= edma_resources,
 	.num_res	= ARRAY_SIZE(edma_resources),
 	.data		= &dm365_edma_pdata,

From c5a88cd2e1c508868922bafa0a5c3365986b98e5 Mon Sep 17 00:00:00 2001
From: David Lechner <david@lechnology.com>
Date: Sun, 3 Dec 2017 16:04:53 -0600
Subject: [PATCH 058/808] ARM: dts: da850-lego-ev3: Fix battery voltage gpio

This fixes the battery voltage monitoring gpio-hog settings.

When the gpio is low, it turns off the battery voltage to the ADC chip.
However, this needs to be on all of the time so that we can monitor
battery voltage.

Also, there was a typo that prevented pinmuxing from working correctly.

Signed-off-by: David Lechner <david@lechnology.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/boot/dts/da850-lego-ev3.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/da850-lego-ev3.dts b/arch/arm/boot/dts/da850-lego-ev3.dts
index 413dbd5d9f64..81942ae83e1f 100644
--- a/arch/arm/boot/dts/da850-lego-ev3.dts
+++ b/arch/arm/boot/dts/da850-lego-ev3.dts
@@ -178,7 +178,7 @@
 	 */
 	battery {
 		pinctrl-names = "default";
-		pintctrl-0 = <&battery_pins>;
+		pinctrl-0 = <&battery_pins>;
 		compatible = "lego,ev3-battery";
 		io-channels = <&adc 4>, <&adc 3>;
 		io-channel-names = "voltage", "current";
@@ -392,7 +392,7 @@
 	batt_volt_en {
 		gpio-hog;
 		gpios = <6 GPIO_ACTIVE_HIGH>;
-		output-low;
+		output-high;
 	};
 };
 

From 7cb4774e2d3282d29edd00762167876a27cc7d2a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 6 Dec 2017 17:54:38 +0100
Subject: [PATCH 059/808] HID: core: lower log level for unknown main item tags
 to warnings

Given all the effort distros have done with splash-screens to give
users a nice clean boot experience, we really want dmesg --level=err
to not print anything unless there is a real problem with either the
hardware or the kernel. Buggy HID descriptors unfortunately happen
all too often, so lower the log level to warning keep the console
clear of error messages such as:

[  441.079664] apple 0005:05AC:0239.0003: unknown main item tag 0x0

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index f3fcb836a1f9..0c3f608131cf 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -551,7 +551,7 @@ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item)
 		ret = hid_add_field(parser, HID_FEATURE_REPORT, data);
 		break;
 	default:
-		hid_err(parser->device, "unknown main item tag 0x%x\n", item->tag);
+		hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag);
 		ret = 0;
 	}
 

From b860b419d970f286294fbfb2b21a4028fd8ee442 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 6 Dec 2017 12:21:35 +0100
Subject: [PATCH 060/808] mfd: Fix RTS5227 (and others) powermanagement

Commit 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power
saving") adds powersaving support for device-ids 5249 524a and 525a.

But as a side effect it breaks ASPM support for all the other device-ids,
causing e.g. the Haswell CPU on a Lenovo T440s to not go into a higher
c-state then PC3, while previously it would go to PC7, causing the
machine to idle at 7.4W instead of 6.6W!

The problem here is the new option.dev_aspm_mode field, which only gets
explicitly initialized in the new code for the device-ids 5249 524a and
525a. Leaving the dev_aspm_mode 0 for the other device-ids.

The default dev_aspm_mode 0 is mapped to DEV_ASPM_DISABLE, but the
old behavior of calling rtsx_pci_enable_aspm() when idle and
rtsx_pci_disable_aspm() when busy happens when dev_aspm_mode ==
DEV_ASPM_DYNAMIC.

This commit changes the enum so that 0 = DEV_ASPM_DYNAMIC matching the
old default behavior, fixing the pm regression with the other device-ids.

Fixes: 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving")
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Rui Feng <rui_feng@realsil.com.cn>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rtsx_pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h
index a2a1318a3d0c..c3d3f04d8cc6 100644
--- a/include/linux/mfd/rtsx_pci.h
+++ b/include/linux/mfd/rtsx_pci.h
@@ -915,10 +915,10 @@ enum PDEV_STAT  {PDEV_STAT_IDLE, PDEV_STAT_RUN};
 #define LTR_L1SS_PWR_GATE_CHECK_CARD_EN	BIT(6)
 
 enum dev_aspm_mode {
-	DEV_ASPM_DISABLE = 0,
 	DEV_ASPM_DYNAMIC,
 	DEV_ASPM_BACKDOOR,
 	DEV_ASPM_STATIC,
+	DEV_ASPM_DISABLE,
 };
 
 /*

From b458a3490e46dddd5b63f59b458c9b6d2284a63f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 7 Dec 2017 11:09:21 +0100
Subject: [PATCH 061/808] spi: rspi: Do not set SPCR_SPE in
 qspi_set_config_register()

The R-Car Gen2 Hardware User Manual Rev. 2.00 states:

    If the master/slave mode select bit (MSTR) is modified while the SPI
    function enable bit (SPE) is set to 1 (that is, this module is
    enabled), the subsequent operation cannot be guaranteed.

Hence do not set SPCR_SPE when setting SPCR_MSTR, just like the
.set_config_register() implementations for other RSPI variants do.

Note that when booted from QSPI, the boot loader will have set SPCR_MSTR
already, hence usually the bit is never modified by the Linux driver.

Reported-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-rspi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c
index 2ce875764ca6..0835a8d88fb8 100644
--- a/drivers/spi/spi-rspi.c
+++ b/drivers/spi/spi-rspi.c
@@ -377,8 +377,8 @@ static int qspi_set_config_register(struct rspi_data *rspi, int access_size)
 	/* Sets SPCMD */
 	rspi_write16(rspi, rspi->spcmd, RSPI_SPCMD0);
 
-	/* Enables SPI function in master mode */
-	rspi_write8(rspi, SPCR_SPE | SPCR_MSTR, RSPI_SPCR);
+	/* Sets RSPI mode */
+	rspi_write8(rspi, SPCR_MSTR, RSPI_SPCR);
 
 	return 0;
 }

From c810daba0ab5226084a56893a789af427a801146 Mon Sep 17 00:00:00 2001
From: Takuo Koguchi <takuo.koguchi@gmail.com>
Date: Thu, 7 Dec 2017 16:20:14 +0900
Subject: [PATCH 062/808] spi: sun4i: disable clocks in the remove function

mclk and hclk need to be disabled. Since pm_runtime_disable does
not disable the clocks, use pm_runtime_force_suspend instead.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Takuo Koguchi <takuo.koguchi.sw@hitachi.com>
Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-sun4i.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c
index c5cd635c28f3..41410031f8e9 100644
--- a/drivers/spi/spi-sun4i.c
+++ b/drivers/spi/spi-sun4i.c
@@ -525,7 +525,7 @@ err_free_master:
 
 static int sun4i_spi_remove(struct platform_device *pdev)
 {
-	pm_runtime_disable(&pdev->dev);
+	pm_runtime_force_suspend(&pdev->dev);
 
 	return 0;
 }

From 866f7ed7d67936dcdbcddc111c8af878c918fe7c Mon Sep 17 00:00:00 2001
From: Jussi Laako <jussi@sonarnerd.net>
Date: Thu, 7 Dec 2017 12:58:33 +0200
Subject: [PATCH 063/808] ALSA: usb-audio: Add native DSD support for Esoteric
 D-05X

Adds VID:PID of Esoteric D-05X to the TEAC device id's.
Renames the is_teac_50X_dac() function to is_teac_dsd_dac() to cover
broader device family from the same corporation sharing the same USB
audio implementation.

Signed-off-by: Jussi Laako <jussi@sonarnerd.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 77eecaa4db1f..a66ef5777887 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1166,10 +1166,11 @@ static bool is_marantz_denon_dac(unsigned int id)
 /* TEAC UD-501/UD-503/NT-503 USB DACs need a vendor cmd to switch
  * between PCM/DOP and native DSD mode
  */
-static bool is_teac_50X_dac(unsigned int id)
+static bool is_teac_dsd_dac(unsigned int id)
 {
 	switch (id) {
 	case USB_ID(0x0644, 0x8043): /* TEAC UD-501/UD-503/NT-503 */
+	case USB_ID(0x0644, 0x8044): /* Esoteric D-05X */
 		return true;
 	}
 	return false;
@@ -1202,7 +1203,7 @@ int snd_usb_select_mode_quirk(struct snd_usb_substream *subs,
 			break;
 		}
 		mdelay(20);
-	} else if (is_teac_50X_dac(subs->stream->chip->usb_id)) {
+	} else if (is_teac_dsd_dac(subs->stream->chip->usb_id)) {
 		/* Vendor mode switch cmd is required. */
 		switch (fmt->altsetting) {
 		case 3: /* DSD mode (DSD_U32) requested */
@@ -1392,7 +1393,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
 	}
 
 	/* TEAC devices with USB DAC functionality */
-	if (is_teac_50X_dac(chip->usb_id)) {
+	if (is_teac_dsd_dac(chip->usb_id)) {
 		if (fp->altsetting == 3)
 			return SNDRV_PCM_FMTBIT_DSD_U32_BE;
 	}

From 2b4584d00a6bc02b63ab3c7213060d41a74bdff1 Mon Sep 17 00:00:00 2001
From: Guneshwor Singh <guneshwor.o.singh@intel.com>
Date: Thu, 7 Dec 2017 18:06:20 +0530
Subject: [PATCH 064/808] ALSA: hda - Add vendor id for Cannonlake HDMI codec

Cannonlake HDMI codec has the same nid as Geminilake. This adds the
codec entry for it.

Signed-off-by: Guneshwor Singh <guneshwor.o.singh@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_hdmi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index c19c81d230bd..b4f1b6e88305 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -55,10 +55,11 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't restrict PCM parameters per ELD info");
 #define is_kabylake(codec) ((codec)->core.vendor_id == 0x8086280b)
 #define is_geminilake(codec) (((codec)->core.vendor_id == 0x8086280d) || \
 				((codec)->core.vendor_id == 0x80862800))
+#define is_cannonlake(codec) ((codec)->core.vendor_id == 0x8086280c)
 #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \
 				|| is_skylake(codec) || is_broxton(codec) \
-				|| is_kabylake(codec)) || is_geminilake(codec)
-
+				|| is_kabylake(codec)) || is_geminilake(codec) \
+				|| is_cannonlake(codec)
 #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882)
 #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883)
 #define is_valleyview_plus(codec) (is_valleyview(codec) || is_cherryview(codec))
@@ -3841,6 +3842,7 @@ HDA_CODEC_ENTRY(0x80862808, "Broadwell HDMI",	patch_i915_hsw_hdmi),
 HDA_CODEC_ENTRY(0x80862809, "Skylake HDMI",	patch_i915_hsw_hdmi),
 HDA_CODEC_ENTRY(0x8086280a, "Broxton HDMI",	patch_i915_hsw_hdmi),
 HDA_CODEC_ENTRY(0x8086280b, "Kabylake HDMI",	patch_i915_hsw_hdmi),
+HDA_CODEC_ENTRY(0x8086280c, "Cannonlake HDMI",	patch_i915_glk_hdmi),
 HDA_CODEC_ENTRY(0x8086280d, "Geminilake HDMI",	patch_i915_glk_hdmi),
 HDA_CODEC_ENTRY(0x80862800, "Geminilake HDMI",	patch_i915_glk_hdmi),
 HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI",	patch_generic_hdmi),

From 75bf50f4aaa1c78d769d854ab3d975884909e4fb Mon Sep 17 00:00:00 2001
From: Antony Antony <antony@phenome.org>
Date: Thu, 7 Dec 2017 21:54:27 +0100
Subject: [PATCH 065/808] xfrm: fix xfrm_do_migrate() with AEAD e.g(AES-GCM)

copy geniv when cloning the xfrm state.

x->geniv was not copied to the new state and migration would fail.

xfrm_do_migrate
  ..
  xfrm_state_clone()
   ..
   ..
   esp_init_aead()
   crypto_alloc_aead()
    crypto_alloc_tfm()
     crypto_find_alg() return EAGAIN and failed

Signed-off-by: Antony Antony <antony@phenome.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1f5cee2269af..88d0a563e141 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1344,6 +1344,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 
 	if (orig->aead) {
 		x->aead = xfrm_algo_aead_clone(orig->aead);
+		x->geniv = orig->geniv;
 		if (!x->aead)
 			goto error;
 	}

From 732706afe1cc46ef48493b3d2b69c98f36314ae4 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 8 Dec 2017 08:07:25 +0100
Subject: [PATCH 066/808] xfrm: Fix stack-out-of-bounds with misconfigured
 transport mode policies.

On policies with a transport mode template, we pass the addresses
from the flowi to xfrm_state_find(), assuming that the IP addresses
(and address family) don't change during transformation.

Unfortunately our policy template validation is not strict enough.
It is possible to configure policies with transport mode template
where the address family of the template does not match the selectors
address family. This lead to stack-out-of-bound reads because
we compare arddesses of the wrong family. Fix this by refusing
such a configuration, address family can not change on transport
mode.

We use the assumption that, on transport mode, the first templates
address family must match the address family of the policy selector.
Subsequent transport mode templates must mach the address family of
the previous template.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ff58c37469d6..bdb48e5dba04 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1419,11 +1419,14 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 
 static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 {
+	u16 prev_family;
 	int i;
 
 	if (nr > XFRM_MAX_DEPTH)
 		return -EINVAL;
 
+	prev_family = family;
+
 	for (i = 0; i < nr; i++) {
 		/* We never validated the ut->family value, so many
 		 * applications simply leave it at zero.  The check was
@@ -1435,6 +1438,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 		if (!ut[i].family)
 			ut[i].family = family;
 
+		if ((ut[i].mode == XFRM_MODE_TRANSPORT) &&
+		    (ut[i].family != prev_family))
+			return -EINVAL;
+
+		prev_family = ut[i].family;
+
 		switch (ut[i].family) {
 		case AF_INET:
 			break;

From 451df7d110b82998c04a80d0de0f1e79aaa7792a Mon Sep 17 00:00:00 2001
From: Alejandro Mery <amery@hanoverdisplays.com>
Date: Fri, 8 Dec 2017 10:35:58 +0000
Subject: [PATCH 067/808] ARM: davinci: fix mmc entries in dm365's
 dma_slave_map

fix mmc entries in dm365's dma_slave_map to match the actual device names

Fixes: 0c750e1fe481 ("ARM: davinci: dm365: Add dma_slave_map to edma")
Signed-off-by: Alejandro Mery <amery@hanoverdisplays.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 arch/arm/mach-davinci/dm365.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c
index 103316f01a22..5ace9380626a 100644
--- a/arch/arm/mach-davinci/dm365.c
+++ b/arch/arm/mach-davinci/dm365.c
@@ -868,10 +868,10 @@ static const struct dma_slave_map dm365_edma_map[] = {
 	{ "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 17) },
 	{ "spi_davinci.3", "tx", EDMA_FILTER_PARAM(0, 18) },
 	{ "spi_davinci.3", "rx", EDMA_FILTER_PARAM(0, 19) },
-	{ "dm6441-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) },
-	{ "dm6441-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) },
-	{ "dm6441-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) },
-	{ "dm6441-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) },
+	{ "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) },
+	{ "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) },
+	{ "da830-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) },
+	{ "da830-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) },
 };
 
 static struct edma_soc_info dm365_edma_pdata = {

From 50dd2ea8ef67a1617e0c0658bcbec4b9fb03b936 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings@codethink.co.uk>
Date: Fri, 8 Dec 2017 16:15:20 +0000
Subject: [PATCH 068/808] ASoC: wm_adsp: Fix validation of firmware and coeff
 lengths

The checks for whether another region/block header could be present
are subtracting the size from the current offset.  Obviously we should
instead subtract the offset from the size.

The checks for whether the region/block data fit in the file are
adding the data size to the current offset and header size, without
checking for integer overflow.  Rearrange these so that overflow is
impossible.

Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Tested-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@vger.kernel.org
---
 sound/soc/codecs/wm_adsp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
index 65c059b5ffd7..66e32f5d2917 100644
--- a/sound/soc/codecs/wm_adsp.c
+++ b/sound/soc/codecs/wm_adsp.c
@@ -1733,7 +1733,7 @@ static int wm_adsp_load(struct wm_adsp *dsp)
 		 le64_to_cpu(footer->timestamp));
 
 	while (pos < firmware->size &&
-	       pos - firmware->size > sizeof(*region)) {
+	       sizeof(*region) < firmware->size - pos) {
 		region = (void *)&(firmware->data[pos]);
 		region_name = "Unknown";
 		reg = 0;
@@ -1782,8 +1782,8 @@ static int wm_adsp_load(struct wm_adsp *dsp)
 			 regions, le32_to_cpu(region->len), offset,
 			 region_name);
 
-		if ((pos + le32_to_cpu(region->len) + sizeof(*region)) >
-		    firmware->size) {
+		if (le32_to_cpu(region->len) >
+		    firmware->size - pos - sizeof(*region)) {
 			adsp_err(dsp,
 				 "%s.%d: %s region len %d bytes exceeds file length %zu\n",
 				 file, regions, region_name,
@@ -2253,7 +2253,7 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
 
 	blocks = 0;
 	while (pos < firmware->size &&
-	       pos - firmware->size > sizeof(*blk)) {
+	       sizeof(*blk) < firmware->size - pos) {
 		blk = (void *)(&firmware->data[pos]);
 
 		type = le16_to_cpu(blk->type);
@@ -2327,8 +2327,8 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
 		}
 
 		if (reg) {
-			if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) >
-			    firmware->size) {
+			if (le32_to_cpu(blk->len) >
+			    firmware->size - pos - sizeof(*blk)) {
 				adsp_err(dsp,
 					 "%s.%d: %s region len %d bytes exceeds file length %zu\n",
 					 file, blocks, region_name,

From 0f0be40ba59c2d5fdfea48e3ff93f6165d616440 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Date: Fri, 8 Dec 2017 15:18:53 +0100
Subject: [PATCH 069/808] ASoC: atmel-classd: select correct Kconfig symbol

SND_ATMEL_SOC_CLASSD selects SND_ATMEL_SOC_DMA but the driver itself
handles its own DMA operations and doesn't need anything from
atmel-pcm-dma.c or atmel_ssc_dai.c.

Replace SND_ATMEL_SOC_DMA by SND_SOC_GENERIC_DMAENGINE_PCM which is the
only one actually required.

This may end up in a configuration leading to a link error:

sound/soc/atmel/atmel_ssc_dai.o: In function `atmel_ssc_set_audio':
atmel_ssc_dai.c:(.text+0x79c): undefined reference to `atmel_pcm_dma_platform_register'
atmel_ssc_dai.c:(.text+0x79c): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `atmel_pcm_dma_platform_register'
sound/soc/atmel/atmel_ssc_dai.o: In function `atmel_ssc_put_audio':
atmel_ssc_dai.c:(.text+0xf24): undefined reference to `atmel_pcm_dma_platform_unregister'
atmel_ssc_dai.c:(.text+0xf24): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `atmel_pcm_dma_platform_unregister'

Tested on sama5d2 xplained with the following configuration
where nothing selects SND_ATMEL_SOC_DMA:

CONFIG_SND_ATMEL_SOC=y
CONFIG_SND_ATMEL_SOC_CLASSD=y

Reported-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Fixes: e0a25b6d1862 ("ASoC: atmel-classd: add the Audio Class D Amplifier")
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/atmel/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig
index 4a56f3dfba51..dcee145dd179 100644
--- a/sound/soc/atmel/Kconfig
+++ b/sound/soc/atmel/Kconfig
@@ -64,7 +64,7 @@ config SND_AT91_SOC_SAM9X5_WM8731
 config SND_ATMEL_SOC_CLASSD
 	tristate "Atmel ASoC driver for boards using CLASSD"
 	depends on ARCH_AT91 || COMPILE_TEST
-	select SND_ATMEL_SOC_DMA
+	select SND_SOC_GENERIC_DMAENGINE_PCM
 	select REGMAP_MMIO
 	help
 	  Say Y if you want to add support for Atmel ASoC driver for boards using

From 4362934a75ff2a399fd0bcd75937907115770020 Mon Sep 17 00:00:00 2001
From: Naveen Manohar <naveen.m@intel.com>
Date: Fri, 8 Dec 2017 09:30:18 +0530
Subject: [PATCH 070/808] ASoC: Intel: Change kern log level to avoid unwanted
 messages

patch suppresses the warning message "control load not supported"
as this is a debug information to help debug issues in topology.

Signed-off-by: Naveen Manohar <naveen.m@intel.com>
Acked-By: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/intel/skylake/skl-topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c
index a072bcf209d2..81923da18ac2 100644
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -2908,7 +2908,7 @@ static int skl_tplg_control_load(struct snd_soc_component *cmpnt,
 		break;
 
 	default:
-		dev_warn(bus->dev, "Control load not supported %d:%d:%d\n",
+		dev_dbg(bus->dev, "Control load not supported %d:%d:%d\n",
 			hdr->ops.get, hdr->ops.put, hdr->ops.info);
 		break;
 	}

From 33f801366bdf3f8b67dfe325b84f4051a090d01e Mon Sep 17 00:00:00 2001
From: Jiada Wang <jiada_wang@mentor.com>
Date: Thu, 7 Dec 2017 22:15:38 -0800
Subject: [PATCH 071/808] ASoC: rsnd: ssi: fix race condition in
 rsnd_ssi_pointer_update

Currently there is race condition between set of byte_pos and wrap
it around when new buffer starts. If .pointer is called in-between
it will result in inconsistent pointer position be returned
from .pointer callback.

This patch increments buffer pointer atomically to avoid this issue.

Signed-off-by: Jiada Wang <jiada_wang@mentor.com>
Reviewed-by: Takashi Sakamoto <takashi.sakamoto@miraclelinux.com>
Acked-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sh/rcar/ssi.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c
index fece1e5f582f..cbf3bf312d23 100644
--- a/sound/soc/sh/rcar/ssi.c
+++ b/sound/soc/sh/rcar/ssi.c
@@ -446,25 +446,29 @@ static bool rsnd_ssi_pointer_update(struct rsnd_mod *mod,
 				    int byte)
 {
 	struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod);
+	bool ret = false;
+	int byte_pos;
 
-	ssi->byte_pos += byte;
+	byte_pos = ssi->byte_pos + byte;
 
-	if (ssi->byte_pos >= ssi->next_period_byte) {
+	if (byte_pos >= ssi->next_period_byte) {
 		struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io);
 
 		ssi->period_pos++;
 		ssi->next_period_byte += ssi->byte_per_period;
 
 		if (ssi->period_pos >= runtime->periods) {
-			ssi->byte_pos = 0;
+			byte_pos = 0;
 			ssi->period_pos = 0;
 			ssi->next_period_byte = ssi->byte_per_period;
 		}
 
-		return true;
+		ret = true;
 	}
 
-	return false;
+	WRITE_ONCE(ssi->byte_pos, byte_pos);
+
+	return ret;
 }
 
 /*
@@ -838,7 +842,7 @@ static int rsnd_ssi_pointer(struct rsnd_mod *mod,
 	struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod);
 	struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io);
 
-	*pointer = bytes_to_frames(runtime, ssi->byte_pos);
+	*pointer = bytes_to_frames(runtime, READ_ONCE(ssi->byte_pos));
 
 	return 0;
 }

From f5f00e7dcc4161f07b76ff1a854e8b1ea7a1ed41 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhang <xiaolin.zhang@intel.com>
Date: Tue, 5 Dec 2017 14:45:32 +0800
Subject: [PATCH 072/808] drm/i915/gvt: Fix pipe A enable as default for vgpu

observed igt drv_module_reload test case failure on 4.15.0
rc2 kernel with panic due to no active pipe available.

the gpu will reset during unload/load and make pipe config reg
lost which can cause kernel panic issue happen.

this patch is to move pipe enabling to emulate_mointor_status_chagne
to handle vgpu reset case as well.

Fixes: 7e6059020894 ("drm/i915/gvt: enabled pipe A default on creating vgpu")
Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/display.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c
index 355120865efd..309f3fa6794a 100644
--- a/drivers/gpu/drm/i915/gvt/display.c
+++ b/drivers/gpu/drm/i915/gvt/display.c
@@ -266,6 +266,8 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
 	/* Clear host CRT status, so guest couldn't detect this host CRT. */
 	if (IS_BROADWELL(dev_priv))
 		vgpu_vreg(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK;
+
+	vgpu_vreg(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE;
 }
 
 static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num)
@@ -282,7 +284,6 @@ static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num)
 static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num,
 				    int type, unsigned int resolution)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
 	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
 
 	if (WARN_ON(resolution >= GVT_EDID_NUM))
@@ -308,7 +309,7 @@ static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num,
 	port->type = type;
 
 	emulate_monitor_status_change(vgpu);
-	vgpu_vreg(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE;
+
 	return 0;
 }
 

From 2b4f27c36bcd46e820ddb9a8e6fe6a63fa4250b8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 29 Nov 2017 01:18:57 -0800
Subject: [PATCH 073/808] crypto: skcipher - set walk.iv for zero-length inputs

All the ChaCha20 algorithms as well as the ARM bit-sliced AES-XTS
algorithms call skcipher_walk_virt(), then access the IV (walk.iv)
before checking whether any bytes need to be processed (walk.nbytes).

But if the input is empty, then skcipher_walk_virt() doesn't set the IV,
and the algorithms crash trying to use the uninitialized IV pointer.

Fix it by setting the IV earlier in skcipher_walk_virt().  Also fix it
for the AEAD walk functions.

This isn't a perfect solution because we can't actually align the IV to
->cra_alignmask unless there are bytes to process, for one because the
temporary buffer for the aligned IV is freed by skcipher_walk_done(),
which is only called when there are bytes to process.  Thus, algorithms
that require aligned IVs will still need to avoid accessing the IV when
walk.nbytes == 0.  Still, many algorithms/architectures are fine with
IVs having any alignment, and even for those that aren't, a misaligned
pointer bug is much less severe than an uninitialized pointer bug.

This change also matches the behavior of the older blkcipher_walk API.

Fixes: 0cabf2af6f5a ("crypto: skcipher - Fix crash on zero-length input")
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/skcipher.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 778e0ff42bfa..11af5fd6a443 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -449,6 +449,8 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 
 	walk->total = req->cryptlen;
 	walk->nbytes = 0;
+	walk->iv = req->iv;
+	walk->oiv = req->iv;
 
 	if (unlikely(!walk->total))
 		return 0;
@@ -456,9 +458,6 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 	scatterwalk_start(&walk->in, req->src);
 	scatterwalk_start(&walk->out, req->dst);
 
-	walk->iv = req->iv;
-	walk->oiv = req->iv;
-
 	walk->flags &= ~SKCIPHER_WALK_SLEEP;
 	walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
 		       SKCIPHER_WALK_SLEEP : 0;
@@ -510,6 +509,8 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
 	int err;
 
 	walk->nbytes = 0;
+	walk->iv = req->iv;
+	walk->oiv = req->iv;
 
 	if (unlikely(!walk->total))
 		return 0;
@@ -525,9 +526,6 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
 	scatterwalk_done(&walk->in, 0, walk->total);
 	scatterwalk_done(&walk->out, 0, walk->total);
 
-	walk->iv = req->iv;
-	walk->oiv = req->iv;
-
 	if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
 		walk->flags |= SKCIPHER_WALK_SLEEP;
 	else

From 11edb555966ed2c66c533d17c604f9d7e580a829 Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Wed, 29 Nov 2017 12:02:23 +0100
Subject: [PATCH 074/808] crypto: af_alg - wait for data at beginning of
 recvmsg

The wait for data is a non-atomic operation that can sleep and therefore
potentially release the socket lock. The release of the socket lock
allows another thread to modify the context data structure. The waiting
operation for new data therefore must be called at the beginning of
recvmsg. This prevents a race condition where checks of the members of
the context data structure are performed by recvmsg while there is a
potential for modification of these values.

Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory management")
Fixes: d887c52d6ae4 ("crypto: algif_aead - overhaul memory management")
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/af_alg.c         | 6 ------
 crypto/algif_aead.c     | 6 ++++++
 crypto/algif_skcipher.c | 6 ++++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 358749c38894..f1a2caf1b59b 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1137,12 +1137,6 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
 		if (!af_alg_readable(sk))
 			break;
 
-		if (!ctx->used) {
-			err = af_alg_wait_for_data(sk, flags);
-			if (err)
-				return err;
-		}
-
 		seglen = min_t(size_t, (maxsize - len),
 			       msg_data_left(msg));
 
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 805f485ddf1b..c8a32bef208a 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -111,6 +111,12 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
 	size_t usedpages = 0;		/* [in]  RX bufs to be used from user */
 	size_t processed = 0;		/* [in]  TX bufs to be consumed */
 
+	if (!ctx->used) {
+		err = af_alg_wait_for_data(sk, flags);
+		if (err)
+			return err;
+	}
+
 	/*
 	 * Data length provided by caller via sendmsg/sendpage that has not
 	 * yet been processed.
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 30cff827dd8f..6fb595cd63ac 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -72,6 +72,12 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 	int err = 0;
 	size_t len = 0;
 
+	if (!ctx->used) {
+		err = af_alg_wait_for_data(sk, flags);
+		if (err)
+			return err;
+	}
+
 	/* Allocate cipher request for current operation. */
 	areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) +
 				     crypto_skcipher_reqsize(tfm));

From 9abffc6f2efe46c3564c04312e52e07622d40e51 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 30 Nov 2017 13:39:27 +0100
Subject: [PATCH 075/808] crypto: mcryptd - protect the per-CPU queue with a
 lock

mcryptd_enqueue_request() grabs the per-CPU queue struct and protects
access to it with disabled preemption. Then it schedules a worker on the
same CPU. The worker in mcryptd_queue_worker() guards access to the same
per-CPU variable with disabled preemption.

If we take CPU-hotplug into account then it is possible that between
queue_work_on() and the actual invocation of the worker the CPU goes
down and the worker will be scheduled on _another_ CPU. And here the
preempt_disable() protection does not work anymore. The easiest thing is
to add a spin_lock() to guard access to the list.

Another detail: mcryptd_queue_worker() is not processing more than
MCRYPTD_BATCH invocation in a row. If there are still items left, then
it will invoke queue_work() to proceed with more later. *I* would
suggest to simply drop that check because it does not use a system
workqueue and the workqueue is already marked as "CPU_INTENSIVE". And if
preemption is required then the scheduler should do it.
However if queue_work() is used then the work item is marked as CPU
unbound. That means it will try to run on the local CPU but it may run
on another CPU as well. Especially with CONFIG_DEBUG_WQ_FORCE_RR_CPU=y.
Again, the preempt_disable() won't work here but lock which was
introduced will help.
In order to keep work-item on the local CPU (and avoid RR) I changed it
to queue_work_on().

Cc: stable@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/mcryptd.c         | 23 ++++++++++-------------
 include/crypto/mcryptd.h |  1 +
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index 4e6472658852..eca04d3729b3 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -81,6 +81,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue,
 		pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
 		crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
 		INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
+		spin_lock_init(&cpu_queue->q_lock);
 	}
 	return 0;
 }
@@ -104,15 +105,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
 	int cpu, err;
 	struct mcryptd_cpu_queue *cpu_queue;
 
-	cpu = get_cpu();
-	cpu_queue = this_cpu_ptr(queue->cpu_queue);
-	rctx->tag.cpu = cpu;
+	cpu_queue = raw_cpu_ptr(queue->cpu_queue);
+	spin_lock(&cpu_queue->q_lock);
+	cpu = smp_processor_id();
+	rctx->tag.cpu = smp_processor_id();
 
 	err = crypto_enqueue_request(&cpu_queue->queue, request);
 	pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
 		 cpu, cpu_queue, request);
+	spin_unlock(&cpu_queue->q_lock);
 	queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
-	put_cpu();
 
 	return err;
 }
@@ -161,16 +163,11 @@ static void mcryptd_queue_worker(struct work_struct *work)
 	cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
 	i = 0;
 	while (i < MCRYPTD_BATCH || single_task_running()) {
-		/*
-		 * preempt_disable/enable is used to prevent
-		 * being preempted by mcryptd_enqueue_request()
-		 */
-		local_bh_disable();
-		preempt_disable();
+
+		spin_lock_bh(&cpu_queue->q_lock);
 		backlog = crypto_get_backlog(&cpu_queue->queue);
 		req = crypto_dequeue_request(&cpu_queue->queue);
-		preempt_enable();
-		local_bh_enable();
+		spin_unlock_bh(&cpu_queue->q_lock);
 
 		if (!req) {
 			mcryptd_opportunistic_flush();
@@ -185,7 +182,7 @@ static void mcryptd_queue_worker(struct work_struct *work)
 		++i;
 	}
 	if (cpu_queue->queue.qlen)
-		queue_work(kcrypto_wq, &cpu_queue->work);
+		queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
 }
 
 void mcryptd_flusher(struct work_struct *__work)
diff --git a/include/crypto/mcryptd.h b/include/crypto/mcryptd.h
index cceafa01f907..b67404fc4b34 100644
--- a/include/crypto/mcryptd.h
+++ b/include/crypto/mcryptd.h
@@ -27,6 +27,7 @@ static inline struct mcryptd_ahash *__mcryptd_ahash_cast(
 
 struct mcryptd_cpu_queue {
 	struct crypto_queue queue;
+	spinlock_t q_lock;
 	struct work_struct work;
 };
 

From d53c5135792319e095bb126bc43b2ee98586f7fe Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Fri, 8 Dec 2017 11:50:37 +0100
Subject: [PATCH 076/808] crypto: af_alg - fix race accessing cipher request

When invoking an asynchronous cipher operation, the invocation of the
callback may be performed before the subsequent operations in the
initial code path are invoked. The callback deletes the cipher request
data structure which implies that after the invocation of the
asynchronous cipher operation, this data structure must not be accessed
any more.

The setting of the return code size with the request data structure must
therefore be moved before the invocation of the asynchronous cipher
operation.

Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory management")
Fixes: d887c52d6ae4 ("crypto: algif_aead - overhaul memory management")
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Stephan Mueller <smueller@chronox.de>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algif_aead.c     | 10 +++++-----
 crypto/algif_skcipher.c | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index c8a32bef208a..b73db2b27656 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -291,6 +291,10 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
 		/* AIO operation */
 		sock_hold(sk);
 		areq->iocb = msg->msg_iocb;
+
+		/* Remember output size that will be generated. */
+		areq->outlen = outlen;
+
 		aead_request_set_callback(&areq->cra_u.aead_req,
 					  CRYPTO_TFM_REQ_MAY_BACKLOG,
 					  af_alg_async_cb, areq);
@@ -298,12 +302,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
 				 crypto_aead_decrypt(&areq->cra_u.aead_req);
 
 		/* AIO operation in progress */
-		if (err == -EINPROGRESS || err == -EBUSY) {
-			/* Remember output size that will be generated. */
-			areq->outlen = outlen;
-
+		if (err == -EINPROGRESS || err == -EBUSY)
 			return -EIOCBQUEUED;
-		}
 
 		sock_put(sk);
 	} else {
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 6fb595cd63ac..baef9bfccdda 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -125,6 +125,10 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 		/* AIO operation */
 		sock_hold(sk);
 		areq->iocb = msg->msg_iocb;
+
+		/* Remember output size that will be generated. */
+		areq->outlen = len;
+
 		skcipher_request_set_callback(&areq->cra_u.skcipher_req,
 					      CRYPTO_TFM_REQ_MAY_SLEEP,
 					      af_alg_async_cb, areq);
@@ -133,12 +137,8 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
 			crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
 
 		/* AIO operation in progress */
-		if (err == -EINPROGRESS || err == -EBUSY) {
-			/* Remember output size that will be generated. */
-			areq->outlen = len;
-
+		if (err == -EINPROGRESS || err == -EBUSY)
 			return -EIOCBQUEUED;
-		}
 
 		sock_put(sk);
 	} else {

From 4564b187c16327045d87596e8980c65ba7b84c50 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 11 Dec 2017 12:33:47 +0100
Subject: [PATCH 077/808] nl80211: fix nl80211_send_iface() error paths

Evidently I introduced a locking bug in my change here,
the nla_put_failure sometimes needs to unlock. Fix it.

Fixes: 44905265bc15 ("nl80211: don't expose wdev->ssid for most interfaces")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b1ac23ca20c8..213d0c498c97 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2610,7 +2610,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	case NL80211_IFTYPE_AP:
 		if (wdev->ssid_len &&
 		    nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
-			goto nla_put_failure;
+			goto nla_put_failure_locked;
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
@@ -2623,7 +2623,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 		if (!ssid_ie)
 			break;
 		if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2))
-			goto nla_put_failure;
+			goto nla_put_failure_locked;
 		break;
 		}
 	default:
@@ -2635,6 +2635,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	genlmsg_end(msg, hdr);
 	return 0;
 
+ nla_put_failure_locked:
+	wdev_unlock(wdev);
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;

From d2950278d2d04ff5314abeb38d9c59c4e7c0ee53 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Dec 2017 18:23:09 +0100
Subject: [PATCH 078/808] xfrm: put policies when reusing pcpu xdst entry

We need to put the policies when re-using the pcpu xdst entry, else
this leaks the reference.

Fixes: ec30d78c14a813db39a647b6a348b428 ("xfrm: add xdst pcpu cache")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 038ec68f6901..70aa5cb0c659 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1839,6 +1839,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 		   sizeof(struct xfrm_policy *) * num_pols) == 0 &&
 	    xfrm_xdst_can_reuse(xdst, xfrm, err)) {
 		dst_hold(&xdst->u.dst);
+		xfrm_pols_put(pols, num_pols);
 		while (err > 0)
 			xfrm_state_put(xfrm[--err]);
 		return xdst;

From d2b3c353595a855794f8b9df5b5bdbe8deb0c413 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 4 Dec 2017 12:11:02 +0300
Subject: [PATCH 079/808] pinctrl: cherryview: Mask all interrupts on
 Intel_Strago based systems

Guenter Roeck reported an interrupt storm on a prototype system which is
based on Cyan Chromebook. The root cause turned out to be a incorrectly
configured pin that triggers spurious interrupts. This will be fixed in
coreboot but currently we need to prevent the interrupt storm from
happening by masking all interrupts (but not GPEs) on those systems.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=197953
Fixes: bcb48cca23ec ("pinctrl: cherryview: Do not mask all interrupts in probe")
Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net>
Reported-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/intel/pinctrl-cherryview.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c
index bdedb6325c72..4471fd94e1fe 100644
--- a/drivers/pinctrl/intel/pinctrl-cherryview.c
+++ b/drivers/pinctrl/intel/pinctrl-cherryview.c
@@ -1620,6 +1620,22 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq)
 			clear_bit(i, chip->irq.valid_mask);
 	}
 
+	/*
+	 * The same set of machines in chv_no_valid_mask[] have incorrectly
+	 * configured GPIOs that generate spurious interrupts so we use
+	 * this same list to apply another quirk for them.
+	 *
+	 * See also https://bugzilla.kernel.org/show_bug.cgi?id=197953.
+	 */
+	if (!need_valid_mask) {
+		/*
+		 * Mask all interrupts the community is able to generate
+		 * but leave the ones that can only generate GPEs unmasked.
+		 */
+		chv_writel(GENMASK(31, pctrl->community->nirqs),
+			   pctrl->regs + CHV_INTMASK);
+	}
+
 	/* Clear all interrupts */
 	chv_writel(0xffff, pctrl->regs + CHV_INTSTAT);
 

From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001
From: Ma Shimiao <mashimiao.fnst@cn.fujitsu.com>
Date: Tue, 12 Dec 2017 09:43:49 +0800
Subject: [PATCH 080/808] cgroup: avoid copying strings longer than the buffers

cgroup root name and file name have max length limit, we should
avoid copying longer name than that to the name.

tj: minor update to $SUBJ.

Signed-off-by: Ma Shimiao <mashimiao.fnst@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..18d71fbd3923 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
+		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strcpy(root->name, opts->name);
+		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }

From 17278a91e04f858155d54bee5528ba4fbcec6f87 Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Tue, 14 Nov 2017 12:01:20 +0000
Subject: [PATCH 081/808] MIPS: CPS: Fix r1 .set mt assembler warning

MIPS CPS has a build warning on kernels configured for MIPS32R1 or
MIPS64R1, due to the use of .set mt without a prior .set mips{32,64}r2:

arch/mips/kernel/cps-vec.S Assembler messages:
arch/mips/kernel/cps-vec.S:238: Warning: the `mt' extension requires MIPS32 revision 2 or greater

Add .set MIPS_ISA_LEVEL_RAW before .set mt to silence the warning.

Fixes: 245a7868d2f2 ("MIPS: smp-cps: rework core/VPE initialisation")
Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <james.hogan@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/17699/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/cps-vec.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S
index c7ed26029cbb..e68e6e04063a 100644
--- a/arch/mips/kernel/cps-vec.S
+++ b/arch/mips/kernel/cps-vec.S
@@ -235,6 +235,7 @@ LEAF(mips_cps_core_init)
 	has_mt	t0, 3f
 
 	.set	push
+	.set	MIPS_ISA_LEVEL_RAW
 	.set	mt
 
 	/* Only allow 1 TC per VPE to execute... */
@@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes)
 #elif defined(CONFIG_MIPS_MT)
 
 	.set	push
+	.set	MIPS_ISA_LEVEL_RAW
 	.set	mt
 
 	/* If the core doesn't support MT then return */

From a03fe72572c12e98f4173f8a535f32468e48b6ec Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:51:35 +0000
Subject: [PATCH 082/808] MIPS: Factor out NT_PRFPREG regset access helpers

In preparation to fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit
FP registers for FP regset") FCSR access regression factor out
NT_PRFPREG regset access helpers for the non-MSA and the MSA variants
respectively, to avoid having to deal with excessive indentation in the
actual fix.

No functional change, however use `target->thread.fpu.fpr[0]' rather
than `target->thread.fpu.fpr[i]' for FGR holding type size determination
as there's no `i' variable to refer to anymore, and for the factored out
`i' variable declaration use `unsigned int' rather than `unsigned' as
its type, following the common style.

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset")
Cc: James Hogan <james.hogan@mips.com>
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v3.15+
Patchwork: https://patchwork.linux-mips.org/patch/17925/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 108 +++++++++++++++++++++++++++++---------
 1 file changed, 83 insertions(+), 25 deletions(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index efbd8df8b665..62e8ffd9370a 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -419,25 +419,36 @@ static int gpr64_set(struct task_struct *target,
 
 #endif /* CONFIG_64BIT */
 
-static int fpr_get(struct task_struct *target,
-		   const struct user_regset *regset,
-		   unsigned int pos, unsigned int count,
-		   void *kbuf, void __user *ubuf)
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer,
+ * !CONFIG_CPU_HAS_MSA variant.  FP context's general register slots
+ * correspond 1:1 to buffer slots.
+ */
+static int fpr_get_fpa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       void **kbuf, void __user **ubuf)
 {
-	unsigned i;
-	int err;
+	return user_regset_copyout(pos, count, kbuf, ubuf,
+				   &target->thread.fpu,
+				   0, sizeof(elf_fpregset_t));
+}
+
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer,
+ * CONFIG_CPU_HAS_MSA variant.  Only lower 64 bits of FP context's
+ * general register slots are copied to buffer slots.
+ */
+static int fpr_get_msa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       void **kbuf, void __user **ubuf)
+{
+	unsigned int i;
 	u64 fpr_val;
-
-	/* XXX fcr31  */
-
-	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
-		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.fpu,
-					   0, sizeof(elf_fpregset_t));
+	int err;
 
 	for (i = 0; i < NUM_FPU_REGS; i++) {
 		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0);
-		err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+		err = user_regset_copyout(pos, count, kbuf, ubuf,
 					  &fpr_val, i * sizeof(elf_fpreg_t),
 					  (i + 1) * sizeof(elf_fpreg_t));
 		if (err)
@@ -447,27 +458,54 @@ static int fpr_get(struct task_struct *target,
 	return 0;
 }
 
-static int fpr_set(struct task_struct *target,
+/* Copy the floating-point context to the supplied NT_PRFPREG buffer.  */
+static int fpr_get(struct task_struct *target,
 		   const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
-		   const void *kbuf, const void __user *ubuf)
+		   void *kbuf, void __user *ubuf)
 {
-	unsigned i;
 	int err;
-	u64 fpr_val;
 
 	/* XXX fcr31  */
 
-	init_fp_ctx(target);
+	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
+		err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf);
+	else
+		err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf);
 
-	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
-		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.fpu,
-					  0, sizeof(elf_fpregset_t));
+	return err;
+}
+
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context,
+ * !CONFIG_CPU_HAS_MSA variant.   Buffer slots correspond 1:1 to FP
+ * context's general register slots.
+ */
+static int fpr_set_fpa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       const void **kbuf, const void __user **ubuf)
+{
+	return user_regset_copyin(pos, count, kbuf, ubuf,
+				  &target->thread.fpu,
+				  0, sizeof(elf_fpregset_t));
+}
+
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context,
+ * CONFIG_CPU_HAS_MSA variant.  Buffer slots are copied to lower 64
+ * bits only of FP context's general register slots.
+ */
+static int fpr_set_msa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       const void **kbuf, const void __user **ubuf)
+{
+	unsigned int i;
+	u64 fpr_val;
+	int err;
 
 	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
-	for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) {
-		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+	for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) {
+		err = user_regset_copyin(pos, count, kbuf, ubuf,
 					 &fpr_val, i * sizeof(elf_fpreg_t),
 					 (i + 1) * sizeof(elf_fpreg_t));
 		if (err)
@@ -478,6 +516,26 @@ static int fpr_set(struct task_struct *target,
 	return 0;
 }
 
+/* Copy the supplied NT_PRFPREG buffer to the floating-point context.  */
+static int fpr_set(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int err;
+
+	/* XXX fcr31  */
+
+	init_fp_ctx(target);
+
+	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
+		err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf);
+	else
+		err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf);
+
+	return err;
+}
+
 enum mips_regset {
 	REGSET_GPR,
 	REGSET_FPR,

From dc24d0edf33c3e15099688b6bbdf7bdc24bf6e91 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:52:15 +0000
Subject: [PATCH 083/808] MIPS: Guard against any partial write attempt with
 PTRACE_SETREGSET

Complement commit d614fd58a283 ("mips/ptrace: Preserve previous
registers for short regset write") and ensure that no partial register
write attempt is made with PTRACE_SETREGSET, as we do not preinitialize
any temporaries used to hold incoming register data and consequently
random data could be written.

It is the responsibility of the caller, such as `ptrace_regset', to
arrange for writes to span whole registers only, so here we only assert
that it has indeed happened.

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset")
Cc: James Hogan <james.hogan@mips.com>
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v3.15+
Patchwork: https://patchwork.linux-mips.org/patch/17926/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 62e8ffd9370a..7fcadaaf330f 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -516,7 +516,15 @@ static int fpr_set_msa(struct task_struct *target,
 	return 0;
 }
 
-/* Copy the supplied NT_PRFPREG buffer to the floating-point context.  */
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context.
+ *
+ * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0',
+ * which is supposed to have been guaranteed by the kernel before
+ * calling us, e.g. in `ptrace_regset'.  We enforce that requirement,
+ * so that we can safely avoid preinitializing temporaries for
+ * partial register writes.
+ */
 static int fpr_set(struct task_struct *target,
 		   const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
@@ -524,6 +532,8 @@ static int fpr_set(struct task_struct *target,
 {
 	int err;
 
+	BUG_ON(count % sizeof(elf_fpreg_t));
+
 	/* XXX fcr31  */
 
 	init_fp_ctx(target);

From 80b3ffce0196ea50068885d085ff981e4b8396f4 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:53:14 +0000
Subject: [PATCH 084/808] MIPS: Consistently handle buffer counter with
 PTRACE_SETREGSET

Update commit d614fd58a283 ("mips/ptrace: Preserve previous registers
for short regset write") bug and consistently consume all data supplied
to `fpr_set_msa' with the ptrace(2) PTRACE_SETREGSET request, such that
a zero data buffer counter is returned where insufficient data has been
given to fill a whole number of FP general registers.

In reality this is not going to happen, as the caller is supposed to
only supply data covering a whole number of registers and it is verified
in `ptrace_regset' and again asserted in `fpr_set', however structuring
code such that the presence of trailing partial FP general register data
causes `fpr_set_msa' to return with a non-zero data buffer counter makes
it appear that this trailing data will be used if there are subsequent
writes made to FP registers, which is going to be the case with the FCSR
once the missing write to that register has been fixed.

Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write")
Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Cc: James Hogan <james.hogan@mips.com>
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v4.11+
Patchwork: https://patchwork.linux-mips.org/patch/17927/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 7fcadaaf330f..47a01d5f26ea 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -504,7 +504,7 @@ static int fpr_set_msa(struct task_struct *target,
 	int err;
 
 	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
-	for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) {
+	for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) {
 		err = user_regset_copyin(pos, count, kbuf, ubuf,
 					 &fpr_val, i * sizeof(elf_fpreg_t),
 					 (i + 1) * sizeof(elf_fpreg_t));

From be07a6a1188372b6d19a3307ec33211fc9c9439d Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:54:33 +0000
Subject: [PATCH 085/808] MIPS: Fix an FCSR access API regression with
 NT_PRFPREG and MSA

Fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for
FP regset") public API regression, then activated by commit 1db1af84d6df
("MIPS: Basic MSA context switching support"), that caused the FCSR
register not to be read or written for CONFIG_CPU_HAS_MSA kernel
configurations (regardless of actual presence or absence of the MSA
feature in a given processor) with ptrace(2) PTRACE_GETREGSET and
PTRACE_SETREGSET requests nor recorded in core dumps.

This is because with !CONFIG_CPU_HAS_MSA configurations the whole of
`elf_fpregset_t' array is bulk-copied as it is, which includes the FCSR
in one half of the last, 33rd slot, whereas with CONFIG_CPU_HAS_MSA
configurations array elements are copied individually, and then only the
leading 32 FGR slots while the remaining slot is ignored.

Correct the code then such that only FGR slots are copied in the
respective !MSA and MSA helpers an then the FCSR slot is handled
separately in common code.  Use `ptrace_setfcr31' to update the FCSR
too, so that the read-only mask is respected.

Retrieving a correct value of FCSR is important in debugging not only
for the human to be able to get the right interpretation of the
situation, but for correct operation of GDB as well.  This is because
the condition code bits in FSCR are used by GDB to determine the
location to place a breakpoint at when single-stepping through an FPU
branch instruction.  If such a breakpoint is placed incorrectly (i.e.
with the condition reversed), then it will be missed, likely causing the
debuggee to run away from the control of GDB and consequently breaking
the process of investigation.

Fortunately GDB continues using the older PTRACE_GETFPREGS ptrace(2)
request which is unaffected, so the regression only really hits with
post-mortem debug sessions using a core dump file, in which case
execution, and consequently single-stepping through branches is not
possible.  Of course core files created by buggy kernels out there will
have the value of FCSR recorded clobbered, but such core files cannot be
corrected and the person using them simply will have to be aware that
the value of FCSR retrieved is not reliable.

Which also means we can likely get away without defining a replacement
API which would ensure a correct value of FSCR to be retrieved, or none
at all.

This is based on previous work by Alex Smith, extensively rewritten.

Signed-off-by: Alex Smith <alex@alex-smith.me.uk>
Signed-off-by: James Hogan <james.hogan@mips.com>
Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset")
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v3.15+
Patchwork: https://patchwork.linux-mips.org/patch/17928/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 47a01d5f26ea..0a939593ccb7 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -422,7 +422,7 @@ static int gpr64_set(struct task_struct *target,
 /*
  * Copy the floating-point context to the supplied NT_PRFPREG buffer,
  * !CONFIG_CPU_HAS_MSA variant.  FP context's general register slots
- * correspond 1:1 to buffer slots.
+ * correspond 1:1 to buffer slots.  Only general registers are copied.
  */
 static int fpr_get_fpa(struct task_struct *target,
 		       unsigned int *pos, unsigned int *count,
@@ -430,13 +430,14 @@ static int fpr_get_fpa(struct task_struct *target,
 {
 	return user_regset_copyout(pos, count, kbuf, ubuf,
 				   &target->thread.fpu,
-				   0, sizeof(elf_fpregset_t));
+				   0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
 }
 
 /*
  * Copy the floating-point context to the supplied NT_PRFPREG buffer,
  * CONFIG_CPU_HAS_MSA variant.  Only lower 64 bits of FP context's
- * general register slots are copied to buffer slots.
+ * general register slots are copied to buffer slots.  Only general
+ * registers are copied.
  */
 static int fpr_get_msa(struct task_struct *target,
 		       unsigned int *pos, unsigned int *count,
@@ -458,20 +459,29 @@ static int fpr_get_msa(struct task_struct *target,
 	return 0;
 }
 
-/* Copy the floating-point context to the supplied NT_PRFPREG buffer.  */
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer.
+ * Choose the appropriate helper for general registers, and then copy
+ * the FCSR register separately.
+ */
 static int fpr_get(struct task_struct *target,
 		   const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
 		   void *kbuf, void __user *ubuf)
 {
+	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
 	int err;
 
-	/* XXX fcr31  */
-
 	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
 		err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf);
 	else
 		err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf);
+	if (err)
+		return err;
+
+	err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fpu.fcr31,
+				  fcr31_pos, fcr31_pos + sizeof(u32));
 
 	return err;
 }
@@ -479,7 +489,7 @@ static int fpr_get(struct task_struct *target,
 /*
  * Copy the supplied NT_PRFPREG buffer to the floating-point context,
  * !CONFIG_CPU_HAS_MSA variant.   Buffer slots correspond 1:1 to FP
- * context's general register slots.
+ * context's general register slots.  Only general registers are copied.
  */
 static int fpr_set_fpa(struct task_struct *target,
 		       unsigned int *pos, unsigned int *count,
@@ -487,13 +497,14 @@ static int fpr_set_fpa(struct task_struct *target,
 {
 	return user_regset_copyin(pos, count, kbuf, ubuf,
 				  &target->thread.fpu,
-				  0, sizeof(elf_fpregset_t));
+				  0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
 }
 
 /*
  * Copy the supplied NT_PRFPREG buffer to the floating-point context,
  * CONFIG_CPU_HAS_MSA variant.  Buffer slots are copied to lower 64
- * bits only of FP context's general register slots.
+ * bits only of FP context's general register slots.  Only general
+ * registers are copied.
  */
 static int fpr_set_msa(struct task_struct *target,
 		       unsigned int *pos, unsigned int *count,
@@ -518,6 +529,8 @@ static int fpr_set_msa(struct task_struct *target,
 
 /*
  * Copy the supplied NT_PRFPREG buffer to the floating-point context.
+ * Choose the appropriate helper for general registers, and then copy
+ * the FCSR register separately.
  *
  * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0',
  * which is supposed to have been guaranteed by the kernel before
@@ -530,18 +543,30 @@ static int fpr_set(struct task_struct *target,
 		   unsigned int pos, unsigned int count,
 		   const void *kbuf, const void __user *ubuf)
 {
+	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
+	u32 fcr31;
 	int err;
 
 	BUG_ON(count % sizeof(elf_fpreg_t));
 
-	/* XXX fcr31  */
-
 	init_fp_ctx(target);
 
 	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
 		err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf);
 	else
 		err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf);
+	if (err)
+		return err;
+
+	if (count > 0) {
+		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					 &fcr31,
+					 fcr31_pos, fcr31_pos + sizeof(u32));
+		if (err)
+			return err;
+
+		ptrace_setfcr31(target, fcr31);
+	}
 
 	return err;
 }

From 006501e039eec411842bb3150c41358867d320c2 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:55:40 +0000
Subject: [PATCH 086/808] MIPS: Also verify sizeof `elf_fpreg_t' with
 PTRACE_SETREGSET

Complement commit d614fd58a283 ("mips/ptrace: Preserve previous
registers for short regset write") and like with the PTRACE_GETREGSET
ptrace(2) request also apply a BUILD_BUG_ON check for the size of the
`elf_fpreg_t' type in the PTRACE_SETREGSET request handler.

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write")
Cc: James Hogan <james.hogan@mips.com>
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v4.11+
Patchwork: https://patchwork.linux-mips.org/patch/17929/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 0a939593ccb7..256908951a7c 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -447,6 +447,7 @@ static int fpr_get_msa(struct task_struct *target,
 	u64 fpr_val;
 	int err;
 
+	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
 	for (i = 0; i < NUM_FPU_REGS; i++) {
 		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0);
 		err = user_regset_copyout(pos, count, kbuf, ubuf,

From c8c5a3a24d395b14447a9a89d61586a913840a3b Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 11 Dec 2017 22:56:54 +0000
Subject: [PATCH 087/808] MIPS: Disallow outsized PTRACE_SETREGSET NT_PRFPREG
 regset accesses

Complement commit c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use
correct core dump register layout") and also reject outsized
PTRACE_SETREGSET requests to the NT_PRFPREG regset, like with the
NT_PRSTATUS regset.

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout")
Cc: James Hogan <james.hogan@mips.com>
Cc: Paul Burton <Paul.Burton@mips.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Dave Martin <Dave.Martin@arm.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # v3.17+
Patchwork: https://patchwork.linux-mips.org/patch/17930/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/ptrace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 256908951a7c..0b23b1ad99e6 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -550,6 +550,9 @@ static int fpr_set(struct task_struct *target,
 
 	BUG_ON(count % sizeof(elf_fpreg_t));
 
+	if (pos + count > sizeof(elf_fpregset_t))
+		return -EIO;
+
 	init_fp_ctx(target);
 
 	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))

From 10a6a6975691775bbcc677a04c6fd3120b5c1160 Mon Sep 17 00:00:00 2001
From: Cyrille Pitchen <cyrille.pitchen@wedev4u.fr>
Date: Tue, 12 Dec 2017 14:40:12 +0100
Subject: [PATCH 088/808] Revert "dt-bindings: mtd: add sst25wf040b and en25s64
 to sip-nor list"

This reverts commit b07815d4eaf658b683c345d6e643895a20d92f29.

The reverted commit was merged into v4-15-rc1 by mistake: it was taken
from the IMX tree but the patch has never been sent to linux-mtd nor
reviewed by any spi-nor maintainers.

Actually, it would have been rejected since we add new values for the
'compatible' DT property only for SPI NOR memories that don't support
the JEDEC READ ID op code (0x9F).

Both en25s64 and sst25wf040b support the JEDEC READ ID op code, hence
should use the "jedec,spi-nor" string alone as 'compatible' value.

See the following link for more details:
http://lists.infradead.org/pipermail/linux-mtd/2017-November/077425.html

Signed-off-by: Cyrille Pitchen <cyrille.pitchen@wedev4u.fr>
Acked-by: Marek Vasut <marek.vasut@gmail.com>
---
 Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
index 376fa2f50e6b..956bb046e599 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
@@ -13,7 +13,6 @@ Required properties:
                  at25df321a
                  at25df641
                  at26df081a
-                 en25s64
                  mr25h128
                  mr25h256
                  mr25h10
@@ -33,7 +32,6 @@ Required properties:
                  s25fl008k
                  s25fl064k
                  sst25vf040b
-                 sst25wf040b
                  m25p40
                  m25p80
                  m25p16

From a782fc8cc6bf6909daf3b65630079e2afec316ef Mon Sep 17 00:00:00 2001
From: Monk Liu <Monk.Liu@amd.com>
Date: Fri, 1 Dec 2017 18:21:34 +0800
Subject: [PATCH 089/808] drm/ttm: fix incorrect calculate on shrink_pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

shrink_pages is in unit of Order after ttm_page_pool_free,
but it is used by nr_free in next round so need change
it into native page unit

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Roger He <Hongbo.He@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 44343a2bf55c..71945ccaf012 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -455,6 +455,7 @@ ttm_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		freed += (nr_free_pool - shrink_pages) << pool->order;
 		if (freed >= sc->nr_to_scan)
 			break;
+		shrink_pages <<= pool->order;
 	}
 	mutex_unlock(&lock);
 	return freed;

From 13d3fc69a03721d972460fe2bff9b479f7999221 Mon Sep 17 00:00:00 2001
From: Monk Liu <Monk.Liu@amd.com>
Date: Fri, 1 Dec 2017 18:23:56 +0800
Subject: [PATCH 090/808] drm/ttm: max_cpages is in unit of native page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix calculation.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 71945ccaf012..b5ba6441489f 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -544,7 +544,7 @@ static int ttm_alloc_new_pages(struct list_head *pages, gfp_t gfp_flags,
 	int r = 0;
 	unsigned i, j, cpages;
 	unsigned npages = 1 << order;
-	unsigned max_cpages = min(count, (unsigned)NUM_PAGES_TO_ALLOC);
+	unsigned max_cpages = min(count << order, (unsigned)NUM_PAGES_TO_ALLOC);
 
 	/* allocate array for page caching change */
 	caching_array = kmalloc(max_cpages*sizeof(struct page *), GFP_KERNEL);

From 0507f438ea19d4280006467ba02956f6a693deca Mon Sep 17 00:00:00 2001
From: Monk Liu <Monk.Liu@amd.com>
Date: Thu, 23 Nov 2017 18:38:59 +0800
Subject: [PATCH 091/808] drm/amdgpu: fix MAP_QUEUES paramter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Should be 0.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index da43813d67a4..5aeb5f8816f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2467,7 +2467,7 @@ static int gfx_v9_0_kiq_kcq_enable(struct amdgpu_device *adev)
 				  PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
 				  PACKET3_MAP_QUEUES_ME((ring->me == 1 ? 0 : 1)) |
 				  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
-				  PACKET3_MAP_QUEUES_ALLOC_FORMAT(1) | /* alloc format: all_on_one_pipe */
+				  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
 				  PACKET3_MAP_QUEUES_ENGINE_SEL(0) | /* engine_sel: compute */
 				  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
 		amdgpu_ring_write(kiq_ring, PACKET3_MAP_QUEUES_DOORBELL_OFFSET(ring->doorbell_index));

From 964728f9f407eca0b417fdf8e784b7a76979490c Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 13 Nov 2017 11:12:58 +0100
Subject: [PATCH 092/808] USB: chipidea: msm: fix ulpi-node lookup

Fix child-node lookup during probe, which ended up searching the whole
device tree depth-first starting at the parent rather than just matching
on its children.

Note that the original premature free of the parent node has already
been fixed separately, but that fix was apparently never backported to
stable.

Fixes: 47654a162081 ("usb: chipidea: msm: Restore wrapper settings after reset")
Fixes: b74c43156c0c ("usb: chipidea: msm: ci_hdrc_msm_probe() missing of_node_get()")
Cc: stable <stable@vger.kernel.org>     # 4.10: b74c43156c0c
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Cc: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
---
 drivers/usb/chipidea/ci_hdrc_msm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
index 3593ce0ec641..880009987460 100644
--- a/drivers/usb/chipidea/ci_hdrc_msm.c
+++ b/drivers/usb/chipidea/ci_hdrc_msm.c
@@ -247,7 +247,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_mux;
 
-	ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
+	ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
 	if (ulpi_node) {
 		phy_node = of_get_next_available_child(ulpi_node, NULL);
 		ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");

From f41d84dddc66b164ac16acf3f584c276146f1c48 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Date: Tue, 12 Dec 2017 17:59:15 +0530
Subject: [PATCH 093/808] powerpc/perf: Dereference BHRB entries safely

It's theoretically possible that branch instructions recorded in
BHRB (Branch History Rolling Buffer) entries have already been
unmapped before they are processed by the kernel. Hence, trying to
dereference such memory location will result in a crash. eg:

    Unable to handle kernel paging request for data at address 0xd000000019c41764
    Faulting instruction address: 0xc000000000084a14
    NIP [c000000000084a14] branch_target+0x4/0x70
    LR [c0000000000eb828] record_and_restart+0x568/0x5c0
    Call Trace:
    [c0000000000eb3b4] record_and_restart+0xf4/0x5c0 (unreliable)
    [c0000000000ec378] perf_event_interrupt+0x298/0x460
    [c000000000027964] performance_monitor_exception+0x54/0x70
    [c000000000009ba4] performance_monitor_common+0x114/0x120

Fix it by deferefencing the addresses safely.

Fixes: 691231846ceb ("powerpc/perf: Fix setting of "to" addresses for BHRB")
Cc: stable@vger.kernel.org # v3.10+
Suggested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
[mpe: Use probe_kernel_read() which is clearer, tweak change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/core-book3s.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 153812966365..fce545774d50 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
 	int ret;
 	__u64 target;
 
-	if (is_kernel_addr(addr))
-		return branch_target((unsigned int *)addr);
+	if (is_kernel_addr(addr)) {
+		if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+			return 0;
+
+		return branch_target(&instr);
+	}
 
 	/* Userspace: need copy instruction here then translate it */
 	pagefault_disable();

From ad2b6e01024ef23bddc3ce0bcb115ecd8c520b7e Mon Sep 17 00:00:00 2001
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Date: Tue, 5 Dec 2017 11:00:38 +0530
Subject: [PATCH 094/808] powerpc/perf/imc: Fix nest-imc cpuhotplug callback
 failure

Oops is observed during boot:

  Faulting instruction address: 0xc000000000248340
  cpu 0x0: Vector: 380 (Data Access Out of Range) at [c000000ff66fb850]
      pc: c000000000248340: event_function_call+0x50/0x1f0
      lr: c00000000024878c: perf_remove_from_context+0x3c/0x100
      sp: c000000ff66fbad0
     msr: 9000000000009033
     dar: 7d20e2a6f92d03c0
    pid = 14, comm = cpuhp/0

While registering the cpuhotplug callbacks for nest-imc, if we fail in
the cpuhotplug online path for any random node in a multi node
system (because the opal call to stop nest-imc counters fails for that
node), ppc_nest_imc_cpu_offline() will get invoked for other nodes who
successfully returned from cpuhotplug online path.

This call trace is generated since in the ppc_nest_imc_cpu_offline()
path we are trying to migrate the event context, when nest-imc
counters are not even initialized.

Patch to add a check to ensure that nest-imc is registered before
migrating the event context.

Fixes: 885dcd709ba9 ("powerpc/perf: Add nest IMC PMU support")
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/imc-pmu.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 0ead3cd73caa..f1b940714d65 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -309,6 +309,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 	if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
 		return 0;
 
+	/*
+	 * Check whether nest_imc is registered. We could end up here if the
+	 * cpuhotplug callback registration fails. i.e, callback invokes the
+	 * offline path for all successfully registered nodes. At this stage,
+	 * nest_imc pmu will not be registered and we should return here.
+	 *
+	 * We return with a zero since this is not an offline failure. And
+	 * cpuhp_setup_state() returns the actual failure reason to the caller,
+	 * which in turn will call the cleanup routine.
+	 */
+	if (!nest_pmus)
+		return 0;
+
 	/*
 	 * Now that this cpu is one of the designated,
 	 * find a next cpu a) which is online and b) in same chip.

From 110df8bd3e418b3476cae80babe8add48a8ea523 Mon Sep 17 00:00:00 2001
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Date: Thu, 7 Dec 2017 22:53:27 +0530
Subject: [PATCH 095/808] powerpc/perf: Fix kfree memory allocated for nest
 pmus

imc_common_cpuhp_mem_free() is the common function for all
IMC (In-memory Collection counters) domains to unregister cpuhotplug
callback and free memory. Since kfree of memory allocated for
nest-imc (per_nest_pmu_arr) is in the common code, all
domains (core/nest/thread) can do the kfree in the failure case.

This could potentially create a call trace as shown below, where
core(/thread/nest) imc pmu initialization fails and in the failure
path imc_common_cpuhp_mem_free() free the memory(per_nest_pmu_arr),
which is allocated by successfully registered nest units.

The call trace is generated in a scenario where core-imc
initialization is made to fail and a cpuhotplug is performed in a p9
system. During cpuhotplug ppc_nest_imc_cpu_offline() tries to access
per_nest_pmu_arr, which is already freed by core-imc.

  NIP [c000000000cb6a94] mutex_lock+0x34/0x90
  LR [c000000000cb6a88] mutex_lock+0x28/0x90
  Call Trace:
    mutex_lock+0x28/0x90 (unreliable)
    perf_pmu_migrate_context+0x90/0x3a0
    ppc_nest_imc_cpu_offline+0x190/0x1f0
    cpuhp_invoke_callback+0x160/0x820
    cpuhp_thread_fun+0x1bc/0x270
    smpboot_thread_fn+0x250/0x290
    kthread+0x1a8/0x1b0
    ret_from_kernel_thread+0x5c/0x74

To address this scenario do the kfree(per_nest_pmu_arr) only in case
of nest-imc initialization failure, and when there is no other nest
units registered.

Fixes: 73ce9aec65b1 ("powerpc/perf: Fix IMC_MAX_PMU macro")
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/imc-pmu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f1b940714d65..be4e7f84f70a 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1184,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 		if (nest_pmus == 1) {
 			cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
 			kfree(nest_imc_refc);
+			kfree(per_nest_pmu_arr);
 		}
 
 		if (nest_pmus > 0)
@@ -1208,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 		kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
 	kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
 	kfree(pmu_ptr);
-	kfree(per_nest_pmu_arr);
 	return;
 }
 
@@ -1322,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
 			ret = nest_pmu_cpumask_init();
 			if (ret) {
 				mutex_unlock(&nest_init_lock);
+				kfree(nest_imc_refc);
+				kfree(per_nest_pmu_arr);
 				goto err_free;
 			}
 		}

From a5f1005517534aeb1fac20180badfbf0896c183c Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Fri, 1 Dec 2017 18:47:32 +0100
Subject: [PATCH 096/808] s390/pci: handle insufficient resources during dma
 tlb flush

In a virtualized setup lazy flushing can lead to the hypervisor
running out of resources when lots of guest pages need to be
pinned. In this situation simply trigger a global flush to give
the hypervisor a chance to free some of these resources.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci_dma.c  | 21 +++++++++++++++++++--
 arch/s390/pci/pci_insn.c |  3 +++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index f7aa5a77827e..2d15d84c20ed 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -181,6 +181,9 @@ out_unlock:
 static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
 			   size_t size, int flags)
 {
+	unsigned long irqflags;
+	int ret;
+
 	/*
 	 * With zdev->tlb_refresh == 0, rpcit is not required to establish new
 	 * translations when previously invalid translation-table entries are
@@ -196,8 +199,22 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
 			return 0;
 	}
 
-	return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
-				  PAGE_ALIGN(size));
+	ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
+				 PAGE_ALIGN(size));
+	if (ret == -ENOMEM && !s390_iommu_strict) {
+		/* enable the hypervisor to free some resources */
+		if (zpci_refresh_global(zdev))
+			goto out;
+
+		spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags);
+		bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
+			      zdev->lazy_bitmap, zdev->iommu_pages);
+		bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
+		spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags);
+		ret = 0;
+	}
+out:
+	return ret;
 }
 
 static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 19bcb3b45a70..f069929e8211 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -89,6 +89,9 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
 	if (cc)
 		zpci_err_insn(cc, status, addr, range);
 
+	if (cc == 1 && (status == 4 || status == 16))
+		return -ENOMEM;
+
 	return (cc) ? -EIO : 0;
 }
 

From 366d8216488319ed29308b977cd62b7964a779b7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 13 Dec 2017 09:21:59 +0100
Subject: [PATCH 097/808] s390/sclp: disable FORTIFY_SOURCE for early sclp code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Michal Suchánek reported the following compile error with
FORTIFY_SOURCE enabled:

drivers/s390/char/sclp_early_core.o: In function `memcpy':
include/linux/string.h:340: undefined reference to `fortify_panic'

To fix this simply disable FORTIFY_SOURCE on the early sclp code as
well, which I forgot on the initial commit.

Fixes: 79962038dffa ("s390: add support for FORTIFY_SOURCE")
Reported-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/char/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index 05ac6ba15a53..614b44e70a28 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -17,6 +17,8 @@ CFLAGS_REMOVE_sclp_early_core.o	+= $(CC_FLAGS_MARCH)
 CFLAGS_sclp_early_core.o		+= -march=z900
 endif
 
+CFLAGS_sclp_early_core.o		+= -D__NO_FORTIFY
+
 obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \
 	 sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \
 	 sclp_early.o sclp_early_core.o

From ed52870f4676489124d8697fd00e6ae6c504e586 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Mon, 4 Dec 2017 22:21:30 -0800
Subject: [PATCH 098/808] KVM: MMU: Fix infinite loop when there is no
 available mmu page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The below test case can cause infinite loop in kvm when ept=0.

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <stdint.h>
    #include <linux/kvm.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>

    long r[5];
    int main()
    {
    	r[2] = open("/dev/kvm", O_RDONLY);
    	r[3] = ioctl(r[2], KVM_CREATE_VM, 0);
    	r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7);
    	ioctl(r[4], KVM_RUN, 0);
    }

It doesn't setup the memory regions, mmu_alloc_shadow/direct_roots() in
kvm return 1 when kvm fails to allocate root page table which can result
in beblow infinite loop:

    vcpu_run() {
    	for (;;) {
	    	r = vcpu_enter_guest()::kvm_mmu_reload() returns 1
	    	if (r <= 0)
	    		break;
	    	if (need_resched())
	    		cond_resched();
      }
    }

This patch fixes it by returning -ENOSPC when there is no available kvm mmu
page for root page table.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 26eeb53cf0f (KVM: MMU: Bail out immediately if there is no available mmu page)
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e5e66e5c6640..c4deb1f34faa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 		spin_lock(&vcpu->kvm->mmu_lock);
 		if(make_mmu_pages_available(vcpu) < 0) {
 			spin_unlock(&vcpu->kvm->mmu_lock);
-			return 1;
+			return -ENOSPC;
 		}
 		sp = kvm_mmu_get_page(vcpu, 0, 0,
 				vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
@@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 			spin_lock(&vcpu->kvm->mmu_lock);
 			if (make_mmu_pages_available(vcpu) < 0) {
 				spin_unlock(&vcpu->kvm->mmu_lock);
-				return 1;
+				return -ENOSPC;
 			}
 			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
 					i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
@@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		spin_lock(&vcpu->kvm->mmu_lock);
 		if (make_mmu_pages_available(vcpu) < 0) {
 			spin_unlock(&vcpu->kvm->mmu_lock);
-			return 1;
+			return -ENOSPC;
 		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
@@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		spin_lock(&vcpu->kvm->mmu_lock);
 		if (make_mmu_pages_available(vcpu) < 0) {
 			spin_unlock(&vcpu->kvm->mmu_lock);
-			return 1;
+			return -ENOSPC;
 		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
 				      0, ACC_ALL);

From d73235d17ba63b53dc0e1051dbc10a1f1be91b71 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 7 Dec 2017 00:30:08 -0800
Subject: [PATCH 099/808] KVM: X86: Fix load RFLAGS w/o the fixed bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 *** Guest State ***
 CR0: actual=0x0000000000000030, shadow=0x0000000060000010, gh_mask=fffffffffffffff7
 CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffe871
 CR3 = 0x00000000fffbc000
 RSP = 0x0000000000000000  RIP = 0x0000000000000000
 RFLAGS=0x00000000         DR7 = 0x0000000000000400
        ^^^^^^^^^^

The failed vmentry is triggered by the following testcase when ept=Y:

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <stdint.h>
    #include <linux/kvm.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>

    long r[5];
    int main()
    {
    	r[2] = open("/dev/kvm", O_RDONLY);
    	r[3] = ioctl(r[2], KVM_CREATE_VM, 0);
    	r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7);
    	struct kvm_regs regs = {
    		.rflags = 0,
    	};
    	ioctl(r[4], KVM_SET_REGS, &regs);
    	ioctl(r[4], KVM_RUN, 0);
    }

X86 RFLAGS bit 1 is fixed set, userspace can simply clearing bit 1
of RFLAGS with KVM_SET_REGS ioctl which results in vmentry fails.
This patch fixes it by oring X86_EFLAGS_FIXED during ioctl.

Cc: stable@vger.kernel.org
Suggested-by: Jim Mattson <jmattson@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Quan Xu <quan.xu0@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faf843c9b916..154ea27746e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7384,7 +7384,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 #endif
 
 	kvm_rip_write(vcpu, regs->rip);
-	kvm_set_rflags(vcpu, regs->rflags);
+	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
 	vcpu->arch.exception.pending = false;
 

From 5663d8f9bbe4bf15488f7351efb61ea20fa6de06 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 12 Dec 2017 17:15:02 +0100
Subject: [PATCH 100/808] kvm: x86: fix WARN due to uninitialized guest FPU
 state

------------[ cut here ]------------
 Bad FPU state detected at kvm_put_guest_fpu+0xd8/0x2d0 [kvm], reinitializing FPU registers.
 WARNING: CPU: 1 PID: 4594 at arch/x86/mm/extable.c:103 ex_handler_fprestore+0x88/0x90
 CPU: 1 PID: 4594 Comm: qemu-system-x86 Tainted: G    B      OE    4.15.0-rc2+ #10
 RIP: 0010:ex_handler_fprestore+0x88/0x90
 Call Trace:
  fixup_exception+0x4e/0x60
  do_general_protection+0xff/0x270
  general_protection+0x22/0x30
 RIP: 0010:kvm_put_guest_fpu+0xd8/0x2d0 [kvm]
 RSP: 0018:ffff8803d5627810 EFLAGS: 00010246
  kvm_vcpu_reset+0x3b4/0x3c0 [kvm]
  kvm_apic_accept_events+0x1c0/0x240 [kvm]
  kvm_arch_vcpu_ioctl_run+0x1658/0x2fb0 [kvm]
  kvm_vcpu_ioctl+0x479/0x880 [kvm]
  do_vfs_ioctl+0x142/0x9a0
  SyS_ioctl+0x74/0x80
  do_syscall_64+0x15f/0x600

where kvm_put_guest_fpu is called without a prior kvm_load_guest_fpu.
To fix it, move kvm_load_guest_fpu to the very beginning of
kvm_arch_vcpu_ioctl_run.

Cc: stable@vger.kernel.org
Fixes: f775b13eedee2f7f3c6fdd4e90fb79090ce5d339
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 154ea27746e9..56d036b9ad75 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7264,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	struct fpu *fpu = &current->thread.fpu;
 	int r;
 
-	fpu__initialize(fpu);
-
 	kvm_sigset_activate(vcpu);
 
+	kvm_load_guest_fpu(vcpu);
+
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		if (kvm_run->immediate_exit) {
 			r = -EINTR;
@@ -7296,14 +7295,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	kvm_load_guest_fpu(vcpu);
-
 	if (unlikely(vcpu->arch.complete_userspace_io)) {
 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 		vcpu->arch.complete_userspace_io = NULL;
 		r = cui(vcpu);
 		if (r <= 0)
-			goto out_fpu;
+			goto out;
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7312,9 +7309,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		r = vcpu_run(vcpu);
 
-out_fpu:
-	kvm_put_guest_fpu(vcpu);
 out:
+	kvm_put_guest_fpu(vcpu);
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
 

From 19e8e54f4309eaa438237aa1973fe40c331903d4 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:19 +0100
Subject: [PATCH 101/808] tools/kvm_stat: fix command line option '-g'

Specifying a guest via '-g foo' always results in an error:
  $ kvm_stat -g foo
  Usage: kvm_stat [options]

  kvm_stat: error: Error while searching for guest "foo", use "-p" to
  specify a pid instead

Reason is that Tui.get_pid_from_gname() is not static, as it is supposed
to be.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 217cf6f95c36..884a74b8ca87 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -950,7 +950,8 @@ class Tui(object):
             curses.nocbreak()
             curses.endwin()
 
-    def get_all_gnames(self):
+    @staticmethod
+    def get_all_gnames():
         """Returns a list of (pid, gname) tuples of all running guests"""
         res = []
         try:
@@ -963,7 +964,7 @@ class Tui(object):
             # perform a sanity check before calling the more expensive
             # function to possibly extract the guest name
             if ' -name ' in line[1]:
-                res.append((line[0], self.get_gname_from_pid(line[0])))
+                res.append((line[0], Tui.get_gname_from_pid(line[0])))
         child.stdout.close()
 
         return res
@@ -984,7 +985,8 @@ class Tui(object):
         except Exception:
             self.screen.addstr(row + 1, 2, 'Not available')
 
-    def get_pid_from_gname(self, gname):
+    @staticmethod
+    def get_pid_from_gname(gname):
         """Fuzzy function to convert guest name to QEMU process pid.
 
         Returns a list of potential pids, can be empty if no match found.
@@ -992,7 +994,7 @@ class Tui(object):
 
         """
         pids = []
-        for line in self.get_all_gnames():
+        for line in Tui.get_all_gnames():
             if gname == line[1]:
                 pids.append(int(line[0]))
 

From faa06650418bf28d07426fcfdc5213782fb131f6 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:20 +0100
Subject: [PATCH 102/808] tools/kvm_stat: fix drilldown in events-by-guests
 mode

When displaying debugfs events listed by guests, an attempt to switch to
reporting of stats for individual child trace events results in garbled
output. Reason is that when toggling drilldown, the update of the stats
doesn't honor when events are displayed by guests, as indicated by
Tui._display_guests.
To reproduce, run 'kvm_stat -d' and press 'b' followed by 'x'.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 884a74b8ca87..6347ad5d0d35 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1360,7 +1360,7 @@ class Tui(object):
                 if char == 'x':
                     self.update_drilldown()
                     # prevents display of current values on next refresh
-                    self.stats.get()
+                    self.stats.get(self._display_guests)
             except KeyboardInterrupt:
                 break
             except curses.error:

From 67c162b0892ac481e47bef06d9c6231ee993843a Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:21 +0100
Subject: [PATCH 103/808] tools/kvm_stat: fix missing field update after filter
 change

When updating the fields filter, tracepoint events of fields previously
not visible were not enabled, as TracepointProvider.update_fields()
updated the member variable directly instead of using the setter, which
triggers the event enable/disable.
To reproduce, run 'kvm_stat -f kvm_exit', press 'c' to remove the
filter, and notice that no add'l fields that do not match the regex
'kvm_exit' will appear.
This issue was introduced by commit c469117df059 ("tools/kvm_stat:
simplify initializers").

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 6347ad5d0d35..f133755fdde2 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -549,8 +549,8 @@ class TracepointProvider(Provider):
 
     def update_fields(self, fields_filter):
         """Refresh fields, applying fields_filter"""
-        self._fields = [field for field in self.get_available_fields()
-                        if self.is_field_wanted(fields_filter, field)]
+        self.fields = [field for field in self.get_available_fields()
+                       if self.is_field_wanted(fields_filter, field)]
 
     @staticmethod
     def get_online_cpus():

From b74faa930deb2e37ed5caa0abfc687c8c532e946 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:22 +0100
Subject: [PATCH 104/808] tools/kvm_stat: fix extra handling of 'help' with
 fields filter

Commit 67fbcd62f54d ("tools/kvm_stat: add '-f help' to get the available
event list") added support for '-f help'. However, the extra handling of
'help' will also take effect when 'help' is specified as a regex in
interactive mode via 'f'. This results in display of all events while
only those matching this regex should be shown.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index f133755fdde2..4faf9f85a00e 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -478,7 +478,7 @@ class Provider(object):
     @staticmethod
     def is_field_wanted(fields_filter, field):
         """Indicate whether field is valid according to fields_filter."""
-        if not fields_filter or fields_filter == "help":
+        if not fields_filter:
             return True
         return re.match(fields_filter, field) is not None
 
@@ -1567,6 +1567,7 @@ def main():
     stats = Stats(options)
 
     if options.fields == "help":
+        stats.fields_filter = None
         event_list = "\n"
         s = stats.get()
         for key in s.keys():

From fff8c9eb48aa58259071b5df0e6d4c1c0bc1ba51 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:23 +0100
Subject: [PATCH 105/808] tools/kvm_stat: fix child trace events accounting

Child trace events were included in calculation of the overall total,
which is used for calculation of the percentages of the '%Total' column.
However, the parent trace envents' stats summarize the child trace
events, hence we'd incorrectly account for them twice, leading to
slightly wrong stats.
With this fix, we use the correct total. Consequently, the sum of the
child trace events' '%Total' column values is identical to the
respective value of the respective parent event. However, this also
means that the sum of the '%Total' column values will aggregate to more
than 100 percent.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat     | 6 +++---
 tools/kvm/kvm_stat/kvm_stat.txt | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 4faf9f85a00e..90f0445d7808 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1092,14 +1092,14 @@ class Tui(object):
             # sort by totals
             return (0, -stats[x][0])
         total = 0.
-        for val in stats.values():
-            total += val[0]
+        for key in stats.keys():
+            if key.find('(') is -1:
+                total += stats[key][0]
         if self._sorting == SORT_DEFAULT:
             sortkey = sortCurAvg
         else:
             sortkey = sortTotal
         for key in sorted(stats.keys(), key=sortkey):
-
             if row >= self.screen.getmaxyx()[0]:
                 break
             values = stats[key]
diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
index e5cf836be8a1..75368a3c285f 100644
--- a/tools/kvm/kvm_stat/kvm_stat.txt
+++ b/tools/kvm/kvm_stat/kvm_stat.txt
@@ -50,6 +50,8 @@ INTERACTIVE COMMANDS
 *s*::   set update interval
 
 *x*::	toggle reporting of stats for child trace events
+ ::     *Note*: The stats for the parents summarize the respective child trace
+                events
 
 Press any other key to refresh statistics immediately.
 

From f3d11b0e8619bbb053d3e13f2271819fb01c1e2a Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:24 +0100
Subject: [PATCH 106/808] tools/kvm_stat: add hint on '-f help' to man page

The man page update for this new functionality was omitted.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
index 75368a3c285f..b5b3810c9e94 100644
--- a/tools/kvm/kvm_stat/kvm_stat.txt
+++ b/tools/kvm/kvm_stat/kvm_stat.txt
@@ -88,7 +88,7 @@ OPTIONS
 
 -f<fields>::
 --fields=<fields>::
-	fields to display (regex)
+	fields to display (regex), "-f help" for a list of available events
 
 -h::
 --help::

From 08e20a6300e106d5feb89c9e47ea479533fec46f Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:25 +0100
Subject: [PATCH 107/808] tools/kvm_stat: handle invalid regular expressions

Passing an invalid regular expression on the command line results in a
traceback. Note that interactive specification of invalid regular
expressions is not affected
To reproduce, run "kvm_stat -f '*'".

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 90f0445d7808..29c56f3a05dc 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1521,6 +1521,13 @@ Press any other key to refresh statistics immediately.
                          callback=cb_guest_to_pid,
                          )
     (options, _) = optparser.parse_args(sys.argv)
+    try:
+        # verify that we were passed a valid regex up front
+        re.compile(options.fields)
+    except re.error:
+        sys.exit('Error: "' + options.fields + '" is not a valid regular '
+                 'expression')
+
     return options
 
 
From 822cfe3e4813c8f52199362b0e689fba9459ddc9 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:26 +0100
Subject: [PATCH 108/808] tools/kvm_stat: suppress usage information on command
 line errors

Errors while parsing the '-g' command line argument result in display of
usage information prior to the error message. This is a bit confusing,
as the command line is syntactically correct.
To reproduce, run 'kvm_stat -g' and specify a non-existing or inactive
guest.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 29c56f3a05dc..bf65531570f5 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1453,16 +1453,13 @@ Press any other key to refresh statistics immediately.
         try:
             pids = Tui.get_pid_from_gname(val)
         except:
-            raise optparse.OptionValueError('Error while searching for guest '
-                                            '"{}", use "-p" to specify a pid '
-                                            'instead'.format(val))
+            sys.exit('Error while searching for guest "{}". Use "-p" to '
+                     'specify a pid instead?'.format(val))
         if len(pids) == 0:
-            raise optparse.OptionValueError('No guest by the name "{}" '
-                                            'found'.format(val))
+            sys.exit('Error: No guest by the name "{}" found'.format(val))
         if len(pids) > 1:
-            raise optparse.OptionValueError('Multiple processes found (pids: '
-                                            '{}) - use "-p" to specify a pid '
-                                            'instead'.format(" ".join(pids)))
+            sys.exit('Error: Multiple processes found (pids: {}). Use "-p" '
+                     'to specify the desired pid'.format(" ".join(pids)))
         parser.values.pid = pids[0]
 
     optparser = optparse.OptionParser(description=description_text,

From 73fab6ffbd83795e38974bb438e7afce0242c61a Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:27 +0100
Subject: [PATCH 109/808] tools/kvm_stat: stop ignoring unhandled arguments

Unhandled arguments, which could easily include typos, are simply
ignored. We should be strict to avoid undetected typos.
To reproduce start kvm_stat with an extra argument, e.g.
'kvm_stat -d bnuh5ol' and note that this will actually work.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index bf65531570f5..aa3bc47af1d0 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1517,7 +1517,9 @@ Press any other key to refresh statistics immediately.
                          help='restrict statistics to guest by name',
                          callback=cb_guest_to_pid,
                          )
-    (options, _) = optparser.parse_args(sys.argv)
+    options, unkn = optparser.parse_args(sys.argv)
+    if len(unkn) != 1:
+        sys.exit('Error: Extra argument(s): ' + ' '.join(unkn[1:]))
     try:
         # verify that we were passed a valid regex up front
         re.compile(options.fields)

From cf656c76614c6ec5b016233cac29738881c83c08 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Mon, 11 Dec 2017 12:25:29 +0100
Subject: [PATCH 110/808] tools/kvm_stat: add line for totals

Add a line for the total number of events and current average at the
bottom of the body.
Note that both values exclude child trace events. I.e. if drilldown is
activated via interactive command 'x', only the totals are accounted, or
we'd be counting these twice (see previous commit "tools/kvm_stat: fix
child trace events accounting").

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index aa3bc47af1d0..566a70ddd005 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1099,8 +1099,9 @@ class Tui(object):
             sortkey = sortCurAvg
         else:
             sortkey = sortTotal
+        tavg = 0
         for key in sorted(stats.keys(), key=sortkey):
-            if row >= self.screen.getmaxyx()[0]:
+            if row >= self.screen.getmaxyx()[0] - 1:
                 break
             values = stats[key]
             if not values[0] and not values[1]:
@@ -1112,9 +1113,15 @@ class Tui(object):
                 self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' %
                                    (key, values[0], values[0] * 100 / total,
                                     cur))
+                if cur is not '' and key.find('(') is -1:
+                    tavg += cur
             row += 1
         if row == 3:
             self.screen.addstr(4, 1, 'No matching events reported yet')
+        else:
+            self.screen.addstr(row, 1, '%-40s %10d        %8s' %
+                               ('Total', total, tavg if tavg else ''),
+                               curses.A_BOLD)
         self.screen.refresh()
 
     def show_msg(self, text):

From 2797c4a11f373b2545c2398ccb02e362ee66a142 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 4 Dec 2017 13:25:13 +0000
Subject: [PATCH 111/808] drm/i915: Flush pending GTT writes before unbinding

From the shrinker paths, we want to relinquish the GPU and GGTT access to
the object, releasing the backing storage back to the system for
swapout. As a part of that process we would unpin the pages, marking
them for access by the CPU (for the swapout/swapin). However, if that
process was interrupted after unbind the vma, we missed a flush of the
inflight GGTT writes before we made that GTT space available again for
reuse, with the prospect that we would redirect them to another page.

The bug dates back to the introduction of multiple GGTT vma, but the
code itself dates to commit 02bef8f98d26 ("drm/i915: Unbind closed vma
for i915_gem_object_unbind()").

Fixes: 02bef8f98d26 ("drm/i915: Unbind closed vma for i915_gem_object_unbind()")
Fixes: c5ad54cf7dd8 ("drm/i915: Use partial view in mmap fault handler")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: stable@vger.kernel.org
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171204132513.7303-1-chris@chris-wilson.co.uk
(cherry picked from commit 5888fc9eac3c2ff96e76aeeb865fdb46ab2d711e)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ad4050f7ab3b..18de6569d04a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -330,17 +330,10 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 	 * must wait for all rendering to complete to the object (as unbinding
 	 * must anyway), and retire the requests.
 	 */
-	ret = i915_gem_object_wait(obj,
-				   I915_WAIT_INTERRUPTIBLE |
-				   I915_WAIT_LOCKED |
-				   I915_WAIT_ALL,
-				   MAX_SCHEDULE_TIMEOUT,
-				   NULL);
+	ret = i915_gem_object_set_to_cpu_domain(obj, false);
 	if (ret)
 		return ret;
 
-	i915_gem_retire_requests(to_i915(obj->base.dev));
-
 	while ((vma = list_first_entry_or_null(&obj->vma_list,
 					       struct i915_vma,
 					       obj_link))) {

From 2b3a2e9f400acff4a4a9a2316e3e13b36b76b0e9 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 7 Dec 2017 22:00:25 +0000
Subject: [PATCH 112/808] drm/i915: Drop fb reference on load_detect_pipe
 failure path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When intel_modeset_setup_plane_state() fails drop the local framebuffer
reference before jumping to the error, otherwise we leak the framebuffer.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Fixes: edde361711ef ("drm/i915: Use atomic state to obtain load detection crtc, v3.")
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171207220025.22698-1-chris@chris-wilson.co.uk
(cherry picked from commit 3e72be177cf19ab3d62b3084d424dce7e71d847f)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_display.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index e8ccf89cb17b..ff9397030092 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -9944,11 +9944,10 @@ found:
 	}
 
 	ret = intel_modeset_setup_plane_state(state, crtc, mode, fb, 0, 0);
+	drm_framebuffer_put(fb);
 	if (ret)
 		goto fail;
 
-	drm_framebuffer_put(fb);
-
 	ret = drm_atomic_set_mode_for_crtc(&crtc_state->base, mode);
 	if (ret)
 		goto fail;

From 74c7b0782b15bc2478f557cea34b3fe34d452dc6 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 8 Dec 2017 12:10:33 +0000
Subject: [PATCH 113/808] drm/i915: Stop listening to request resubmission from
 the signaler kthread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The intent here was that we would be listening to
i915_gem_request_unsubmit in order to cancel the signaler quickly and
release the reference on the request. Cancelling the signaler is done
directly via intel_engine_cancel_signaling (called from unsubmit), but
that does not directly wake up the signaling thread, and neither does
setting the request->global_seqno back to zero wake up listeners to the
request->execute waitqueue. So the only time that listening to the
request->execute waitqueue would wake up the signaling kthread would be
on the request resubmission, during which time we would already receive
wake ups from rejoining the global breadcrumbs wait rbtree.

Trying to wake up to release the request remains an issue. If the
signaling was cancelled and no other request required signaling, then it
is possible for us to shutdown with the reference on the request still
held. To ensure that we do not try to shutdown, leaking that request, we
kick the signaling threads whenever we disarm the breadcrumbs, i.e. on
parking the engine when idle.

v2: We do need to be sure to release the last reference on stopping the
kthread; asserting that it has been dropped already is insufficient.

Fixes: d6a2289d9d6b ("drm/i915: Remove the preempted request from the execution queue")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171208121033.5236-1-chris@chris-wilson.co.uk
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
(cherry picked from commit 776bc27fd8ab67a675cb0041d3af361af5d0e290)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 5f8b9f1f40f1..bcbc7abe6693 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -186,7 +186,7 @@ void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
 	struct intel_wait *wait, *n, *first;
 
 	if (!b->irq_armed)
-		return;
+		goto wakeup_signaler;
 
 	/* We only disarm the irq when we are idle (all requests completed),
 	 * so if the bottom-half remains asleep, it missed the request
@@ -208,6 +208,14 @@ void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
 	b->waiters = RB_ROOT;
 
 	spin_unlock_irq(&b->rb_lock);
+
+	/*
+	 * The signaling thread may be asleep holding a reference to a request,
+	 * that had its signaling cancelled prior to being preempted. We need
+	 * to kick the signaler, just in case, to release any such reference.
+	 */
+wakeup_signaler:
+	wake_up_process(b->signaler);
 }
 
 static bool use_fake_irq(const struct intel_breadcrumbs *b)
@@ -651,23 +659,15 @@ static int intel_breadcrumbs_signaler(void *arg)
 		}
 
 		if (unlikely(do_schedule)) {
-			DEFINE_WAIT(exec);
-
 			if (kthread_should_park())
 				kthread_parkme();
 
-			if (kthread_should_stop()) {
-				GEM_BUG_ON(request);
+			if (unlikely(kthread_should_stop())) {
+				i915_gem_request_put(request);
 				break;
 			}
 
-			if (request)
-				add_wait_queue(&request->execute, &exec);
-
 			schedule();
-
-			if (request)
-				remove_wait_queue(&request->execute, &exec);
 		}
 		i915_gem_request_put(request);
 	} while (1);

From 2cf654db8d7eafb973d28eb3cddf043d353e1345 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 13 Dec 2017 09:48:02 +0000
Subject: [PATCH 114/808] drm/i915/fence: Use rcu to defer freeing of irq_work

It is illegal to perform an immediate free of the struct irq_work from
inside the irq_work callback (as irq_work_run_list modifies work->flags
after execution of the work->func()). As we use the irq_work to
coordinate the freeing of the callback from two different softirq paths,
we need to defer the kfree from inside our irq_work callback, for which
we can use kfree_rcu.

Fixes: 81c0ed21aa91 ("drm/i915/fence: Avoid del_timer_sync() from inside a timer")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171213094802.28243-1-chris@chris-wilson.co.uk
(cherry picked from commit 7d622351c94172a42bfe9b13bdb0fdc2be90ed3b)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_sw_fence.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index e8ca67a129d2..ac236b88c99c 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -367,6 +367,7 @@ struct i915_sw_dma_fence_cb {
 	struct dma_fence *dma;
 	struct timer_list timer;
 	struct irq_work work;
+	struct rcu_head rcu;
 };
 
 static void timer_i915_sw_fence_wake(struct timer_list *t)
@@ -406,7 +407,7 @@ static void irq_i915_sw_fence_work(struct irq_work *wrk)
 	del_timer_sync(&cb->timer);
 	dma_fence_put(cb->dma);
 
-	kfree(cb);
+	kfree_rcu(cb, rcu);
 }
 
 int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,

From 958d022e326810fd762505bd02007aced79ffcbc Mon Sep 17 00:00:00 2001
From: "oder_chiou@realtek.com" <oder_chiou@realtek.com>
Date: Thu, 14 Dec 2017 09:54:07 +0800
Subject: [PATCH 115/808] ASoC: rt5663: Fix the wrong result of the first jack
 detection

In the first jack detection while booting, the result will always show as
headset, even we insert the headphone.

Signed-off-by: Oder Chiou <oder_chiou@realtek.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/rt5663.c | 4 ++++
 sound/soc/codecs/rt5663.h | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/sound/soc/codecs/rt5663.c b/sound/soc/codecs/rt5663.c
index b036c9dc0c8c..d329bf719d80 100644
--- a/sound/soc/codecs/rt5663.c
+++ b/sound/soc/codecs/rt5663.c
@@ -1560,6 +1560,10 @@ static int rt5663_jack_detect(struct snd_soc_codec *codec, int jack_insert)
 			RT5663_IRQ_POW_SAV_MASK, RT5663_IRQ_POW_SAV_EN);
 		snd_soc_update_bits(codec, RT5663_IRQ_1,
 			RT5663_EN_IRQ_JD1_MASK, RT5663_EN_IRQ_JD1_EN);
+		snd_soc_update_bits(codec, RT5663_EM_JACK_TYPE_1,
+			RT5663_EM_JD_MASK, RT5663_EM_JD_RST);
+		snd_soc_update_bits(codec, RT5663_EM_JACK_TYPE_1,
+			RT5663_EM_JD_MASK, RT5663_EM_JD_NOR);
 
 		while (true) {
 			regmap_read(rt5663->regmap, RT5663_INT_ST_2, &val);
diff --git a/sound/soc/codecs/rt5663.h b/sound/soc/codecs/rt5663.h
index c5a9b69579ad..03adc8004ba9 100644
--- a/sound/soc/codecs/rt5663.h
+++ b/sound/soc/codecs/rt5663.h
@@ -1029,6 +1029,10 @@
 #define RT5663_POL_EXT_JD_SHIFT			10
 #define RT5663_POL_EXT_JD_EN			(0x1 << 10)
 #define RT5663_POL_EXT_JD_DIS			(0x0 << 10)
+#define RT5663_EM_JD_MASK			(0x1 << 7)
+#define RT5663_EM_JD_SHIFT			7
+#define RT5663_EM_JD_NOR			(0x1 << 7)
+#define RT5663_EM_JD_RST			(0x0 << 7)
 
 /* DACREF LDO Control (0x0112)*/
 #define RT5663_PWR_LDO_DACREFL_MASK		(0x1 << 9)

From c1cfd9025cc394fd137a01159d74335c5ac978ce Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 14 Dec 2017 16:44:12 +0100
Subject: [PATCH 116/808] ALSA: rawmidi: Avoid racy info ioctl via ctl device

The rawmidi also allows to obtaining the information via ioctl of ctl
API.  It means that user can issue an ioctl to the rawmidi device even
when it's being removed as long as the control device is present.
Although the code has some protection via the global register_mutex,
its range is limited to the search of the corresponding rawmidi
object, and the mutex is already unlocked at accessing the rawmidi
object.  This may lead to a use-after-free.

For avoiding it, this patch widens the application of register_mutex
to the whole snd_rawmidi_info_select() function.  We have another
mutex per rawmidi object, but this operation isn't very hot path, so
it shouldn't matter from the performance POV.

Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/rawmidi.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index b3b353d72527..f055ca10bbc1 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -579,15 +579,14 @@ static int snd_rawmidi_info_user(struct snd_rawmidi_substream *substream,
 	return 0;
 }
 
-int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info)
+static int __snd_rawmidi_info_select(struct snd_card *card,
+				     struct snd_rawmidi_info *info)
 {
 	struct snd_rawmidi *rmidi;
 	struct snd_rawmidi_str *pstr;
 	struct snd_rawmidi_substream *substream;
 
-	mutex_lock(&register_mutex);
 	rmidi = snd_rawmidi_search(card, info->device);
-	mutex_unlock(&register_mutex);
 	if (!rmidi)
 		return -ENXIO;
 	if (info->stream < 0 || info->stream > 1)
@@ -603,6 +602,16 @@ int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info
 	}
 	return -ENXIO;
 }
+
+int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info)
+{
+	int ret;
+
+	mutex_lock(&register_mutex);
+	ret = __snd_rawmidi_info_select(card, info);
+	mutex_unlock(&register_mutex);
+	return ret;
+}
 EXPORT_SYMBOL(snd_rawmidi_info_select);
 
 static int snd_rawmidi_info_select_user(struct snd_card *card,

From b7b2846fe26f2c0d7f317c874a13d3ecf22670ff Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 7 Dec 2017 19:07:02 -0800
Subject: [PATCH 117/808] xfs: add the ability to join a held buffer to a
 defer_ops

In certain cases, defer_ops callers will lock a buffer and want to hold
the lock across transaction rolls.  Similar to ijoined inodes, we want
to dirty & join the buffer with each transaction roll in defer_finish so
that afterwards the caller still owns the buffer lock and we haven't
inadvertently pinned the log.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_defer.c | 39 ++++++++++++++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_defer.h |  5 ++++-
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 072ebfe1d6ae..087fea02c389 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -249,6 +249,10 @@ xfs_defer_trans_roll(
 	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
 		xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
 
+	/* Hold the (previously bjoin'd) buffer locked across the roll. */
+	for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++)
+		xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]);
+
 	trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
 
 	/* Roll the transaction. */
@@ -264,6 +268,12 @@ xfs_defer_trans_roll(
 	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
 		xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
 
+	/* Rejoin the buffers and dirty them so the log moves forward. */
+	for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) {
+		xfs_trans_bjoin(*tp, dop->dop_bufs[i]);
+		xfs_trans_bhold(*tp, dop->dop_bufs[i]);
+	}
+
 	return error;
 }
 
@@ -295,6 +305,31 @@ xfs_defer_ijoin(
 		}
 	}
 
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Add this buffer to the deferred op.  Each joined buffer is relogged
+ * each time we roll the transaction.
+ */
+int
+xfs_defer_bjoin(
+	struct xfs_defer_ops		*dop,
+	struct xfs_buf			*bp)
+{
+	int				i;
+
+	for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) {
+		if (dop->dop_bufs[i] == bp)
+			return 0;
+		else if (dop->dop_bufs[i] == NULL) {
+			dop->dop_bufs[i] = bp;
+			return 0;
+		}
+	}
+
+	ASSERT(0);
 	return -EFSCORRUPTED;
 }
 
@@ -493,9 +528,7 @@ xfs_defer_init(
 	struct xfs_defer_ops		*dop,
 	xfs_fsblock_t			*fbp)
 {
-	dop->dop_committed = false;
-	dop->dop_low = false;
-	memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
+	memset(dop, 0, sizeof(struct xfs_defer_ops));
 	*fbp = NULLFSBLOCK;
 	INIT_LIST_HEAD(&dop->dop_intake);
 	INIT_LIST_HEAD(&dop->dop_pending);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index d4f046dd44bd..045beacdd37d 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -59,6 +59,7 @@ enum xfs_defer_ops_type {
 };
 
 #define XFS_DEFER_OPS_NR_INODES	2	/* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS	2	/* join up to two buffers */
 
 struct xfs_defer_ops {
 	bool			dop_committed;	/* did any trans commit? */
@@ -66,8 +67,9 @@ struct xfs_defer_ops {
 	struct list_head	dop_intake;	/* unlogged pending work */
 	struct list_head	dop_pending;	/* logged pending work */
 
-	/* relog these inodes with each roll */
+	/* relog these with each roll */
 	struct xfs_inode	*dop_inodes[XFS_DEFER_OPS_NR_INODES];
+	struct xfs_buf		*dop_bufs[XFS_DEFER_OPS_NR_BUFS];
 };
 
 void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
@@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop);
 void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
 bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
 int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp);
 
 /* Description of a deferred type. */
 struct xfs_defer_op_type {

From 6e643cd094de3bd0f97edcc1db0089afa24d909f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 7 Dec 2017 19:07:02 -0800
Subject: [PATCH 118/808] xfs: hold xfs_buf locked between shortform->leaf
 conversion and the addition of an attribute

The new attribute leaf buffer is not held locked across the transaction
roll between the shortform->leaf modification and the addition of the
new entry.  As a result, the attribute buffer modification being made is
not atomic from an operational perspective.  Hence the AIL push can grab
it in the transient state of "just created" after the initial
transaction is rolled, because the buffer has been released.  This leads
to xfs_attr3_leaf_verify() asserting that hdr.count is zero, treating
this as in-memory corruption, and shutting down the filesystem.

Darrick ported the original patch to 4.15 and reworked it use the
xfs_defer_bjoin helper and hold/join the buffer correctly across the
second transaction roll.

Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_attr.c      | 20 +++++++++++++++-----
 fs/xfs/libxfs/xfs_attr_leaf.c |  9 ++++++---
 fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 6249c92671de..a76914db72ef 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -212,6 +212,7 @@ xfs_attr_set(
 	int			flags)
 {
 	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*leaf_bp = NULL;
 	struct xfs_da_args	args;
 	struct xfs_defer_ops	dfops;
 	struct xfs_trans_res	tres;
@@ -327,9 +328,16 @@ xfs_attr_set(
 		 * GROT: another possible req'mt for a double-split btree op.
 		 */
 		xfs_defer_init(args.dfops, args.firstblock);
-		error = xfs_attr_shortform_to_leaf(&args);
+		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
 		if (error)
 			goto out_defer_cancel;
+		/*
+		 * Prevent the leaf buffer from being unlocked so that a
+		 * concurrent AIL push cannot grab the half-baked leaf
+		 * buffer and run into problems with the write verifier.
+		 */
+		xfs_trans_bhold(args.trans, leaf_bp);
+		xfs_defer_bjoin(args.dfops, leaf_bp);
 		xfs_defer_ijoin(args.dfops, dp);
 		error = xfs_defer_finish(&args.trans, args.dfops);
 		if (error)
@@ -337,13 +345,14 @@ xfs_attr_set(
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
-		 * transaction to add the new attribute to the leaf.
+		 * transaction to add the new attribute to the leaf, which
+		 * means that we have to hold & join the leaf buffer here too.
 		 */
-
 		error = xfs_trans_roll_inode(&args.trans, dp);
 		if (error)
 			goto out;
-
+		xfs_trans_bjoin(args.trans, leaf_bp);
+		leaf_bp = NULL;
 	}
 
 	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
@@ -374,8 +383,9 @@ xfs_attr_set(
 
 out_defer_cancel:
 	xfs_defer_cancel(&dfops);
-	args.trans = NULL;
 out:
+	if (leaf_bp)
+		xfs_trans_brelse(args.trans, leaf_bp);
 	if (args.trans)
 		xfs_trans_cancel(args.trans);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 53cc8b986eac..601eaa36f1ad 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -735,10 +735,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 }
 
 /*
- * Convert from using the shortform to the leaf.
+ * Convert from using the shortform to the leaf.  On success, return the
+ * buffer so that we can keep it locked until we're totally done with it.
  */
 int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+xfs_attr_shortform_to_leaf(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**leaf_bp)
 {
 	xfs_inode_t *dp;
 	xfs_attr_shortform_t *sf;
@@ -818,7 +821,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 	}
 	error = 0;
-
+	*leaf_bp = bp;
 out:
 	kmem_free(tmpbuffer);
 	return error;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index f7dda0c237b0..894124efb421 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -48,7 +48,8 @@ void	xfs_attr_shortform_create(struct xfs_da_args *args);
 void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
 int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
+			struct xfs_buf **leaf_bp);
 int	xfs_attr_shortform_remove(struct xfs_da_args *args);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);

From 8c57b88637d78a723e0854fc3d06c6d4c31a1e0c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:53 -0800
Subject: [PATCH 119/808] xfs: account for null transactions in bunmapi

In e1a4e37cc7b665 ("xfs: try to avoid blowing out the transaction
reservation when bunmaping a shared extent"), we try to constrain the
amount of real extents we unmap from the data fork in a given call so
that we don't blow out transaction reservations.

However, not all bunmapi operations require a transaction -- if we're
only removing a delalloc extent, no transaction is needed, so we have to
code against that.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 1210f684d3c2..1bddbba6b80c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5136,7 +5136,7 @@ __xfs_bunmapi(
 	 * blowing out the transaction with a mix of EFIs and reflink
 	 * adjustments.
 	 */
-	if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+	if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
 		max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
 	else
 		max_len = len;

From c54854a437a447a6bb1dcb11f60dd01cef3fa597 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:54 -0800
Subject: [PATCH 120/808] xfs: move xfs_iext_insert tracepoint to report useful
 information

Move the tracepoint in xfs_iext_insert to after the point where we've
inserted the extent because otherwise we report stale extent data in
the ftrace output.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_iext_tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 89bf16b4d937..b0f31791c7e6 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -632,8 +632,6 @@ xfs_iext_insert(
 	struct xfs_iext_leaf	*new = NULL;
 	int			nr_entries, i;
 
-	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
-
 	if (ifp->if_height == 0)
 		xfs_iext_alloc_root(ifp, cur);
 	else if (ifp->if_height == 1)
@@ -661,6 +659,8 @@ xfs_iext_insert(
 	xfs_iext_set(cur_rec(cur), irec);
 	ifp->if_bytes += sizeof(struct xfs_iext_rec);
 
+	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
 	if (new)
 		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
 }

From 5c989a0ee06eb77a44baffd1779a5dbb9a7e873f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:54 -0800
Subject: [PATCH 121/808] xfs: remove dest file's post-eof preallocations
 before reflinking

If we try to reflink into a file with post-eof preallocations at an
offset well past the preallocations, we increase i_size as one would
expect.  However, those allocations do not have page cache backing them,
so they won't get cleaned out on their own.  This leads to asserts in
the collapse/insert range code and xfs_destroy_inode when they encounter
delalloc extents they weren't expecting to find.

Since there are plenty of other places where we dump those post-eof
blocks, do the same to the reflink destination file before we start
remapping extents.  This was found by adding clonerange support to
fsstress and running it in write-only mode.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_reflink.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cf7c8f81bebb..e13f5ad57a03 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1291,6 +1291,17 @@ xfs_reflink_remap_range(
 
 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
+	/*
+	 * Clear out post-eof preallocations because we don't have page cache
+	 * backing the delayed allocations and they'll never get freed on
+	 * their own.
+	 */
+	if (xfs_can_free_eofblocks(dest, true)) {
+		ret = xfs_free_eofblocks(dest);
+		if (ret)
+			goto out_unlock;
+	}
+
 	/* Set flags and remap blocks. */
 	ret = xfs_reflink_set_inode_flag(src, dest);
 	if (ret)

From 73353f486c9b5b2407ec32be1004174dbbaf6c18 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:55 -0800
Subject: [PATCH 122/808] xfs: relax is_reflink_inode assert in
 xfs_reflink_find_cow_mapping

We don't hold the ilock through the entire sequence of xfs_writepage_map
-> xfs_map_cow -> xfs_reflink_find_cow_mapping.  This means that we can
race with another thread that is trying to clear the inode reflink flag,
with the result that the flag is set for the xfs_map_cow check but
cleared before we get to the assert in find_cow_mapping.  When this
happens, we blow the assert even though everything is fine.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_reflink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e13f5ad57a03..99c5852f9fe7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -490,8 +490,9 @@ xfs_reflink_find_cow_mapping(
 	struct xfs_iext_cursor		icur;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
-	ASSERT(xfs_is_reflink_inode(ip));
 
+	if (!xfs_is_reflink_inode(ip))
+		return false;
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
 		return false;

From 9d40fba8b2056773b9744a95df9ddd6cc33a4f83 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:55 -0800
Subject: [PATCH 123/808] xfs: avoid infinite loop when cancelling CoW blocks
 after writeback failure

When we're cancelling a cow range, we don't always delete each extent
that we iterate, so we have to move icur backwards in the list to avoid
an infinite loop.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_reflink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 99c5852f9fe7..6931b0c79cac 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -611,6 +611,9 @@ xfs_reflink_cancel_cow_blocks(
 
 			/* Remove the mapping from the CoW fork. */
 			xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
+		} else {
+			/* Didn't do anything, push cursor back. */
+			xfs_iext_prev(ifp, &icur);
 		}
 next_extent:
 		if (!xfs_iext_get_extent(ifp, &icur, &got))

From a192de265b26c525672884630d5376c405e83b2a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 10 Dec 2017 18:03:56 -0800
Subject: [PATCH 124/808] xfs: allow CoW remap transactions to use reserve
 blocks

Since we as yet have no way of holding on to the indlen blocks that are
reserved as part of CoW fork delalloc reservations, let the CoW remap
transaction dip into the reserves so that we avoid failing writes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_reflink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 6931b0c79cac..e49e6db415f7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -729,7 +729,7 @@ xfs_reflink_end_cow(
 			(unsigned int)(end_fsb - offset_fsb),
 			XFS_DATA_FORK);
 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
-			resblks, 0, 0, &tp);
+			resblks, 0, XFS_TRANS_RESERVE, &tp);
 	if (error)
 		goto out;
 

From 093b8886f446c9351c4de512cb1d4afe30e37f6f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Tue, 12 Dec 2017 10:23:28 -0800
Subject: [PATCH 125/808] scsi: core: Use blist_flags_t consistently

Use the type blist_flags_t for all variables that represent blacklist
flags. Additionally, suppress recently introduced sparse warnings
related to blacklist flags.

[mkp: fixed commit id]

Fixes: 5ebde4694e3b ("scsi: Use 'blist_flags_t' for scsi_devinfo flags")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_devinfo.c       |  6 ++----
 drivers/scsi/scsi_scan.c          | 13 +++++++------
 drivers/scsi/scsi_sysfs.c         |  5 +++--
 drivers/scsi/scsi_transport_spi.c | 12 +++++++-----
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c
index 449ef5adbb2b..dfb8da83fa50 100644
--- a/drivers/scsi/scsi_devinfo.c
+++ b/drivers/scsi/scsi_devinfo.c
@@ -374,10 +374,8 @@ int scsi_dev_info_list_add_keyed(int compatible, char *vendor, char *model,
 			    model, compatible);
 
 	if (strflags)
-		devinfo->flags = simple_strtoul(strflags, NULL, 0);
-	else
-		devinfo->flags = flags;
-
+		flags = (__force blist_flags_t)simple_strtoul(strflags, NULL, 0);
+	devinfo->flags = flags;
 	devinfo->compatible = compatible;
 
 	if (compatible)
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index be5e919db0e8..0880d975eed3 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -770,7 +770,7 @@ static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
  *     SCSI_SCAN_LUN_PRESENT: a new scsi_device was allocated and initialized
  **/
 static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
-		int *bflags, int async)
+		blist_flags_t *bflags, int async)
 {
 	int ret;
 
@@ -1049,14 +1049,15 @@ static unsigned char *scsi_inq_str(unsigned char *buf, unsigned char *inq,
  *   - SCSI_SCAN_LUN_PRESENT: a new scsi_device was allocated and initialized
  **/
 static int scsi_probe_and_add_lun(struct scsi_target *starget,
-				  u64 lun, int *bflagsp,
+				  u64 lun, blist_flags_t *bflagsp,
 				  struct scsi_device **sdevp,
 				  enum scsi_scan_mode rescan,
 				  void *hostdata)
 {
 	struct scsi_device *sdev;
 	unsigned char *result;
-	int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
+	blist_flags_t bflags;
+	int res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
 	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
 
 	/*
@@ -1201,7 +1202,7 @@ static int scsi_probe_and_add_lun(struct scsi_target *starget,
  *     Modifies sdevscan->lun.
  **/
 static void scsi_sequential_lun_scan(struct scsi_target *starget,
-				     int bflags, int scsi_level,
+				     blist_flags_t bflags, int scsi_level,
 				     enum scsi_scan_mode rescan)
 {
 	uint max_dev_lun;
@@ -1292,7 +1293,7 @@ static void scsi_sequential_lun_scan(struct scsi_target *starget,
  *     0: scan completed (or no memory, so further scanning is futile)
  *     1: could not scan with REPORT LUN
  **/
-static int scsi_report_lun_scan(struct scsi_target *starget, int bflags,
+static int scsi_report_lun_scan(struct scsi_target *starget, blist_flags_t bflags,
 				enum scsi_scan_mode rescan)
 {
 	unsigned char scsi_cmd[MAX_COMMAND_SIZE];
@@ -1538,7 +1539,7 @@ static void __scsi_scan_target(struct device *parent, unsigned int channel,
 		unsigned int id, u64 lun, enum scsi_scan_mode rescan)
 {
 	struct Scsi_Host *shost = dev_to_shost(parent);
-	int bflags = 0;
+	blist_flags_t bflags = 0;
 	int res;
 	struct scsi_target *starget;
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 50e7d7e4a861..a9996c16f4ae 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -967,7 +967,8 @@ sdev_show_wwid(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL);
 
-#define BLIST_FLAG_NAME(name) [ilog2(BLIST_##name)] = #name
+#define BLIST_FLAG_NAME(name)					\
+	[ilog2((__force unsigned int)BLIST_##name)] = #name
 static const char *const sdev_bflags_name[] = {
 #include "scsi_devinfo_tbl.c"
 };
@@ -984,7 +985,7 @@ sdev_show_blacklist(struct device *dev, struct device_attribute *attr,
 	for (i = 0; i < sizeof(sdev->sdev_bflags) * BITS_PER_BYTE; i++) {
 		const char *name = NULL;
 
-		if (!(sdev->sdev_bflags & BIT(i)))
+		if (!(sdev->sdev_bflags & (__force blist_flags_t)BIT(i)))
 			continue;
 		if (i < ARRAY_SIZE(sdev_bflags_name) && sdev_bflags_name[i])
 			name = sdev_bflags_name[i];
diff --git a/drivers/scsi/scsi_transport_spi.c b/drivers/scsi/scsi_transport_spi.c
index d0219e36080c..10ebb213ddb3 100644
--- a/drivers/scsi/scsi_transport_spi.c
+++ b/drivers/scsi/scsi_transport_spi.c
@@ -50,14 +50,14 @@
 
 /* Our blacklist flags */
 enum {
-	SPI_BLIST_NOIUS = 0x1,
+	SPI_BLIST_NOIUS = (__force blist_flags_t)0x1,
 };
 
 /* blacklist table, modelled on scsi_devinfo.c */
 static struct {
 	char *vendor;
 	char *model;
-	unsigned flags;
+	blist_flags_t flags;
 } spi_static_device_list[] __initdata = {
 	{"HP", "Ultrium 3-SCSI", SPI_BLIST_NOIUS },
 	{"IBM", "ULTRIUM-TD3", SPI_BLIST_NOIUS },
@@ -221,9 +221,11 @@ static int spi_device_configure(struct transport_container *tc,
 {
 	struct scsi_device *sdev = to_scsi_device(dev);
 	struct scsi_target *starget = sdev->sdev_target;
-	unsigned bflags = scsi_get_device_flags_keyed(sdev, &sdev->inquiry[8],
-						      &sdev->inquiry[16],
-						      SCSI_DEVINFO_SPI);
+	blist_flags_t bflags;
+
+	bflags = scsi_get_device_flags_keyed(sdev, &sdev->inquiry[8],
+					     &sdev->inquiry[16],
+					     SCSI_DEVINFO_SPI);
 
 	/* Populate the target capability fields with the values
 	 * gleaned from the device inquiry */

From 5771cfffdffe709ae9b403b6f80438ca40bf850e Mon Sep 17 00:00:00 2001
From: Prasad B Munirathnam <prasad.munirathnam@microsemi.com>
Date: Tue, 12 Dec 2017 11:40:10 -0800
Subject: [PATCH 126/808] scsi: aacraid: Fix I/O drop during reset

"FIB_CONTEXT_FLAG_TIMEDOUT" flag is set in aac_eh_abort to indicate
command timeout. Using the same flag in reset handler causes the command
to time out and the I/Os were dropped.

Define a new flag "FIB_CONTEXT_FLAG_EH_RESET" to make sure I/O is
properly handled in eh_reset handler.

[mkp: tweaked commit message]

Signed-off-by: Prasad B Munirathnam <prasad.munirathnam@microsemi.com>
Reviewed-by: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/aacraid/aacraid.h | 1 +
 drivers/scsi/aacraid/linit.c   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 6e3d81969a77..d52265416da2 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -1725,6 +1725,7 @@ struct aac_dev
 #define FIB_CONTEXT_FLAG_NATIVE_HBA		(0x00000010)
 #define FIB_CONTEXT_FLAG_NATIVE_HBA_TMF	(0x00000020)
 #define FIB_CONTEXT_FLAG_SCSI_CMD	(0x00000040)
+#define FIB_CONTEXT_FLAG_EH_RESET	(0x00000080)
 
 /*
  *	Define the command values
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index bdf127aaab41..d55332de08f9 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1037,7 +1037,7 @@ static int aac_eh_bus_reset(struct scsi_cmnd* cmd)
 			info = &aac->hba_map[bus][cid];
 			if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS ||
 			    info->devtype != AAC_DEVTYPE_NATIVE_RAW) {
-				fib->flags |= FIB_CONTEXT_FLAG_TIMED_OUT;
+				fib->flags |= FIB_CONTEXT_FLAG_EH_RESET;
 				cmd->SCp.phase = AAC_OWNER_ERROR_HANDLER;
 			}
 		}

From 08933099e6404f588f81c2050bfec7313e06eeaf Mon Sep 17 00:00:00 2001
From: Daniele Palmas <dnlplm@gmail.com>
Date: Thu, 14 Dec 2017 16:54:45 +0100
Subject: [PATCH 127/808] USB: serial: option: add support for Telit ME910 PID
 0x1101

This patch adds support for PID 0x1101 of Telit ME910.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/option.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 3b3513874cfd..b02fb576b856 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -280,6 +280,7 @@ static void option_instat_callback(struct urb *urb);
 #define TELIT_PRODUCT_LE922_USBCFG3		0x1043
 #define TELIT_PRODUCT_LE922_USBCFG5		0x1045
 #define TELIT_PRODUCT_ME910			0x1100
+#define TELIT_PRODUCT_ME910_DUAL_MODEM		0x1101
 #define TELIT_PRODUCT_LE920			0x1200
 #define TELIT_PRODUCT_LE910			0x1201
 #define TELIT_PRODUCT_LE910_USBCFG4		0x1206
@@ -645,6 +646,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
 	.reserved = BIT(1) | BIT(3),
 };
 
+static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
+	.sendsetup = BIT(0),
+	.reserved = BIT(3),
+};
+
 static const struct option_blacklist_info telit_le910_blacklist = {
 	.sendsetup = BIT(0),
 	.reserved = BIT(1) | BIT(2),
@@ -1244,6 +1250,8 @@ static const struct usb_device_id option_ids[] = {
 		.driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
 		.driver_info = (kernel_ulong_t)&telit_me910_blacklist },
+	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
+		.driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
 		.driver_info = (kernel_ulong_t)&telit_le910_blacklist },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),

From 92a18a657fb2e2ffbfa0659af32cc18fd2346516 Mon Sep 17 00:00:00 2001
From: Reinhard Speyerer <rspmn@arcor.de>
Date: Fri, 15 Dec 2017 00:39:27 +0100
Subject: [PATCH 128/808] USB: serial: qcserial: add Sierra Wireless EM7565
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sierra Wireless EM7565 devices use the QCSERIAL_SWI layout for their
serial ports

T:  Bus=01 Lev=03 Prnt=29 Port=01 Cnt=02 Dev#= 31 Spd=480  MxCh= 0
D:  Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs=  1
P:  Vendor=1199 ProdID=9091 Rev= 0.06
S:  Manufacturer=Sierra Wireless, Incorporated
S:  Product=Sierra Wireless EM7565 Qualcomm Snapdragon X16 LTE-A
S:  SerialNumber=xxxxxxxx
C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=qcserial
E:  Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=qcserial
E:  Ad=83(I) Atr=03(Int.) MxPS=  10 Ivl=32ms
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=qcserial
E:  Ad=85(I) Atr=03(Int.) MxPS=  10 Ivl=32ms
E:  Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
E:  Ad=86(I) Atr=03(Int.) MxPS=   8 Ivl=32ms
E:  Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms

but need sendsetup = true for the NMEA port to make it work properly.

Simplify the patch compared to v1 as suggested by Bjørn Mork by taking
advantage of the fact that existing devices work with sendsetup = true
too.

Use sendsetup = true for the NMEA interface of QCSERIAL_SWI and add
DEVICE_SWI entries for the EM7565 PID 0x9091 and the EM7565 QDL PID
0x9090.

Tests with several MC73xx/MC74xx/MC77xx devices have been performed in
order to verify backward compatibility.

Signed-off-by: Reinhard Speyerer <rspmn@arcor.de>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/qcserial.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index e3892541a489..613f91add03d 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -162,6 +162,8 @@ static const struct usb_device_id id_table[] = {
 	{DEVICE_SWI(0x1199, 0x9079)},	/* Sierra Wireless EM74xx */
 	{DEVICE_SWI(0x1199, 0x907a)},	/* Sierra Wireless EM74xx QDL */
 	{DEVICE_SWI(0x1199, 0x907b)},	/* Sierra Wireless EM74xx */
+	{DEVICE_SWI(0x1199, 0x9090)},	/* Sierra Wireless EM7565 QDL */
+	{DEVICE_SWI(0x1199, 0x9091)},	/* Sierra Wireless EM7565 */
 	{DEVICE_SWI(0x413c, 0x81a2)},	/* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
 	{DEVICE_SWI(0x413c, 0x81a3)},	/* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
 	{DEVICE_SWI(0x413c, 0x81a4)},	/* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
@@ -342,6 +344,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
 			break;
 		case 2:
 			dev_dbg(dev, "NMEA GPS interface found\n");
+			sendsetup = true;
 			break;
 		case 3:
 			dev_dbg(dev, "Modem port found\n");

From 967a6a07e95c58eb9c1581d22a1d9c2d1929843f Mon Sep 17 00:00:00 2001
From: Masaharu Hayakawa <masaharu.hayakawa.ry@renesas.com>
Date: Wed, 13 Dec 2017 11:33:00 +0900
Subject: [PATCH 129/808] mmc: renesas_sdhi: Add MODULE_LICENSE

The following error occurs when loading renesas_sdhi_core.c module,
so add MODULE_LICENSE("GPL v2").

 renesas_sdhi_core: module license 'unspecified' taints kernel.

Signed-off-by: Masaharu Hayakawa <masaharu.hayakawa.ry@renesas.com>
Fixes: 9d08428afb72 ("mmc: renesas-sdhi: make renesas_sdhi_sys_dmac main module file")
Cc: <stable@vger.kernel.org> # v4.13+
[Shimoda: Added Fixes tag and Cc to the stable ML]
Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/renesas_sdhi_core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c
index fcf7235d5742..157e1d9e7725 100644
--- a/drivers/mmc/host/renesas_sdhi_core.c
+++ b/drivers/mmc/host/renesas_sdhi_core.c
@@ -24,6 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/clk.h>
 #include <linux/slab.h>
+#include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/mmc/host.h>
@@ -667,3 +668,5 @@ int renesas_sdhi_remove(struct platform_device *pdev)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(renesas_sdhi_remove);
+
+MODULE_LICENSE("GPL v2");

From f29810335965ac1f7bcb501ee2af5f039f792416 Mon Sep 17 00:00:00 2001
From: Lan Tianyu <tianyu.lan@intel.com>
Date: Thu, 14 Dec 2017 03:01:52 -0500
Subject: [PATCH 130/808] KVM/x86: Check input paging mode when cs.l is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported by syzkaller:
    WARNING: CPU: 0 PID: 27962 at arch/x86/kvm/emulate.c:5631 x86_emulate_insn+0x557/0x15f0 [kvm]
    Modules linked in: kvm_intel kvm [last unloaded: kvm]
    CPU: 0 PID: 27962 Comm: syz-executor Tainted: G    B   W        4.15.0-rc2-next-20171208+ #32
    Hardware name: Intel Corporation S1200SP/S1200SP, BIOS S1200SP.86B.01.03.0006.040720161253 04/07/2016
    RIP: 0010:x86_emulate_insn+0x557/0x15f0 [kvm]
    RSP: 0018:ffff8807234476d0 EFLAGS: 00010282
    RAX: 0000000000000000 RBX: ffff88072d0237a0 RCX: ffffffffa0065c4d
    RDX: 1ffff100e5a046f9 RSI: 0000000000000003 RDI: ffff88072d0237c8
    RBP: ffff880723447728 R08: ffff88072d020000 R09: ffffffffa008d240
    R10: 0000000000000002 R11: ffffed00e7d87db3 R12: ffff88072d0237c8
    R13: ffff88072d023870 R14: ffff88072d0238c2 R15: ffffffffa008d080
    FS:  00007f8a68666700(0000) GS:ffff880802200000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000000002009506c CR3: 000000071fec4005 CR4: 00000000003626f0
    Call Trace:
     x86_emulate_instruction+0x3bc/0xb70 [kvm]
     ? reexecute_instruction.part.162+0x130/0x130 [kvm]
     vmx_handle_exit+0x46d/0x14f0 [kvm_intel]
     ? trace_event_raw_event_kvm_entry+0xe7/0x150 [kvm]
     ? handle_vmfunc+0x2f0/0x2f0 [kvm_intel]
     ? wait_lapic_expire+0x25/0x270 [kvm]
     vcpu_enter_guest+0x720/0x1ef0 [kvm]
     ...

When CS.L is set, vcpu should run in the 64 bit paging mode.
Current kvm set_sregs function doesn't have such check when
userspace inputs sreg values. This will lead unexpected behavior.
This patch is to add checks for CS.L, EFER.LME, EFER.LMA and
CR4.PAE when get SREG inputs from userspace in order to avoid
unexpected behavior.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jim Mattson <jmattson@google.com>
Signed-off-by: Tianyu Lan <tianyu.lan@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 56d036b9ad75..3a82f2d4333b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7494,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
+int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) {
+		/*
+		 * When EFER.LME and CR0.PG are set, the processor is in
+		 * 64-bit mode (though maybe in a 32-bit code segment).
+		 * CR4.PAE and EFER.LMA must be set.
+		 */
+		if (!(sregs->cr4 & X86_CR4_PAE_BIT)
+		    || !(sregs->efer & EFER_LMA))
+			return -EINVAL;
+	} else {
+		/*
+		 * Not in 64-bit mode: EFER.LMA is clear and the code
+		 * segment cannot be 64-bit.
+		 */
+		if (sregs->efer & EFER_LMA || sregs->cs.l)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
@@ -7506,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 			(sregs->cr4 & X86_CR4_OSXSAVE))
 		return -EINVAL;
 
+	if (kvm_valid_sregs(vcpu, sregs))
+		return -EINVAL;
+
 	apic_base_msr.data = sregs->apic_base;
 	apic_base_msr.host_initiated = true;
 	if (kvm_set_apic_base(vcpu, &apic_base_msr))

From 046046737bd35bed047460f080ea47e186be731e Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 15 Nov 2017 10:43:16 +0100
Subject: [PATCH 131/808] phy: tegra: fix device-tree node lookups

Fix child-node lookups during probe, which ended up searching the whole
device tree depth-first starting at the parents rather than just
matching on their children.

To make things worse, some parent nodes could end up being being
prematurely freed (by tegra_xusb_pad_register()) as
of_find_node_by_name() drops a reference to its first argument.

Fixes: 53d2a715c240 ("phy: Add Tegra XUSB pad controller support")
Cc: stable <stable@vger.kernel.org>     # 4.7
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/tegra/xusb.c | 58 ++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index 4307bf0013e1..63e916d4d069 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
 static struct device_node *
 tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(padctl->dev->of_node);
+	struct device_node *pads, *np;
 
-	np = of_find_node_by_name(np, "pads");
-	if (np)
-		np = of_find_node_by_name(np, name);
+	pads = of_get_child_by_name(padctl->dev->of_node, "pads");
+	if (!pads)
+		return NULL;
+
+	np = of_get_child_by_name(pads, name);
+	of_node_put(pads);
 
 	return np;
 }
@@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
 static struct device_node *
 tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(pad->dev.of_node);
+	struct device_node *np, *lanes;
 
-	np = of_find_node_by_name(np, "lanes");
-	if (!np)
+	lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
+	if (!lanes)
 		return NULL;
 
-	return of_find_node_by_name(np, pad->soc->lanes[index].name);
+	np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
+	of_node_put(lanes);
+
+	return np;
 }
 
 static int
@@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
 	unsigned int i;
 	int err;
 
-	children = of_find_node_by_name(pad->dev.of_node, "lanes");
+	children = of_get_child_by_name(pad->dev.of_node, "lanes");
 	if (!children)
 		return -ENODEV;
 
@@ -444,21 +444,21 @@ static struct device_node *
 tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
 			  unsigned int index)
 {
-	/*
-	 * of_find_node_by_name() drops a reference, so make sure to grab one.
-	 */
-	struct device_node *np = of_node_get(padctl->dev->of_node);
+	struct device_node *ports, *np;
+	char *name;
 
-	np = of_find_node_by_name(np, "ports");
-	if (np) {
-		char *name;
+	ports = of_get_child_by_name(padctl->dev->of_node, "ports");
+	if (!ports)
+		return NULL;
 
-		name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
-		if (!name)
-			return ERR_PTR(-ENOMEM);
-		np = of_find_node_by_name(np, name);
-		kfree(name);
+	name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
+	if (!name) {
+		of_node_put(ports);
+		return ERR_PTR(-ENOMEM);
 	}
+	np = of_get_child_by_name(ports, name);
+	kfree(name);
+	of_node_put(ports);
 
 	return np;
 }
@@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
 
 static int tegra_xusb_padctl_probe(struct platform_device *pdev)
 {
-	struct device_node *np = of_node_get(pdev->dev.of_node);
+	struct device_node *np = pdev->dev.of_node;
 	const struct tegra_xusb_padctl_soc *soc;
 	struct tegra_xusb_padctl *padctl;
 	const struct of_device_id *match;
@@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
 	int err;
 
 	/* for backwards compatibility with old device trees */
-	np = of_find_node_by_name(np, "pads");
+	np = of_get_child_by_name(np, "pads");
 	if (!np) {
 		dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
 		return tegra_xusb_padctl_legacy_probe(pdev);

From e796cc6a3a9186c92092e2f5929cf8f65b56cf01 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Fri, 17 Nov 2017 16:55:35 +0530
Subject: [PATCH 132/808] phy: cpcap-usb: Fix platform_get_irq_byname's error
 checking.

The platform_get_irq_byname() function returns negative if an error occurs.
zero or positive number on success. platform_get_irq_byname() error
checking for zero is not correct.

Fixes: 6d6ce40f63af ("phy: cpcap-usb: Add CPCAP PMIC USB support")
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Reviewed-by: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/motorola/phy-cpcap-usb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c
index accaaaccb662..6601ad0dfb3a 100644
--- a/drivers/phy/motorola/phy-cpcap-usb.c
+++ b/drivers/phy/motorola/phy-cpcap-usb.c
@@ -310,7 +310,7 @@ static int cpcap_usb_init_irq(struct platform_device *pdev,
 	int irq, error;
 
 	irq = platform_get_irq_byname(pdev, name);
-	if (!irq)
+	if (irq < 0)
 		return -ENODEV;
 
 	error = devm_request_threaded_irq(ddata->dev, irq, NULL,

From 3cb0ab6e008f2a9ffe2d1be4246984003caed7e2 Mon Sep 17 00:00:00 2001
From: Chris Zhong <zyw@rock-chips.com>
Date: Thu, 8 Sep 2016 10:38:11 -0700
Subject: [PATCH 133/808] phy: rockchip-typec: add pm_runtime_disable in err
 case

Add pm_runtime_disable in err case to make the pm_runtime_enable/disable
is invoked balanced.

Signed-off-by: Chris Zhong <zyw@rock-chips.com>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/rockchip/phy-rockchip-typec.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/phy/rockchip/phy-rockchip-typec.c b/drivers/phy/rockchip/phy-rockchip-typec.c
index ee85fa0ca4b0..7492c8978217 100644
--- a/drivers/phy/rockchip/phy-rockchip-typec.c
+++ b/drivers/phy/rockchip/phy-rockchip-typec.c
@@ -1137,6 +1137,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
 		if (IS_ERR(phy)) {
 			dev_err(dev, "failed to create phy: %s\n",
 				child_np->name);
+			pm_runtime_disable(dev);
 			return PTR_ERR(phy);
 		}
 
@@ -1146,6 +1147,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
 	phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
 	if (IS_ERR(phy_provider)) {
 		dev_err(dev, "Failed to register phy provider\n");
+		pm_runtime_disable(dev);
 		return PTR_ERR(phy_provider);
 	}
 

From 2b88212c4cc67ff33dec5bb4d690044b97a5f979 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:56:36 +0100
Subject: [PATCH 134/808] phy: rcar-gen3-usb2: select USB_COMMON

When USB is disabled, we get a link error for this driver
because of the added OTG support

drivers/phy/renesas/phy-rcar-gen3-usb2.o: In function `rcar_gen3_phy_usb2_probe':
phy-rcar-gen3-usb2.c:(.text+0x250): undefined reference to `of_usb_get_dr_mode_by_phy'

Other phy drivers select USB_COMMON for this, so let's do the same
here.

Fixes: 7e0540f41332 ("phy: rcar-gen3-usb2: check dr_mode for otg mode")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/phy/renesas/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/phy/renesas/Kconfig b/drivers/phy/renesas/Kconfig
index cb09245e9b4c..c845facacb06 100644
--- a/drivers/phy/renesas/Kconfig
+++ b/drivers/phy/renesas/Kconfig
@@ -12,7 +12,9 @@ config PHY_RCAR_GEN3_USB2
 	tristate "Renesas R-Car generation 3 USB 2.0 PHY driver"
 	depends on ARCH_RENESAS
 	depends on EXTCON
+	depends on USB_SUPPORT
 	select GENERIC_PHY
+	select USB_COMMON
 	help
 	  Support for USB 2.0 PHY found on Renesas R-Car generation 3 SoCs.
 

From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 15 Dec 2017 05:09:47 -0800
Subject: [PATCH 135/808] cgroup: use strlcpy() instead of strscpy() to avoid
 spurious warning

As long as cft->name is guaranteed to be NUL-terminated, using strlcpy() would
work just as well and avoid that warning, so the change below could be folded
into that commit.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 18d71fbd3923..f4c2f8cb5748 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
+		strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
+		strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }

From 2d17d8d79e77ff3f1b35b87522fc72fa562260ff Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 14 Dec 2017 17:17:56 -0800
Subject: [PATCH 136/808] xdp: linearize skb in netif_receive_generic_xdp()

In netif_receive_generic_xdp(), it is necessary to linearize all
nonlinear skb. However, in current implementation, skb with
troom <= 0 are not linearized. This patch fixes this by calling
skb_linearize() for all nonlinear skb.

Fixes: de8f3a83b0a0 ("bpf: add meta pointer for direct access")
Signed-off-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index f47e96b62308..01ee854454a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3904,7 +3904,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 			goto do_drop;
-		if (troom > 0 && __skb_linearize(skb))
+		if (skb_linearize(skb))
 			goto do_drop;
 	}
 

From 9f37e797547cca9d14fe1f0f43f5c89b261ff0b0 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 15 Dec 2017 14:16:04 +0100
Subject: [PATCH 137/808] s390: fix preemption race in disable_sacf_uaccess

With CONFIG_PREEMPT=y there is a possible race in disable_sacf_uaccess.

The new set_fs value needs to be stored the the task structure first,
the control register update needs to be second. Otherwise a preemptive
schedule may interrupt the code right after the control register update
has been done and the next time the task is scheduled we get an incorrect
value in the control register due to the old set_fs setting.

Fixes: 0aaba41b58 ("s390: remove all code using the access register mode")
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/lib/uaccess.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index cae5a1e16cbd..c4f8039a35e8 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -89,11 +89,11 @@ EXPORT_SYMBOL(enable_sacf_uaccess);
 
 void disable_sacf_uaccess(mm_segment_t old_fs)
 {
+	current->thread.mm_segment = old_fs;
 	if (old_fs == USER_DS && test_facility(27)) {
 		__ctl_load(S390_lowcore.user_asce, 1, 1);
 		clear_cpu_flag(CIF_ASCE_PRIMARY);
 	}
-	current->thread.mm_segment = old_fs;
 }
 EXPORT_SYMBOL(disable_sacf_uaccess);
 

From b224f6134d72e3493a023b5bea917f9a6beea0c8 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Fri, 24 Nov 2017 16:30:53 +0100
Subject: [PATCH 138/808] nvme: set discard_alignment to zero

Similar to 7c084289795b ("rbd: set discard_alignment to zero"), NVMe
devices are currently incorrectly initialised with the block queue
discard_alignment set to the NVMe stream alignment.

As per Documentation/ABI/testing/sysfs-block:
  The discard_alignment parameter indicates how many bytes the beginning
  of the device is offset from the internal allocation unit's natural
  alignment.

Correcting the discard_alignment parameter to zero has no effect on how
discard requests are propagated through the block layer - @alignment in
__blkdev_issue_discard() remains zero. However, it does fix other
consumers, such as LIO's Block Limits VPD response.

Signed-off-by: David Disseldorp <ddiss@suse.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f837d666cbd4..67f2f94cf86e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1287,7 +1287,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
 			NVME_DSM_MAX_RANGES);
 
-	queue->limits.discard_alignment = size;
+	queue->limits.discard_alignment = 0;
 	queue->limits.discard_granularity = size;
 
 	blk_queue_max_discard_sectors(queue, UINT_MAX);

From 4596e752db02d47038cd7c965419789ab15d1985 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Wed, 29 Nov 2017 15:11:37 -0800
Subject: [PATCH 139/808] nvme-fc: remove double put reference if admin connect
 fails

There are two put references in the failure case of initial
create_association. The first put actually frees the controller, thus the
second put references freed memory.

Remove the unnecessary 2nd put.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0a8af4daef89..794e66e4aa20 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3221,7 +3221,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 		/* initiate nvme ctrl ref counting teardown */
 		nvme_uninit_ctrl(&ctrl->ctrl);
-		nvme_put_ctrl(&ctrl->ctrl);
 
 		/* Remove core ctrl ref. */
 		nvme_put_ctrl(&ctrl->ctrl);

From bd9f5d65769b9fe5e72110d4cbc9097b53b01613 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 6 Dec 2017 18:30:09 +0800
Subject: [PATCH 140/808] nvme: call blk_integrity_unregister after queue is
 cleaned up

During IO complete path, bio_integrity_advance() is often called, and
blk_get_integrity() is called in this function. But in
blk_integrity_unregister, the buffer pointed by queue->integrity
is cleared, and blk_integrity->profile becomes NULL, then blk_get_integrity
returns NULL, and causes kernel oops[1] finally.

This patch fixes this issue by calling blk_integrity_unregister() after
blk_cleanup_queue().

[1] kernel oops log
[  122.068007] BUG: unable to handle kernel NULL pointer dereference at 000000000000000a
[  122.076760] IP: bio_integrity_advance+0x3d/0xf0
[  122.081815] PGD 0 P4D 0
[  122.084641] Oops: 0000 [#1] SMP
[  122.088142] Modules linked in: sunrpc ipmi_ssif intel_rapl vfat fat x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass mei_me ipmi_si crct10dif_pclmul crc32_pclmul sg mei ghash_clmulni_intel mxm_wmi ipmi_devintf iTCO_wdt intel_cstate intel_uncore pcspkr intel_rapl_perf iTCO_vendor_support dcdbas ipmi_msghandler lpc_ich acpi_power_meter shpchp wmi dm_multipath ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel ahci nvme tg3 libahci nvme_core i2c_core libata ptp megaraid_sas pps_core dm_mirror dm_region_hash dm_log dm_mod
[  122.149577] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.14.0-11.el7a.x86_64 #1
[  122.157635] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017
[  122.166179] task: ffff8802ff1e8000 task.stack: ffffc90000130000
[  122.172785] RIP: 0010:bio_integrity_advance+0x3d/0xf0
[  122.178419] RSP: 0018:ffff88047fc03d70 EFLAGS: 00010006
[  122.184248] RAX: ffff880473b08000 RBX: ffff880458c71a80 RCX: ffff880473b08248
[  122.192209] RDX: 0000000000000000 RSI: 000000000000003c RDI: ffffc900038d7ba0
[  122.200171] RBP: ffff88047fc03d78 R08: 0000000000000001 R09: ffffffffa01a78b5
[  122.208132] R10: ffff88047fc1eda0 R11: ffff880458c71ad0 R12: 0000000000007800
[  122.216094] R13: 0000000000000000 R14: 0000000000007800 R15: ffff880473a39b40
[  122.224056] FS:  0000000000000000(0000) GS:ffff88047fc00000(0000) knlGS:0000000000000000
[  122.233083] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  122.239494] CR2: 000000000000000a CR3: 0000000001c09002 CR4: 00000000001606e0
[  122.247455] Call Trace:
[  122.250183]  <IRQ>
[  122.252429]  bio_advance+0x28/0xf0
[  122.256217]  blk_update_request+0xa1/0x310
[  122.260778]  blk_mq_end_request+0x1e/0x70
[  122.265256]  nvme_complete_rq+0x1c/0xd0 [nvme_core]
[  122.270699]  nvme_pci_complete_rq+0x85/0x130 [nvme]
[  122.276140]  __blk_mq_complete_request+0x8d/0x140
[  122.281387]  blk_mq_complete_request+0x16/0x20
[  122.286345]  nvme_process_cq+0xdd/0x1c0 [nvme]
[  122.291301]  nvme_irq+0x23/0x50 [nvme]
[  122.295485]  __handle_irq_event_percpu+0x3c/0x190
[  122.300725]  handle_irq_event_percpu+0x32/0x80
[  122.305683]  handle_irq_event+0x3b/0x60
[  122.309964]  handle_edge_irq+0x8f/0x190
[  122.314247]  handle_irq+0xab/0x120
[  122.318043]  do_IRQ+0x48/0xd0
[  122.321355]  common_interrupt+0x9d/0x9d
[  122.325625]  </IRQ>
[  122.327967] RIP: 0010:cpuidle_enter_state+0xe9/0x280
[  122.333504] RSP: 0018:ffffc90000133e68 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff35
[  122.341952] RAX: ffff88047fc1b900 RBX: ffff88047fc24400 RCX: 000000000000001f
[  122.349913] RDX: 0000000000000000 RSI: fffffcf2e6007295 RDI: 0000000000000000
[  122.357874] RBP: ffffc90000133ea0 R08: 000000000000062e R09: 0000000000000253
[  122.365836] R10: 0000000000000225 R11: 0000000000000018 R12: 0000000000000002
[  122.373797] R13: 0000000000000001 R14: ffff88047fc24400 R15: 0000001c6bd1d263
[  122.381762]  ? cpuidle_enter_state+0xc5/0x280
[  122.386623]  cpuidle_enter+0x17/0x20
[  122.390611]  call_cpuidle+0x23/0x40
[  122.394501]  do_idle+0x17e/0x1f0
[  122.398101]  cpu_startup_entry+0x73/0x80
[  122.402478]  start_secondary+0x178/0x1c0
[  122.406854]  secondary_startup_64+0xa5/0xa5
[  122.411520] Code: 48 8b 5f 68 48 8b 47 08 31 d2 4c 8b 5b 48 48 8b 80 d0 03 00 00 48 83 b8 48 02 00 00 00 48 8d 88 48 02 00 00 48 0f 45 d1 c1 ee 09 <0f> b6 4a 0a 0f b6 52 09 89 f0 48 01 73 08 83 e9 09 d3 e8 0f af
[  122.432604] RIP: bio_integrity_advance+0x3d/0xf0 RSP: ffff88047fc03d70
[  122.439888] CR2: 000000000000000a

Reported-by: Zhang Yi <yizhan@redhat.com>
Tested-by: Zhang Yi <yizhan@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 67f2f94cf86e..2cc6192ef275 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2965,8 +2965,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		return;
 
 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
-		if (blk_get_integrity(ns->disk))
-			blk_integrity_unregister(ns->disk);
 		nvme_mpath_remove_disk_links(ns);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_id_attr_group);
@@ -2974,6 +2972,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 			nvme_nvm_unregister_sysfs(ns);
 		del_gendisk(ns->disk);
 		blk_cleanup_queue(ns->queue);
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
 	}
 
 	mutex_lock(&ns->ctrl->subsys->lock);

From 249159c5f15812140fa216f9997d799ac0023a1f Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 14 Dec 2017 11:20:14 -0700
Subject: [PATCH 141/808] nvme: check hw sectors before setting chunk sectors

Some devices with IDs matching the "stripe" quirk don't actually have
this quirk, and don't have an MDTS value. When MDTS is not set, the
driver sets the max sectors to UINT_MAX, which is not a power of 2,
hitting a BUG_ON from blk_queue_chunk_sectors. This patch skips setting
chunk sectors for such devices.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2cc6192ef275..eab812dd2429 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1705,7 +1705,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 	}
-	if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
+	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+	    is_power_of_2(ctrl->max_hw_sectors))
 		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)

From 654b4a4acd8b52a4272114b95896e9a10d382cde Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 14 Dec 2017 11:20:32 -0700
Subject: [PATCH 142/808] nvme: setup streams after initializing namespace head

Fixes a NULL pointer dereference.

Reported-by: Arnav Dawn <a.dawn@samsung.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index eab812dd2429..1e46e60b8f10 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2870,7 +2870,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 	nvme_set_queue_limits(ctrl, ns->queue);
-	nvme_setup_streams_ns(ctrl, ns);
 
 	id = nvme_identify_ns(ctrl, nsid);
 	if (!id)
@@ -2881,6 +2880,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	if (nvme_init_ns_head(ns, nsid, id, &new))
 		goto out_free_id;
+	nvme_setup_streams_ns(ctrl, ns);
 	
 #ifdef CONFIG_NVME_MULTIPATH
 	/*

From c739f930be1dd5fd949030e3475a884fe06dae9b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:36 -0800
Subject: [PATCH 143/808] x86/espfix/64: Fix espfix double-fault handling on
 5-level systems

Using PGDIR_SHIFT to identify espfix64 addresses on 5-level systems
was wrong, and it resulted in panics due to unhandled double faults.
Use P4D_SHIFT instead, which is correct on 4-level and 5-level
machines.

This fixes a panic when running x86 selftests on 5-level machines.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: 1d33b219563f ("x86/espfix: Add support for 5-level paging")
Link: http://lkml.kernel.org/r/24c898b4f44fdf8c22d93703850fb384ef87cfdc.1513035461.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..c751518936ac 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -355,7 +355,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	 *
 	 * No need for ist_enter here because we don't use RCU.
 	 */
-	if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{

From 6d59b7dbf72ed20d0138e2f9b75ca3d4a9d4faca Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Dec 2017 21:07:23 +0100
Subject: [PATCH 144/808] bpf, s390x: do not reload skb pointers in non-skb
 context

The assumption of unconditionally reloading skb pointers on
BPF helper calls where bpf_helper_changes_pkt_data() holds
true is wrong. There can be different contexts where the
BPF helper would enforce a reload such as in case of XDP.
Here, we do have a struct xdp_buff instead of struct sk_buff
as context, thus this will access garbage.

JITs only ever need to deal with cached skb pointer reload
when ld_abs/ind was seen, therefore guard the reload behind
SEEN_SKB only. Tested on s390x.

Fixes: 9db7f2b81880 ("s390/bpf: recache skb->data/hlen for skb_vlan_push/pop")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/s390/net/bpf_jit_comp.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index e81c16838b90..9557d8b516df 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -55,8 +55,7 @@ struct bpf_jit {
 #define SEEN_LITERAL	8	/* code uses literals */
 #define SEEN_FUNC	16	/* calls C functions */
 #define SEEN_TAIL_CALL	32	/* code uses tail calls */
-#define SEEN_SKB_CHANGE	64	/* code changes skb data */
-#define SEEN_REG_AX	128	/* code uses constant blinding */
+#define SEEN_REG_AX	64	/* code uses constant blinding */
 #define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)
 
 /*
@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
 			EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
 				      REG_15, 152);
 	}
-	if (jit->seen & SEEN_SKB)
+	if (jit->seen & SEEN_SKB) {
 		emit_load_skb_data_hlen(jit);
-	if (jit->seen & SEEN_SKB_CHANGE)
 		/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
 		EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
 			      STK_OFF_SKBP);
+	}
 }
 
 /*
@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT2(0x0d00, REG_14, REG_W1);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
-		if (bpf_helper_changes_pkt_data((void *)func)) {
-			jit->seen |= SEEN_SKB_CHANGE;
+		if ((jit->seen & SEEN_SKB) &&
+		    bpf_helper_changes_pkt_data((void *)func)) {
 			/* lg %b1,ST_OFF_SKBP(%r15) */
 			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
 				      REG_15, STK_OFF_SKBP);

From 87338c8e2cbb317b5f757e6172f94e2e3799cd20 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Dec 2017 21:07:24 +0100
Subject: [PATCH 145/808] bpf, ppc64: do not reload skb pointers in non-skb
 context

The assumption of unconditionally reloading skb pointers on
BPF helper calls where bpf_helper_changes_pkt_data() holds
true is wrong. There can be different contexts where the helper
would enforce a reload such as in case of XDP. Here, we do
have a struct xdp_buff instead of struct sk_buff as context,
thus this will access garbage.

JITs only ever need to deal with cached skb pointer reload
when ld_abs/ind was seen, therefore guard the reload behind
SEEN_SKB.

Fixes: 156d0e290e96 ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/powerpc/net/bpf_jit_comp64.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 46d74e81aff1..d183b4801bdb 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -763,7 +763,8 @@ emit_clear:
 			func = (u8 *) __bpf_call_base + imm;
 
 			/* Save skb pointer if we need to re-cache skb data */
-			if (bpf_helper_changes_pkt_data(func))
+			if ((ctx->seen & SEEN_SKB) &&
+			    bpf_helper_changes_pkt_data(func))
 				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -772,7 +773,8 @@ emit_clear:
 			PPC_MR(b2p[BPF_REG_0], 3);
 
 			/* refresh skb cache */
-			if (bpf_helper_changes_pkt_data(func)) {
+			if ((ctx->seen & SEEN_SKB) &&
+			    bpf_helper_changes_pkt_data(func)) {
 				/* reload skb pointer to r3 */
 				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
 				bpf_jit_emit_skb_loads(image, ctx);

From 04514d13222f2c4c91adf0ecb21004cec3388795 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Dec 2017 21:07:25 +0100
Subject: [PATCH 146/808] bpf: guarantee r1 to be ctx in case of
 bpf_helper_changes_pkt_data

Some JITs don't cache skb context on stack in prologue, so when
LD_ABS/IND is used and helper calls yield bpf_helper_changes_pkt_data()
as true, then they temporarily save/restore skb pointer. However,
the assumption that skb always has to be in r1 is a bit of a
gamble. Right now it turned out to be true for all helpers listed
in bpf_helper_changes_pkt_data(), but lets enforce that from verifier
side, so that we make this a guarantee and bail out if the func
proto is misconfigured in future helpers.

In case of BPF helper calls from cBPF, bpf_helper_changes_pkt_data()
is completely unrelevant here (since cBPF is context read-only) and
therefore always false.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..e39b01317b6f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1674,7 +1674,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		return -EINVAL;
 	}
 
+	/* With LD_ABS/IND some JITs save/restore skb from r1. */
 	changes_data = bpf_helper_changes_pkt_data(fn->func);
+	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+		verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
+			func_id_name(func_id), func_id);
+		return -EINVAL;
+	}
 
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;

From 07aee94394547721ac168cbf4e1c09c14a5fe671 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Dec 2017 21:07:26 +0100
Subject: [PATCH 147/808] bpf, sparc: fix usage of wrong reg for load_skb_regs
 after call

When LD_ABS/IND is used in the program, and we have a BPF helper
call that changes packet data (bpf_helper_changes_pkt_data() returns
true), then in case of sparc JIT, we try to reload cached skb data
from bpf2sparc[BPF_REG_6]. However, there is no such guarantee or
assumption that skb sits in R6 at this point, all helpers changing
skb data only have a guarantee that skb sits in R1. Therefore,
store BPF R1 in L7 temporarily and after procedure call use L7 to
reload cached skb data. skb sitting in R6 is only true at the time
when LD_ABS/IND is executed.

Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/sparc/net/bpf_jit_comp_64.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 5765e7e711f7..ff5f9cb3039a 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		u8 *func = ((u8 *)__bpf_call_base) + imm;
 
 		ctx->saw_call = true;
+		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+			emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
 
 		emit_call((u32 *)func, ctx);
 		emit_nop(ctx);
 
 		emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
 
-		if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
-			load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
+		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+			load_skb_regs(ctx, L7);
 		break;
 	}
 

From 87ab8194303e73af2898e9e1c8b3b9bcfe91e7a9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Dec 2017 21:07:27 +0100
Subject: [PATCH 148/808] bpf: add test case for ld_abs and helper changing pkt
 data

Add a test that i) uses LD_ABS, ii) zeroing R6 before call, iii) calls
a helper that triggers reload of cached skb data, iv) uses LD_ABS again.
It's added for test_bpf in order to do runtime testing after JITing as
well as test_verifier to test that the sequence is allowed.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 lib/test_bpf.c                              | 43 +++++++++++++++++++++
 tools/testing/selftests/bpf/test_verifier.c | 24 ++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index aa8812ae6776..9e9748089270 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -435,6 +435,41 @@ loop:
 	return 0;
 }
 
+static int bpf_fill_ld_abs_vlan_push_pop2(struct bpf_test *self)
+{
+	struct bpf_insn *insn;
+
+	insn = kmalloc_array(16, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	/* Due to func address being non-const, we need to
+	 * assemble this here.
+	 */
+	insn[0] = BPF_MOV64_REG(R6, R1);
+	insn[1] = BPF_LD_ABS(BPF_B, 0);
+	insn[2] = BPF_LD_ABS(BPF_H, 0);
+	insn[3] = BPF_LD_ABS(BPF_W, 0);
+	insn[4] = BPF_MOV64_REG(R7, R6);
+	insn[5] = BPF_MOV64_IMM(R6, 0);
+	insn[6] = BPF_MOV64_REG(R1, R7);
+	insn[7] = BPF_MOV64_IMM(R2, 1);
+	insn[8] = BPF_MOV64_IMM(R3, 2);
+	insn[9] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			       bpf_skb_vlan_push_proto.func - __bpf_call_base);
+	insn[10] = BPF_MOV64_REG(R6, R7);
+	insn[11] = BPF_LD_ABS(BPF_B, 0);
+	insn[12] = BPF_LD_ABS(BPF_H, 0);
+	insn[13] = BPF_LD_ABS(BPF_W, 0);
+	insn[14] = BPF_MOV64_IMM(R0, 42);
+	insn[15] = BPF_EXIT_INSN();
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = 16;
+
+	return 0;
+}
+
 static int bpf_fill_jump_around_ld_abs(struct bpf_test *self)
 {
 	unsigned int len = BPF_MAXINSNS;
@@ -6066,6 +6101,14 @@ static struct bpf_test tests[] = {
 		{},
 		{ {0x1, 0x42 } },
 	},
+	{
+		"LD_ABS with helper changing skb data",
+		{ },
+		INTERNAL,
+		{ 0x34 },
+		{ { ETH_HLEN, 42 } },
+		.fill_helper = bpf_fill_ld_abs_vlan_push_pop2,
+	},
 };
 
 static struct net_device dev;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3c64f30cf63c..b03ecfd7185b 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6116,6 +6116,30 @@ static struct bpf_test tests[] = {
 		},
 		.result = ACCEPT,
 	},
+	{
+		"ld_abs: tests on r6 and skb data reload helper",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_6, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_skb_vlan_push),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 	{
 		"ld_ind: check calling conv, r1",
 		.insns = {

From f57ab9a01a36ef3454333251cc57e3a9948b17bf Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 17 Nov 2017 11:56:41 +0000
Subject: [PATCH 149/808] drivers: base: cacheinfo: fix cache type for
 non-architected system cache

Commit dfea747d2aba ("drivers: base: cacheinfo: support DT overrides for
cache properties") doesn't initialise the cache type if it's present
only in DT and the architecture is not aware of it. They are unified
system level cache which are generally transparent.

This patch check if the cache type is set to NOCACHE but the DT node
indicates that it's unified cache and sets the cache type accordingly.

Fixes: dfea747d2aba ("drivers: base: cacheinfo: support DT overrides for cache properties")
Reported-and-tested-by: Tan Xiaojun <tanxiaojun@huawei.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/cacheinfo.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index eb3af2739537..07532d83be0b 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
 		this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
 }
 
+static bool cache_node_is_unified(struct cacheinfo *this_leaf)
+{
+	return of_property_read_bool(this_leaf->of_node, "cache-unified");
+}
+
 static void cache_of_override_properties(unsigned int cpu)
 {
 	int index;
@@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
 
 	for (index = 0; index < cache_leaves(cpu); index++) {
 		this_leaf = this_cpu_ci->info_list + index;
+		/*
+		 * init_cache_level must setup the cache level correctly
+		 * overriding the architecturally specified levels, so
+		 * if type is NONE at this stage, it should be unified
+		 */
+		if (this_leaf->type == CACHE_TYPE_NOCACHE &&
+		    cache_node_is_unified(this_leaf))
+			this_leaf->type = CACHE_TYPE_UNIFIED;
 		cache_size(this_leaf);
 		cache_get_line_size(this_leaf);
 		cache_nr_sets(this_leaf);

From caea4f384858ee7861367920df36995e7acfe160 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 15 Dec 2017 16:21:50 +0100
Subject: [PATCH 150/808] drm/sun4i: validate modes for HDMI

When I connected my cubieboard running 4.15-rc1 to my 4k display I got no
picture. Some digging found that there is no check against the upper
pixelclock limit of the HDMI output, so X selects a 4kp60 format at 594
MHz, which obviously won't work.

The patch below adds a check for the upper bound of what this hardware can
do, and it checks if the requested tmds clock can be obtained.

It also allows for the +/- 0.5% pixel clock variation that the HDMI spec permits.

That code is based on commit 22d0be2a557e ("drm: arcpgu: Allow some clock
deviation in crtc->mode_valid() callback") from Jose Abreu for drm/arc.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Thanks-to: Jose Abreu <Jose.Abreu@synopsys.com>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Link: https://patchwork.freedesktop.org/patch/msgid/162854cb-c7bd-d9ce-9fa0-9a6cd89c621b@xs4all.nl
---
 drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
index dda904ec0534..c12f9bd12904 100644
--- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
+++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
@@ -208,8 +208,27 @@ static int sun4i_hdmi_get_modes(struct drm_connector *connector)
 	return ret;
 }
 
+static int sun4i_hdmi_mode_valid(struct drm_connector *connector,
+				 struct drm_display_mode *mode)
+{
+	struct sun4i_hdmi *hdmi = drm_connector_to_sun4i_hdmi(connector);
+	long rate = mode->clock * 1000;
+	long diff = rate / 200; /* +-0.5% allowed by HDMI spec */
+	long rounded_rate;
+
+	/* 165 MHz is the typical max pixelclock frequency for HDMI <= 1.2 */
+	if (rate > 165000000)
+		return MODE_CLOCK_HIGH;
+	rounded_rate = clk_round_rate(hdmi->tmds_clk, rate);
+	if (max(rounded_rate, rate) - min(rounded_rate, rate) < diff &&
+	    rounded_rate > 0)
+		return MODE_OK;
+	return MODE_NOCLOCK;
+}
+
 static const struct drm_connector_helper_funcs sun4i_hdmi_connector_helper_funcs = {
 	.get_modes	= sun4i_hdmi_get_modes,
+	.mode_valid	= sun4i_hdmi_mode_valid,
 };
 
 static enum drm_connector_status

From fdf2e821052958a114618a95ab18a300d0b080cb Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 5 Dec 2017 11:51:40 +0100
Subject: [PATCH 151/808] mtd: nand: gpmi: Fix failure when a erased page has a
 bitflip at BBM

When erased subpages are read then the BCH decoder returns STATUS_ERASED
if they are all empty, or STATUS_UNCORRECTABLE if there are bitflips.
When there are bitflips, we have to set these bits again to show the
upper layers a completely erased page. When a bitflip happens in the
exact byte where the bad block marker is, then this byte is swapped
with another byte in block_mark_swapping(). The correction code then
detects a bitflip in another subpage and no longer corrects the bitflip
where it really happens.

Correct this behaviour by calling block_mark_swapping() after the
bitflips have been corrected.

In our case UBIFS failed with this bug because it expects erased
pages to be really empty:

UBIFS error (pid 187): ubifs_scan: corrupt empty space at LEB 36:118735
UBIFS error (pid 187): ubifs_scanned_corruption: corruption at LEB 36:118735
UBIFS error (pid 187): ubifs_scanned_corruption: first 8192 bytes from LEB 36:118735
UBIFS error (pid 187): ubifs_scan: LEB 36 scanning failed
UBIFS error (pid 187): do_commit: commit failed, error -117

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Richard Weinberger <richard@nod.at>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
index 50f8d4a1b983..d4d824ef64e9 100644
--- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
@@ -1067,9 +1067,6 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 		return ret;
 	}
 
-	/* handle the block mark swapping */
-	block_mark_swapping(this, payload_virt, auxiliary_virt);
-
 	/* Loop over status bytes, accumulating ECC status. */
 	status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
 
@@ -1158,6 +1155,9 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 		max_bitflips = max_t(unsigned int, max_bitflips, *status);
 	}
 
+	/* handle the block mark swapping */
+	block_mark_swapping(this, buf, auxiliary_virt);
+
 	if (oob_required) {
 		/*
 		 * It's time to deliver the OOB bytes. See gpmi_ecc_read_oob()

From e44b9a9c135727f3410e029910275f40681dc8bc Mon Sep 17 00:00:00 2001
From: Albert Hsieh <wen.hsieh@broadcom.com>
Date: Mon, 20 Nov 2017 11:26:26 +0800
Subject: [PATCH 152/808] mtd: nand: brcmnand: Zero bitflip is not an error

A negative return value of brcmstb_nand_verify_erased_page() indicates a
real bitflip error of an erased page, and other return values (>= 0) show
the corrected bitflip number. Zero return value means no bitflip, but the
current driver code treats it as an error, and eventually leads to
falsely reported ECC error.

Fixes: 02b88eea9f9c ("mtd: brcmnand: Add check for erased page bitflip")
Signed-off-by: Albert Hsieh <wen.hsieh@broadcom.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/brcmnand/brcmnand.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c
index e0eb51d8c012..dd56a671ea42 100644
--- a/drivers/mtd/nand/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/brcmnand/brcmnand.c
@@ -1763,7 +1763,7 @@ try_dmaread:
 			err = brcmstb_nand_verify_erased_page(mtd, chip, buf,
 							      addr);
 			/* erased page bitflips corrected */
-			if (err > 0)
+			if (err >= 0)
 				return err;
 		}
 

From bc2fd1b11097ad981478abcc0328784ea131ac29 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 6 Dec 2017 18:27:14 +0100
Subject: [PATCH 153/808] mtd: nand: gpio: Fix ALE gpio configuration

Fixes a copy/paste error in commit f3d0d8d938b4d ("mtd: nand: gpio:
Convert to use GPIO descriptors") which breaks gpio-nand driver

Fixes: f3d0d8d938b4d ("mtd: nand: gpio: Convert to use GPIO descriptors")
Cc: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Richard Weinberger <richard@nod.at>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/gpio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c
index 484f7fbc3f7d..a8bde6665c24 100644
--- a/drivers/mtd/nand/gpio.c
+++ b/drivers/mtd/nand/gpio.c
@@ -253,9 +253,9 @@ static int gpio_nand_probe(struct platform_device *pdev)
 		goto out_ce;
 	}
 
-	gpiomtd->nwp = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW);
-	if (IS_ERR(gpiomtd->nwp)) {
-		ret = PTR_ERR(gpiomtd->nwp);
+	gpiomtd->ale = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW);
+	if (IS_ERR(gpiomtd->ale)) {
+		ret = PTR_ERR(gpiomtd->ale);
 		goto out_ce;
 	}
 

From b2162117171864ef48d43cf5d888f3e8012c6c06 Mon Sep 17 00:00:00 2001
From: Bhawanpreet Lakha <Bhawanpreet.Lakha@amd.com>
Date: Fri, 24 Nov 2017 17:26:28 -0500
Subject: [PATCH 154/808] drm/amd/display: add pipe locking before front end
 programing

Add pipe locking/unlocking before we program the front end

Signed-off-by: Bhawanpreet Lakha <Bhawanpreet.Lakha@amd.com>
Reviewed-by: Harry Wentland <Harry.Wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../display/dc/dce110/dce110_hw_sequencer.c   | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
index 07ff8d2faf3f..d844fadcd56f 100644
--- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
@@ -2866,16 +2866,19 @@ static void dce110_apply_ctx_for_surface(
 		int num_planes,
 		struct dc_state *context)
 {
-	int i, be_idx;
+	int i;
 
 	if (num_planes == 0)
 		return;
 
-	be_idx = -1;
 	for (i = 0; i < dc->res_pool->pipe_count; i++) {
-		if (stream == context->res_ctx.pipe_ctx[i].stream) {
-			be_idx = context->res_ctx.pipe_ctx[i].stream_res.tg->inst;
-			break;
+		struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i];
+		struct pipe_ctx *old_pipe_ctx = &dc->current_state->res_ctx.pipe_ctx[i];
+
+		if (stream == pipe_ctx->stream) {
+			if (!pipe_ctx->top_pipe &&
+				(pipe_ctx->plane_state || old_pipe_ctx->plane_state))
+				dc->hwss.pipe_control_lock(dc, pipe_ctx, true);
 		}
 	}
 
@@ -2895,9 +2898,22 @@ static void dce110_apply_ctx_for_surface(
 					context->stream_count);
 
 		dce110_program_front_end_for_pipe(dc, pipe_ctx);
+
+		dc->hwss.update_plane_addr(dc, pipe_ctx);
+
 		program_surface_visibility(dc, pipe_ctx);
 
 	}
+
+	for (i = 0; i < dc->res_pool->pipe_count; i++) {
+		struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i];
+		struct pipe_ctx *old_pipe_ctx = &dc->current_state->res_ctx.pipe_ctx[i];
+
+		if ((stream == pipe_ctx->stream) &&
+			(!pipe_ctx->top_pipe) &&
+			(pipe_ctx->plane_state || old_pipe_ctx->plane_state))
+			dc->hwss.pipe_control_lock(dc, pipe_ctx, false);
+	}
 }
 
 static void dce110_power_down_fe(struct dc *dc, int fe_idx)

From 56a9b95c4d3386a98f69f641dd6018886ed2e9d6 Mon Sep 17 00:00:00 2001
From: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com>
Date: Mon, 13 Nov 2017 17:03:53 -0500
Subject: [PATCH 155/808] drm/amd/display: set chroma taps to 1 when not
 scaling

Signed-off-by: Dmytro Laktyushkin <Dmytro.Laktyushkin@amd.com>
Reviewed-by: Tony Cheng <Tony.Cheng@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c | 9 +++++++++
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c | 9 ++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
index 3dce35e66b09..b142629a1058 100644
--- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
+++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
@@ -900,6 +900,15 @@ bool dcn_validate_bandwidth(
 			v->override_vta_ps[input_idx] = pipe->plane_res.scl_data.taps.v_taps;
 			v->override_hta_pschroma[input_idx] = pipe->plane_res.scl_data.taps.h_taps_c;
 			v->override_vta_pschroma[input_idx] = pipe->plane_res.scl_data.taps.v_taps_c;
+			/*
+			 * Spreadsheet doesn't handle taps_c is one properly,
+			 * need to force Chroma to always be scaled to pass
+			 * bandwidth validation.
+			 */
+			if (v->override_hta_pschroma[input_idx] == 1)
+				v->override_hta_pschroma[input_idx] = 2;
+			if (v->override_vta_pschroma[input_idx] == 1)
+				v->override_vta_pschroma[input_idx] = 2;
 			v->source_scan[input_idx] = (pipe->plane_state->rotation % 2) ? dcn_bw_vert : dcn_bw_hor;
 		}
 		if (v->is_line_buffer_bpp_fixed == dcn_bw_yes)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
index 74e7c82bdc76..a9d55d0dd69e 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
@@ -159,11 +159,10 @@ bool dpp_get_optimal_number_of_taps(
 			scl_data->taps.h_taps = 1;
 		if (IDENTITY_RATIO(scl_data->ratios.vert))
 			scl_data->taps.v_taps = 1;
-		/*
-		 * Spreadsheet doesn't handle taps_c is one properly,
-		 * need to force Chroma to always be scaled to pass
-		 * bandwidth validation.
-		 */
+		if (IDENTITY_RATIO(scl_data->ratios.horz_c))
+			scl_data->taps.h_taps_c = 1;
+		if (IDENTITY_RATIO(scl_data->ratios.vert_c))
+			scl_data->taps.v_taps_c = 1;
 	}
 
 	return true;

From 78288503199d0a33b69b972a44a4cf15df989899 Mon Sep 17 00:00:00 2001
From: Eric Yang <Eric.Yang2@amd.com>
Date: Fri, 10 Nov 2017 10:44:24 -0500
Subject: [PATCH 156/808] drm/amd/display: fix missing pixel clock adjustment
 for dongle

Signed-off-by: Eric Yang <Eric.Yang2@amd.com>
Reviewed-by: Tony Cheng <Tony.Cheng@amd.com>
Reviewed-by: Andrew Jiang <Andrew.Jiang@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc_link.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
index e27ed4a45265..42a111b9505d 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
@@ -1801,7 +1801,7 @@ static void disable_link(struct dc_link *link, enum signal_type signal)
 		link->link_enc->funcs->disable_output(link->link_enc, signal, link);
 }
 
-bool dp_active_dongle_validate_timing(
+static bool dp_active_dongle_validate_timing(
 		const struct dc_crtc_timing *timing,
 		const struct dc_dongle_caps *dongle_caps)
 {
@@ -1833,6 +1833,8 @@ bool dp_active_dongle_validate_timing(
 	/* Check Color Depth and Pixel Clock */
 	if (timing->pixel_encoding == PIXEL_ENCODING_YCBCR420)
 		required_pix_clk /= 2;
+	else if (timing->pixel_encoding == PIXEL_ENCODING_YCBCR422)
+		required_pix_clk = required_pix_clk * 2 / 3;
 
 	switch (timing->display_color_depth) {
 	case COLOR_DEPTH_666:

From becd0875f4393a992afbf57aa323f7bf1a71c3ff Mon Sep 17 00:00:00 2001
From: "Jerry (Fangzhi) Zuo" <Jerry.Zuo@amd.com>
Date: Fri, 1 Dec 2017 13:26:05 -0500
Subject: [PATCH 157/808] drm/amd/display: Fix rehook MST display not light
 back on

Original applied dm_restore_drm_connector_state() has got removed.
Set link status to BAD before hotplug() event could trigger
another modeset from userspace.

The fix "Fix MST daisy chain SST not light up" commit makes so it is trying
to create a stream prior to dc_sink. That makes dc_sink is not present in
create_stream_for_sink().

Signed-off-by: Jerry (Fangzhi) Zuo <Jerry.Zuo@amd.com>
Reviewed-by: Roman Li <Roman.Li@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 +++--
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  2 +
 .../display/amdgpu_dm/amdgpu_dm_mst_types.c   | 51 +++++++++++++++++++
 .../display/amdgpu_dm/amdgpu_dm_mst_types.h   |  1 +
 4 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f71fe6d2ddda..bb5fa895fb64 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2336,7 +2336,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
 		       const struct dm_connector_state *dm_state)
 {
 	struct drm_display_mode *preferred_mode = NULL;
-	const struct drm_connector *drm_connector;
+	struct drm_connector *drm_connector;
 	struct dc_stream_state *stream = NULL;
 	struct drm_display_mode mode = *drm_mode;
 	bool native_mode_found = false;
@@ -2355,11 +2355,13 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
 
 	if (!aconnector->dc_sink) {
 		/*
-		 * Exclude MST from creating fake_sink
-		 * TODO: need to enable MST into fake_sink feature
+		 * Create dc_sink when necessary to MST
+		 * Don't apply fake_sink to MST
 		 */
-		if (aconnector->mst_port)
-			goto stream_create_fail;
+		if (aconnector->mst_port) {
+			dm_dp_mst_dc_sink_create(drm_connector);
+			goto mst_dc_sink_create_done;
+		}
 
 		if (create_fake_sink(aconnector))
 			goto stream_create_fail;
@@ -2410,6 +2412,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
 stream_create_fail:
 dm_state_null:
 drm_connector_null:
+mst_dc_sink_create_done:
 	return stream;
 }
 
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 117521c6a6ed..0230250a1164 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -189,6 +189,8 @@ struct amdgpu_dm_connector {
 	struct mutex hpd_lock;
 
 	bool fake_enable;
+
+	bool mst_connected;
 };
 
 #define to_amdgpu_dm_connector(x) container_of(x, struct amdgpu_dm_connector, base)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index f8efb98b1fa7..638c2c2b5cd7 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -185,6 +185,42 @@ static int dm_connector_update_modes(struct drm_connector *connector,
 	return ret;
 }
 
+void dm_dp_mst_dc_sink_create(struct drm_connector *connector)
+{
+	struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
+	struct edid *edid;
+	struct dc_sink *dc_sink;
+	struct dc_sink_init_data init_params = {
+			.link = aconnector->dc_link,
+			.sink_signal = SIGNAL_TYPE_DISPLAY_PORT_MST };
+
+	edid = drm_dp_mst_get_edid(connector, &aconnector->mst_port->mst_mgr, aconnector->port);
+
+	if (!edid) {
+		drm_mode_connector_update_edid_property(
+			&aconnector->base,
+			NULL);
+		return;
+	}
+
+	aconnector->edid = edid;
+
+	dc_sink = dc_link_add_remote_sink(
+		aconnector->dc_link,
+		(uint8_t *)aconnector->edid,
+		(aconnector->edid->extensions + 1) * EDID_LENGTH,
+		&init_params);
+
+	dc_sink->priv = aconnector;
+	aconnector->dc_sink = dc_sink;
+
+	amdgpu_dm_add_sink_to_freesync_module(
+			connector, aconnector->edid);
+
+	drm_mode_connector_update_edid_property(
+					&aconnector->base, aconnector->edid);
+}
+
 static int dm_dp_mst_get_modes(struct drm_connector *connector)
 {
 	struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
@@ -311,6 +347,7 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
 			drm_mode_connector_set_path_property(connector, pathprop);
 
 			drm_connector_list_iter_end(&conn_iter);
+			aconnector->mst_connected = true;
 			return &aconnector->base;
 		}
 	}
@@ -363,6 +400,8 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
 	 */
 	amdgpu_dm_connector_funcs_reset(connector);
 
+	aconnector->mst_connected = true;
+
 	DRM_INFO("DM_MST: added connector: %p [id: %d] [master: %p]\n",
 			aconnector, connector->base.id, aconnector->mst_port);
 
@@ -394,6 +433,8 @@ static void dm_dp_destroy_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
 	drm_mode_connector_update_edid_property(
 			&aconnector->base,
 			NULL);
+
+	aconnector->mst_connected = false;
 }
 
 static void dm_dp_mst_hotplug(struct drm_dp_mst_topology_mgr *mgr)
@@ -404,10 +445,18 @@ static void dm_dp_mst_hotplug(struct drm_dp_mst_topology_mgr *mgr)
 	drm_kms_helper_hotplug_event(dev);
 }
 
+static void dm_dp_mst_link_status_reset(struct drm_connector *connector)
+{
+	mutex_lock(&connector->dev->mode_config.mutex);
+	drm_mode_connector_set_link_status_property(connector, DRM_MODE_LINK_STATUS_BAD);
+	mutex_unlock(&connector->dev->mode_config.mutex);
+}
+
 static void dm_dp_mst_register_connector(struct drm_connector *connector)
 {
 	struct drm_device *dev = connector->dev;
 	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
 
 	if (adev->mode_info.rfbdev)
 		drm_fb_helper_add_one_connector(&adev->mode_info.rfbdev->helper, connector);
@@ -416,6 +465,8 @@ static void dm_dp_mst_register_connector(struct drm_connector *connector)
 
 	drm_connector_register(connector);
 
+	if (aconnector->mst_connected)
+		dm_dp_mst_link_status_reset(connector);
 }
 
 static const struct drm_dp_mst_topology_cbs dm_mst_cbs = {
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h
index 2da851b40042..8cf51da26657 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h
@@ -31,5 +31,6 @@ struct amdgpu_dm_connector;
 
 void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm,
 				       struct amdgpu_dm_connector *aconnector);
+void dm_dp_mst_dc_sink_create(struct drm_connector *connector);
 
 #endif

From 5f0e3fe6b1504d4e6530294ec87c473aa6d2d02f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Tue, 14 Nov 2017 09:10:11 -0500
Subject: [PATCH 158/808] x86/build: Make isoimage work on Debian

Debian does not ship a 'mkisofs' symlink to genisoimage.  All modern
distros ship genisoimage, so just use that directly.  That requires
renaming the 'genisoimage' function.  Also neaten up the 'for' loop
while I'm in here.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Changbin Du <changbin.du@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/genimage.sh | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index c9e8499fbfe7..6a10d52a4145 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -80,39 +80,43 @@ genfdimage288() {
 	mcopy $FBZIMAGE w:linux
 }
 
-genisoimage() {
+geniso() {
 	tmp_dir=`dirname $FIMAGE`/isoimage
 	rm -rf $tmp_dir
 	mkdir $tmp_dir
-	for i in lib lib64 share end ; do
+	for i in lib lib64 share ; do
 		for j in syslinux ISOLINUX ; do
 			if [ -f /usr/$i/$j/isolinux.bin ] ; then
 				isolinux=/usr/$i/$j/isolinux.bin
-				cp $isolinux $tmp_dir
 			fi
 		done
 		for j in syslinux syslinux/modules/bios ; do
 			if [ -f /usr/$i/$j/ldlinux.c32 ]; then
 				ldlinux=/usr/$i/$j/ldlinux.c32
-				cp $ldlinux $tmp_dir
 			fi
 		done
 		if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
 			break
 		fi
-		if [ $i = end -a -z "$isolinux" ] ; then
-			echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
-			exit 1
-		fi
 	done
+	if [ -z "$isolinux" ] ; then
+		echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	if [ -z "$ldlinux" ] ; then
+		echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	cp $isolinux $tmp_dir
+	cp $ldlinux $tmp_dir
 	cp $FBZIMAGE $tmp_dir/linux
 	echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
 	if [ -f "$FDINITRD" ] ; then
 		cp "$FDINITRD" $tmp_dir/initrd.img
 	fi
-	mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
-		-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
-		$tmp_dir
+	genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+		-b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+		-boot-info-table $tmp_dir
 	isohybrid $FIMAGE 2>/dev/null || true
 	rm -rf $tmp_dir
 }
@@ -121,6 +125,6 @@ case $1 in
 	bzdisk)     genbzdisk;;
 	fdimage144) genfdimage144;;
 	fdimage288) genfdimage288;;
-	isoimage)   genisoimage;;
+	isoimage)   geniso;;
 	*)          echo 'Unknown image format'; exit 1;
 esac

From cce1fea50e3be6b78fc677e8cf20cd0ca4c851b0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:03 +0300
Subject: [PATCH 159/808] thunderbolt: Make pathname to force_power shorter

WMI is the bus inside kernel, so, we may access the GUID via
/sys/bus/wmi instead of doing this through /sys/devices path.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mario Limonciello <mario.limonciello@dell.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/thunderbolt.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index de50a8561774..9b55952039a6 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
 a sysfs attribute called "force_power".
 
 For example the intel-wmi-thunderbolt driver exposes this attribute in:
-  /sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+  /sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
 
   To force the power to on, write 1 to this attribute file.
   To disable force power, write 0 to this attribute file.

From 78dfa29c84bab548910490cf7508c53ad99d1d9e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:04 +0300
Subject: [PATCH 160/808] MAINTAINERS: Add thunderbolt.rst to the Thunderbolt
 driver entry

Make sure Thunderbolt maintainers get to see patches that touch
documentation of the Thunderbolt driver as well.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f..5da966e19e8a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13492,6 +13492,7 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
 S:	Maintained
+F:	Documentation/admin-guide/thunderbolt.rst
 F:	drivers/thunderbolt/
 F:	include/linux/thunderbolt.h
 

From 74657181e7c449351d1ad28cf43941bc333e1bd6 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 1 Dec 2017 15:08:05 +0300
Subject: [PATCH 161/808] thunderbolt: Mask ring interrupt properly when
 polling starts

When ring enters polling mode we are expected to mask the ring interrupt
before the callback is called. However, the current code actually
unmasks it probably because of a copy-paste mistake.

Mask the interrupt properly from now on.

Fixes: 4ffe722eefcb ("thunderbolt: Add polling mode for rings")
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/thunderbolt/nhi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 419a7a90bce0..f45bcbc63738 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -339,7 +339,7 @@ static void __ring_interrupt(struct tb_ring *ring)
 		return;
 
 	if (ring->start_poll) {
-		__ring_interrupt_mask(ring, false);
+		__ring_interrupt_mask(ring, true);
 		ring->start_poll(ring->poll_data);
 	} else {
 		schedule_work(&ring->work);

From 588753f1eb18978512b1c9b85fddb457d46f9033 Mon Sep 17 00:00:00 2001
From: Brendan McGrath <redmcg@redmandi.dyndns.org>
Date: Wed, 13 Dec 2017 22:14:57 +1100
Subject: [PATCH 162/808] ipv6: icmp6: Allow icmp messages to be looped back

One example of when an ICMPv6 packet is required to be looped back is
when a host acts as both a Multicast Listener and a Multicast Router.

A Multicast Router will listen on address ff02::16 for MLDv2 messages.

Currently, MLDv2 messages originating from a Multicast Listener running
on the same host as the Multicast Router are not being delivered to the
Multicast Router. This is due to dst.input being assigned the default
value of dst_discard.

This results in the packet being looped back but discarded before being
delivered to the Multicast Router.

This patch sets dst.input to ip6_input to ensure a looped back packet
is delivered to the Multicast Router.

Signed-off-by: Brendan McGrath <redmcg@redmandi.dyndns.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7a8d1500d374..2bc91c349273 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2336,6 +2336,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	}
 
 	rt->dst.flags |= DST_HOST;
+	rt->dst.input = ip6_input;
 	rt->dst.output  = ip6_output;
 	rt->rt6i_gateway  = fl6->daddr;
 	rt->rt6i_dst.addr = fl6->daddr;

From f870c1ff65a6d1f3a083f277280802ee09a5b44d Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 14 Dec 2017 20:20:00 +0300
Subject: [PATCH 163/808] vxlan: restore dev->mtu setting based on lower device

Stefano Brivio says:
    Commit a985343ba906 ("vxlan: refactor verification and
    application of configuration") introduced a change in the
    behaviour of initial MTU setting: earlier, the MTU for a link
    created on top of a given lower device, without an initial MTU
    specification, was set to the MTU of the lower device minus
    headroom as a result of this path in vxlan_dev_configure():

	if (!conf->mtu)
		dev->mtu = lowerdev->mtu -
			   (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);

    which is now gone. Now, the initial MTU, in absence of a
    configured value, is simply set by ether_setup() to ETH_DATA_LEN
    (1500 bytes).

    This breaks userspace expectations in case the MTU of
    the lower device is higher than 1500 bytes minus headroom.

This patch restores the previous behaviour on newlink operation. Since
max_mtu can be negative and we update dev->mtu directly, also check it
for valid minimum.

Reported-by: Junhan Yan <juyan@redhat.com>
Fixes: a985343ba906 ("vxlan: refactor verification and application of configuration")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Acked-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 19b9cc51079e..1000b0e4ee01 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3103,6 +3103,11 @@ static void vxlan_config_apply(struct net_device *dev,
 
 		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
 					   VXLAN_HEADROOM);
+		if (max_mtu < ETH_MIN_MTU)
+			max_mtu = ETH_MIN_MTU;
+
+		if (!changelink && !conf->mtu)
+			dev->mtu = max_mtu;
 	}
 
 	if (dev->mtu > max_mtu)

From e17f8234538d1ff708673f287a42457c4dee720d Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Mon, 4 Dec 2017 15:07:07 +0100
Subject: [PATCH 164/808] x86/entry/64/paravirt: Use paravirt-safe macro to
 access eflags

Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them
NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that acceses eflags
using 'pushfq' instruction when testing for IF bit. On PV Xen guests
looking at IF flag directly will always see it set, resulting in 'ud2'.

Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when
running paravirt.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: xen-devel@lists.xenproject.org
Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S        | 7 ++++---
 arch/x86/include/asm/irqflags.h  | 3 +++
 arch/x86/include/asm/paravirt.h  | 9 +++++++++
 arch/x86/kernel/asm-offsets_64.c | 3 +++
 4 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a2b30ec69497..32306788821c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -462,12 +462,13 @@ END(irq_entries_start)
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-	pushfq
-	testl $X86_EFLAGS_IF, (%rsp)
+	pushq %rax
+	SAVE_FLAGS(CLBR_RAX)
+	testl $X86_EFLAGS_IF, %eax
 	jz .Lokay_\@
 	ud2
 .Lokay_\@:
-	addq $8, %rsp
+	popq %rax
 #endif
 .endm
 
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c8ef23f2c28f..89f08955fff7 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
 	swapgs;					\
 	sysretl
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)		pushfq; popq %rax
+#endif
 #else
 #define INTERRUPT_RETURN		iret
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 283efcaac8af..892df375b615 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -927,6 +927,15 @@ extern void default_banner(void);
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 630212fa9b9d..e3a5175a444b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,6 +23,9 @@ int main(void)
 #ifdef CONFIG_PARAVIRT
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+	OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
 	BLANK();
 #endif
 

From d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:08 +0100
Subject: [PATCH 165/808] x86/unwinder/orc: Dont bail on stack overflow

If the stack overflows into a guard page and the ORC unwinder should work
well: by construction, there can't be any meaningful data in the guard page
because no writes to the guard page will have succeeded.

But there is a bug that prevents unwinding from working correctly: if the
starting register state has RSP pointing into a stack guard page, the ORC
unwinder bails out immediately.

Instead of bailing out immediately check whether the next page up is a
valid check page and if so analyze that. As a result the ORC unwinder will
start the unwind.

Tested by intentionally overflowing the task stack.  The result is an
accurate call trace instead of a trace consisting purely of '?' entries.

There are a few other bugs that are triggered if the unwinder encounters a
stack overflow after the first step, but they are outside the scope of this
fix.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_orc.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a3f973b2c97a..ff8e1132b2ae 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -553,8 +553,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 	}
 
 	if (get_stack_info((unsigned long *)state->sp, state->task,
-			   &state->stack_info, &state->stack_mask))
-		return;
+			   &state->stack_info, &state->stack_mask)) {
+		/*
+		 * We weren't on a valid stack.  It's possible that
+		 * we overflowed a valid stack into a guard page.
+		 * See if the next page up is valid so that we can
+		 * generate some kind of backtrace if this happens.
+		 */
+		void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+		if (get_stack_info(next_page, state->task, &state->stack_info,
+				   &state->stack_mask))
+			return;
+	}
 
 	/*
 	 * The caller can provide the address of the first frame directly

From b02fcf9ba1211097754b286043cd87a8b4907e75 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 4 Dec 2017 15:07:09 +0100
Subject: [PATCH 166/808] x86/unwinder: Handle stack overflows more gracefully

There are at least two unwinder bugs hindering the debugging of
stack-overflow crashes:

- It doesn't deal gracefully with the case where the stack overflows and
  the stack pointer itself isn't on a valid stack but the
  to-be-dereferenced data *is*.

- The ORC oops dump code doesn't know how to print partial pt_regs, for the
  case where if we get an interrupt/exception in *early* entry code
  before the full pt_regs have been saved.

Fix both issues.

http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/kdebug.h |  1 +
 arch/x86/include/asm/unwind.h |  7 ++++
 arch/x86/kernel/dumpstack.c   | 32 ++++++++++++---
 arch/x86/kernel/process_64.c  | 11 +++---
 arch/x86/kernel/unwind_orc.c  | 74 ++++++++++++-----------------------
 5 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f86a8caa561e..395c9631e000 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e9cc6fe1fc6f..c1688c2d0a12 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -7,6 +7,9 @@
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
 struct unwind_state {
 	struct stack_info stack_info;
 	unsigned long stack_mask;
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 }
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
+ * only the iret frame registers are accessible.  Use with caution!
+ */
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f13b4c00a5de..0bc95be5c638 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable,
 	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+void show_iret_regs(struct pt_regs *regs)
+{
+	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+		regs->sp, regs->flags);
+}
+
+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+{
+	if (on_stack(info, regs, sizeof(*regs)))
+		__show_regs(regs, 0);
+	else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+			  IRET_FRAME_SIZE)) {
+		/*
+		 * When an interrupt or exception occurs in entry code, the
+		 * full pt_regs might not have been saved yet.  In that case
+		 * just print the iret frame.
+		 */
+		show_iret_regs(regs);
+	}
+}
+
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			unsigned long *stack, char *log_lvl)
 {
@@ -94,8 +116,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack_name)
 			printk("%s <%s>\n", log_lvl, stack_name);
 
-		if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-			__show_regs(regs, 0);
+		if (regs)
+			show_regs_safe(&stack_info, regs);
 
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
@@ -119,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 			/*
 			 * Don't print regs->ip again if it was already printed
-			 * by __show_regs() below.
+			 * by show_regs_safe() below.
 			 */
 			if (regs && stack == &regs->ip)
 				goto next;
@@ -155,8 +177,8 @@ next:
 
 			/* if the frame has entry regs, print them */
 			regs = unwind_get_entry_regs(&state);
-			if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-				__show_regs(regs, 0);
+			if (regs)
+				show_regs_safe(&stack_info, regs);
 		}
 
 		if (stack_name)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eeeb34f85c25..01b119bebb68 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
 	unsigned int fsindex, gsindex;
 	unsigned int ds, cs, es;
 
-	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
-	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
-		regs->sp, regs->flags);
+	show_iret_regs(regs);
+
 	if (regs->orig_ax != -1)
 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
 	else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
 	       regs->r13, regs->r14, regs->r15);
 
+	if (!all)
+		return;
+
 	asm("movl %%ds,%0" : "=r" (ds));
 	asm("movl %%cs,%0" : "=r" (cs));
 	asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
 	rdmsrl(MSR_GS_BASE, gs);
 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 
-	if (!all)
-		return;
-
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = __read_cr3();
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index ff8e1132b2ae..be86a865087a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
 	return NULL;
 }
 
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
 			    size_t len)
 {
 	struct stack_info *info = &state->stack_info;
+	void *addr = (void *)_addr;
 
-	/*
-	 * If the address isn't on the current stack, switch to the next one.
-	 *
-	 * We may have to traverse multiple stacks to deal with the possibility
-	 * that info->next_sp could point to an empty stack and the address
-	 * could be on a subsequent stack.
-	 */
-	while (!on_stack(info, (void *)addr, len))
-		if (get_stack_info(info->next_sp, state->task, info,
-				   &state->stack_mask))
-			return false;
+	if (!on_stack(info, addr, len) &&
+	    (get_stack_info(addr, state->task, info, &state->stack_mask)))
+		return false;
 
 	return true;
 }
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
 	return true;
 }
 
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
 static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
-			     unsigned long *ip, unsigned long *sp, bool full)
+			     unsigned long *ip, unsigned long *sp)
 {
-	size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
-	size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
-	struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+	struct pt_regs *regs = (struct pt_regs *)addr;
 
-	if (IS_ENABLED(CONFIG_X86_64)) {
-		if (!stack_access_ok(state, addr, regs_size))
-			return false;
+	/* x86-32 support will be more complicated due to the &regs->sp hack */
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
 
-		*ip = regs->ip;
-		*sp = regs->sp;
-
-		return true;
-	}
-
-	if (!stack_access_ok(state, addr, sp_offset))
+	if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
 		return false;
 
 	*ip = regs->ip;
+	*sp = regs->sp;
+	return true;
+}
 
-	if (user_mode(regs)) {
-		if (!stack_access_ok(state, addr + sp_offset,
-				     REGS_SIZE - SP_OFFSET))
-			return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+				  unsigned long *ip, unsigned long *sp)
+{
+	struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
 
-		*sp = regs->sp;
-	} else
-		*sp = (unsigned long)&regs->sp;
+	if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+		return false;
 
+	*ip = regs->ip;
+	*sp = regs->sp;
 	return true;
 }
 
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
 	unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
 	enum stack_type prev_type = state->stack_info.type;
 	struct orc_entry *orc;
-	struct pt_regs *ptregs;
 	bool indirect = false;
 
 	if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
 		break;
 
 	case ORC_TYPE_REGS:
-		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
 			orc_warn("can't dereference registers at %p for ip %pB\n",
 				 (void *)sp, (void *)orig_ip);
 			goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
 		break;
 
 	case ORC_TYPE_REGS_IRET:
-		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+		if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
 			orc_warn("can't dereference iret registers at %p for ip %pB\n",
 				 (void *)sp, (void *)orig_ip);
 			goto done;
 		}
 
-		ptregs = container_of((void *)sp, struct pt_regs, ip);
-		if ((unsigned long)ptregs >= prev_sp &&
-		    on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
-			state->regs = ptregs;
-			state->full_regs = false;
-		} else
-			state->regs = NULL;
-
+		state->regs = (void *)sp - IRET_FRAME_OFFSET;
+		state->full_regs = false;
 		state->signal = true;
 		break;
 

From 6669a692605547892a026445e460bf233958bd7f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:10 +0100
Subject: [PATCH 167/808] x86/irq: Remove an old outdated comment about context
 tracking races

That race has been fixed and code cleaned up for a while now.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/irq.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 52089c043160..aa9d51eea9d0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
 
-	/*
-	 * NB: Unlike exception entries, IRQ entries do not reliably
-	 * handle context tracking in the low-level entry code.  This is
-	 * because syscall entries execute briefly with IRQs on before
-	 * updating context tracking state, so we can take an IRQ from
-	 * kernel mode with CONTEXT_USER.  The low-level entry code only
-	 * updates the context if we came from user mode, so we won't
-	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
-	 * code is cleaned up enough that we can cleanly defer enabling
-	 * IRQs.
-	 */
-
 	entering_irq();
 
 	/* entering_irq() tells RCU that we're not quiescent.  Check it. */

From 4f3789e792296e21405f708cf3cb409d7c7d5683 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:11 +0100
Subject: [PATCH 168/808] x86/irq/64: Print the offending IP in the stack
 overflow warning

In case something goes wrong with unwind (not unlikely in case of
overflow), print the offending IP where we detected the overflow.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/irq_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 020efbf5786b..d86e344f5b3d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
 		return;
 
-	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
 		current->comm, curbase, regs->sp,
 		irq_stack_top, irq_stack_bottom,
-		estack_top, estack_bottom);
+		estack_top, estack_bottom, (void *)regs->ip);
 
 	if (sysctl_panic_on_stackoverflow)
 		panic("low stack detected by irq handler - check messages\n");

From 1a79797b58cddfa948420a7553241c79c013e3ca Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:12 +0100
Subject: [PATCH 169/808] x86/entry/64: Allocate and enable the SYSENTER stack

This will simplify future changes that want scratch variables early in
the SYSENTER handler -- they'll be able to spill registers to the
stack.  It also lets us get rid of a SWAPGS_UNSAFE_STACK user.

This does not depend on CONFIG_IA32_EMULATION=y because we'll want the
stack space even without IA32 emulation.

As far as I can tell, the reason that this wasn't done from day 1 is
that we use IST for #DB and #BP, which is IMO rather nasty and causes
a lot more problems than it solves.  But, since #DB uses IST, we don't
actually need a real stack for SYSENTER (because SYSENTER with TF set
will invoke #DB on the IST stack rather than the SYSENTER stack).

I want to remove IST usage from these vectors some day, and this patch
is a prerequisite for that as well.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64_compat.S | 2 +-
 arch/x86/include/asm/processor.h | 3 ---
 arch/x86/kernel/asm-offsets.c    | 5 +++++
 arch/x86/kernel/asm-offsets_32.c | 5 -----
 arch/x86/kernel/cpu/common.c     | 4 +++-
 arch/x86/kernel/process.c        | 2 --
 arch/x86/kernel/traps.c          | 3 +--
 7 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 568e130d932c..dcc6987f9bae 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,7 +48,7 @@
  */
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	SWAPGS
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2db7cf720b04..789dad5da20f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -339,14 +339,11 @@ struct tss_struct {
 	 */
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
 
-#ifdef CONFIG_X86_32
 	/*
 	 * Space for the temporary SYSENTER stack.
 	 */
 	unsigned long		SYSENTER_stack_canary;
 	unsigned long		SYSENTER_stack[64];
-#endif
-
 } ____cacheline_aligned;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 8ea78275480d..b275863128eb 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -93,4 +93,9 @@ void common(void) {
 
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+	/* Offset from cpu_tss to SYSENTER_stack */
+	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+	/* Size of SYSENTER_stack */
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dedf428b20b6..52ce4ea16e53 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -50,11 +50,6 @@ void foo(void)
 	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
 	       offsetofend(struct tss_struct, SYSENTER_stack));
 
-	/* Offset from cpu_tss to SYSENTER_stack */
-	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	/* Size of SYSENTER_stack */
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
-
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
 	OFFSET(stack_canary_offset, stack_canary, canary);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cdf79ab628c2..22f542170198 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1361,7 +1361,9 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+		    (unsigned long)this_cpu_ptr(&cpu_tss) +
+		    offsetofend(struct tss_struct, SYSENTER_stack));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 97fb3e5737f5..35d674157fda 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -71,9 +71,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	  */
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
-#ifdef CONFIG_X86_32
 	.SYSENTER_stack_canary	= STACK_END_MAGIC,
-#endif
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d366adfc61da..d3e3bbd5d3a0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -794,14 +794,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 
 exit:
-#if defined(CONFIG_X86_32)
 	/*
 	 * This is the most likely code path that involves non-trivial use
 	 * of the SYSENTER stack.  Check that we haven't overrun it.
 	 */
 	WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
 	     "Overran or corrupted SYSENTER stack\n");
-#endif
+
 	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);

From 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:13 +0100
Subject: [PATCH 170/808] x86/dumpstack: Add get_stack_info() support for the
 SYSENTER stack

get_stack_info() doesn't currently know about the SYSENTER stack, so
unwinding will fail if we entered the kernel on the SYSENTER stack
and haven't fully switched off.  Teach get_stack_info() about the
SYSENTER stack.

With future patches applied that run part of the entry code on the
SYSENTER stack and introduce an intentional BUG(), I would get:

  PANIC: double fault, error_code: 0x0
  ...
  RIP: 0010:do_error_trap+0x33/0x1c0
  ...
  Call Trace:
  Code: ...

With this patch, I get:

  PANIC: double fault, error_code: 0x0
  ...
  Call Trace:
   <SYSENTER>
   ? async_page_fault+0x36/0x60
   ? invalid_op+0x22/0x40
   ? async_page_fault+0x36/0x60
   ? sync_regs+0x3c/0x40
   ? sync_regs+0x2e/0x40
   ? error_entry+0x6c/0xd0
   ? async_page_fault+0x36/0x60
   </SYSENTER>
  Code: ...

which is a lot more informative.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/stacktrace.h |  3 +++
 arch/x86/kernel/dumpstack.c       | 19 +++++++++++++++++++
 arch/x86/kernel/dumpstack_32.c    |  6 ++++++
 arch/x86/kernel/dumpstack_64.c    |  6 ++++++
 4 files changed, 34 insertions(+)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 8da111b3c342..f8062bfd43a0 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -16,6 +16,7 @@ enum stack_type {
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
+	STACK_TYPE_SYSENTER,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -28,6 +29,8 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info);
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask);
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 0bc95be5c638..a33a1373a252 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 	return true;
 }
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+{
+	struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
+
+	/* Treat the canary as part of the stack for unwinding purposes. */
+	void *begin = &tss->SYSENTER_stack_canary;
+	void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
+
+	if ((void *)stack < begin || (void *)stack >= end)
+		return false;
+
+	info->type	= STACK_TYPE_SYSENTER;
+	info->begin	= begin;
+	info->end	= end;
+	info->next_sp	= NULL;
+
+	return true;
+}
+
 static void printk_stack_address(unsigned long address, int reliable,
 				 char *log_lvl)
 {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index daefae83a3aa..5ff13a6b3680 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_SOFTIRQ)
 		return "SOFTIRQ";
 
+	if (type == STACK_TYPE_SYSENTER)
+		return "SYSENTER";
+
 	return NULL;
 }
 
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (task != current)
 		goto unknown;
 
+	if (in_sysenter_stack(stack, info))
+		goto recursion_check;
+
 	if (in_hardirq_stack(stack, info))
 		goto recursion_check;
 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 88ce2ffdb110..abc828f8c297 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
+	if (type == STACK_TYPE_SYSENTER)
+		return "SYSENTER";
+
 	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
 		return exception_stack_names[type - STACK_TYPE_EXCEPTION];
 
@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_irq_stack(stack, info))
 		goto recursion_check;
 
+	if (in_sysenter_stack(stack, info))
+		goto recursion_check;
+
 	goto unknown;
 
 recursion_check:

From aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:14 +0100
Subject: [PATCH 171/808] x86/entry/gdt: Put per-CPU GDT remaps in ascending
 order

We currently have CPU 0's GDT at the top of the GDT range and
higher-numbered CPUs at lower addresses.  This happens because the
fixmap is upside down (index 0 is the top of the fixmap).

Flip it so that GDTs are in ascending order by virtual address.
This will simplify a future patch that will generalize the GDT
remap to contain multiple pages.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 0a3e808b9123..01fd944fd721 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -63,7 +63,7 @@ static inline struct desc_struct *get_current_gdt_rw(void)
 /* Get the fixmap index for a specific processor */
 static inline unsigned int get_cpu_gdt_ro_index(int cpu)
 {
-	return FIX_GDT_REMAP_BEGIN + cpu;
+	return FIX_GDT_REMAP_END - cpu;
 }
 
 /* Provide the fixmap address of the remapped GDT */

From ef8813ab280507972bb57e4b1b502811ad4411e9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:15 +0100
Subject: [PATCH 172/808] x86/mm/fixmap: Generalize the GDT fixmap mechanism,
 introduce struct cpu_entry_area

Currently, the GDT is an ad-hoc array of pages, one per CPU, in the
fixmap.  Generalize it to be an array of a new 'struct cpu_entry_area'
so that we can cleanly add new things to it.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h   |  9 +--------
 arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/common.c  | 14 ++++++-------
 arch/x86/xen/mmu_pv.c         |  2 +-
 4 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 01fd944fd721..f6f428432a68 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
 	return this_cpu_ptr(&gdt_page)->gdt;
 }
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-	return FIX_GDT_REMAP_END - cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-	unsigned int idx = get_cpu_gdt_ro_index(cpu);
-	return (struct desc_struct *)__fix_to_virt(idx);
+	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 
 /* Provide the current read-only GDT */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0c505fe9a95..b61f0242f9d0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 #endif
 
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code.  Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -101,8 +114,8 @@ enum fixed_addresses {
 	FIX_LNW_VRTC,
 #endif
 	/* Fixmap entries to remap the GDTs, one per processor. */
-	FIX_GDT_REMAP_BEGIN,
-	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+	FIX_CPU_ENTRY_AREA_TOP,
+	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
 	/* Used for GHES mapping from assorted contexts */
@@ -191,5 +204,25 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 void __early_set_fixmap(enum fixed_addresses idx,
 			phys_addr_t phys, pgprot_t flags);
 
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\
+	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\
+	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\
+	})
+
+#define get_cpu_entry_area_index(cpu, field)				\
+	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22f542170198..2cb394dc4153 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -466,12 +466,12 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+/* Setup the fixmap mappings only once per-processor */
+static inline void setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
 	/* On 64-bit systems, we use a read-only fixmap GDT. */
-	pgprot_t prot = PAGE_KERNEL_RO;
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 #else
 	/*
 	 * On native 32-bit systems, the GDT cannot be read-only because
@@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu)
 	 * On Xen PV, the GDT must be read-only because the hypervisor requires
 	 * it.
 	 */
-	pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
 		PAGE_KERNEL_RO : PAGE_KERNEL;
 #endif
 
-	__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -1589,7 +1589,7 @@ void cpu_init(void)
 	if (is_uv_system())
 		uv_cpu_init();
 
-	setup_fixmap_gdt(cpu);
+	setup_cpu_entry_area(cpu);
 	load_fixmap_gdt(cpu);
 }
 
@@ -1651,7 +1651,7 @@ void cpu_init(void)
 
 	fpu__init_cpu();
 
-	setup_fixmap_gdt(cpu);
+	setup_cpu_entry_area(cpu);
 	load_fixmap_gdt(cpu);
 }
 #endif
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2ccdaba31a07..c2454237fa67 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
 	case FIX_TEXT_POKE0:
 	case FIX_TEXT_POKE1:
-	case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
+	case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
 		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;

From 21506525fb8ddb0342f2a2370812d47f6a1f3833 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:16 +0100
Subject: [PATCH 173/808] x86/kasan/64: Teach KASAN about the cpu_entry_area

The cpu_entry_area will contain stacks.  Make sure that KASAN has
appropriate shadow mappings for them.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: kasan-dev@googlegroups.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 99dfed6dfef8..9ec70d780f1f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -277,6 +277,7 @@ void __init kasan_early_init(void)
 void __init kasan_init(void)
 {
 	int i;
+	void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
 
 #ifdef CONFIG_KASAN_INLINE
 	register_die_notifier(&kasan_die_notifier);
@@ -329,8 +330,23 @@ void __init kasan_init(void)
 			      (unsigned long)kasan_mem_to_shadow(_end),
 			      early_pfn_to_nid(__pa(_stext)));
 
+	shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+						PAGE_SIZE);
+
+	shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+					PAGE_SIZE);
+
 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-			(void *)KASAN_SHADOW_END);
+				   shadow_cpu_entry_begin);
+
+	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+			      (unsigned long)shadow_cpu_entry_end, 0);
+
+	kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
 
 	load_cr3(init_top_pgt);
 	__flush_tlb_all();

From 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:17 +0100
Subject: [PATCH 174/808] x86/entry: Fix assumptions that the HW TSS is at the
 beginning of cpu_tss

A future patch will move SYSENTER_stack to the beginning of cpu_tss
to help detect overflow.  Before this can happen, fix several code
paths that hardcode assumptions about the old layout.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h      |  2 +-
 arch/x86/include/asm/processor.h |  9 +++++++--
 arch/x86/kernel/cpu/common.c     |  8 ++++----
 arch/x86/kernel/doublefault.c    | 32 +++++++++++++++-----------------
 arch/x86/kvm/vmx.c               |  2 +-
 arch/x86/power/cpu.c             | 13 +++++++------
 6 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index f6f428432a68..2ace1f90d138 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -178,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 }
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
 	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 789dad5da20f..555c9478f3df 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -162,7 +162,7 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
-extern struct tss_struct	doublefault_tss;
+extern struct x86_hw_tss	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS];
 extern __u32			cpu_caps_set[NCAPINTS];
 
@@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir)
 	write_cr3(__sme_pa(pgdir));
 }
 
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
 #ifdef CONFIG_X86_32
 /* This is the TSS defined by the hardware. */
 struct x86_hw_tss {
@@ -322,7 +327,7 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS			65536
 #define IO_BITMAP_BYTES			(IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET		offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET	0x8000
 
 struct tss_struct {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2cb394dc4153..3f285b973f50 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1557,7 +1557,7 @@ void cpu_init(void)
 		}
 	}
 
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 	/*
 	 * <= is required because the CPU will access up to
@@ -1576,7 +1576,7 @@ void cpu_init(void)
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
 	 */
-	set_tss_desc(cpu, t);
+	set_tss_desc(cpu, &t->x86_tss);
 	load_TR_desc();
 
 	load_mm_ldt(&init_mm);
@@ -1634,12 +1634,12 @@ void cpu_init(void)
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
 	 */
-	set_tss_desc(cpu, t);
+	set_tss_desc(cpu, &t->x86_tss);
 	load_TR_desc();
 
 	load_mm_ldt(&init_mm);
 
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 #ifdef CONFIG_DOUBLEFAULT
 	/* Set up doublefault TSS pointer in the GDT */
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 0e662c55ae90..0b8cedb20d6d 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
 		cpu_relax();
 }
 
-struct tss_struct doublefault_tss __cacheline_aligned = {
-	.x86_tss = {
-		.sp0		= STACK_START,
-		.ss0		= __KERNEL_DS,
-		.ldt		= 0,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+	.sp0		= STACK_START,
+	.ss0		= __KERNEL_DS,
+	.ldt		= 0,
+	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
 
-		.ip		= (unsigned long) doublefault_fn,
-		/* 0x2 bit is always set */
-		.flags		= X86_EFLAGS_SF | 0x2,
-		.sp		= STACK_START,
-		.es		= __USER_DS,
-		.cs		= __KERNEL_CS,
-		.ss		= __KERNEL_DS,
-		.ds		= __USER_DS,
-		.fs		= __KERNEL_PERCPU,
+	.ip		= (unsigned long) doublefault_fn,
+	/* 0x2 bit is always set */
+	.flags		= X86_EFLAGS_SF | 0x2,
+	.sp		= STACK_START,
+	.es		= __USER_DS,
+	.cs		= __KERNEL_CS,
+	.ss		= __KERNEL_DS,
+	.ds		= __USER_DS,
+	.fs		= __KERNEL_PERCPU,
 
-		.__cr3		= __pa_nodebug(swapper_pg_dir),
-	}
+	.__cr3		= __pa_nodebug(swapper_pg_dir),
 };
 
 /* dummy for do_double_fault() call */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..2abe0073b573 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.  See 22.2.4.
 		 */
 		vmcs_writel(HOST_TR_BASE,
-			    (unsigned long)this_cpu_ptr(&cpu_tss));
+			    (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss));
 		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 
 		/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 84fcfde53f8f..50593e138281 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -165,12 +165,13 @@ static void fix_processor_context(void)
 	struct desc_struct *desc = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
 #endif
-	set_tss_desc(cpu, t);	/*
-				 * This just modifies memory; should not be
-				 * necessary. But... This is necessary, because
-				 * 386 hardware has concept of busy TSS or some
-				 * similar stupidity.
-				 */
+
+	/*
+	 * This just modifies memory; should not be necessary. But... This is
+	 * necessary, because 386 hardware has concept of busy TSS or some
+	 * similar stupidity.
+	 */
+	set_tss_desc(cpu, &t->x86_tss);
 
 #ifdef CONFIG_X86_64
 	memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));

From 6e60e583426c2f8751c22c2dfe5c207083b4483a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:18 +0100
Subject: [PATCH 175/808] x86/dumpstack: Handle stack overflow on all stacks

We currently special-case stack overflow on the task stack.  We're
going to start putting special stacks in the fixmap with a custom
layout, so they'll have guard pages, too.  Teach the unwinder to be
able to unwind an overflow of any of the stacks.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index a33a1373a252..64f8ed2a4827 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - task stack
 	 * - interrupt stack
 	 * - HW exception stacks (double fault, nmi, debug, mce)
+	 * - SYSENTER stack
 	 *
-	 * x86-32 can have up to three stacks:
+	 * x86-32 can have up to four stacks:
 	 * - task stack
 	 * - softirq stack
 	 * - hardirq stack
+	 * - SYSENTER stack
 	 */
 	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;
 
-		/*
-		 * If we overflowed the task stack into a guard page, jump back
-		 * to the bottom of the usable stack.
-		 */
-		if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
-			stack = task_stack_page(task);
-
-		if (get_stack_info(stack, task, &stack_info, &visit_mask))
-			break;
+		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+			/*
+			 * We weren't on a valid stack.  It's possible that
+			 * we overflowed a valid stack into a guard page.
+			 * See if the next page up is valid so that we can
+			 * generate some kind of backtrace if this happens.
+			 */
+			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+			if (get_stack_info(stack, task, &stack_info, &visit_mask))
+				break;
+		}
 
 		stack_name = stack_type_name(stack_info.type);
 		if (stack_name)

From 1a935bc3d4ea61556461a9e92a68ca3556232efd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:19 +0100
Subject: [PATCH 176/808] x86/entry: Move SYSENTER_stack to the beginning of
 struct tss_struct

SYSENTER_stack should have reliable overflow detection, which
means that it needs to be at the bottom of a page, not the top.
Move it to the beginning of struct tss_struct and page-align it.

Also add an assertion to make sure that the fixed hardware TSS
doesn't cross a page boundary.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 21 ++++++++++++---------
 arch/x86/kernel/cpu/common.c     | 21 +++++++++++++++++++++
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 555c9478f3df..759051251664 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -332,7 +332,16 @@ struct x86_hw_tss {
 
 struct tss_struct {
 	/*
-	 * The hardware state:
+	 * Space for the temporary SYSENTER stack, used for SYSENTER
+	 * and the entry trampoline as well.
+	 */
+	unsigned long		SYSENTER_stack_canary;
+	unsigned long		SYSENTER_stack[64];
+
+	/*
+	 * The fixed hardware portion.  This must not cross a page boundary
+	 * at risk of violating the SDM's advice and potentially triggering
+	 * errata.
 	 */
 	struct x86_hw_tss	x86_tss;
 
@@ -343,15 +352,9 @@ struct tss_struct {
 	 * be within the limit.
 	 */
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
 
-	/*
-	 * Space for the temporary SYSENTER stack.
-	 */
-	unsigned long		SYSENTER_stack_canary;
-	unsigned long		SYSENTER_stack[64];
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3f285b973f50..60b2dfd2a58b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu)
 #endif
 
 	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+
 }
 
 /* Load the original GDT from the per-cpu structure */

From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:20 +0100
Subject: [PATCH 177/808] x86/entry: Remap the TSS into the CPU entry area

This has a secondary purpose: it puts the entry stack into a region
with a well-controlled layout.  A subsequent patch will take
advantage of this to streamline the SYSCALL entry code to be able to
find it more easily.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S     |  6 +++--
 arch/x86/include/asm/fixmap.h |  7 ++++++
 arch/x86/kernel/asm-offsets.c |  3 +++
 arch/x86/kernel/cpu/common.c  | 41 ++++++++++++++++++++++++++++++-----
 arch/x86/kernel/dumpstack.c   |  3 ++-
 arch/x86/kvm/vmx.c            |  2 +-
 arch/x86/power/cpu.c          | 11 +++++-----
 7 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4838037f97f6..0ab316c46806 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -941,7 +941,8 @@ ENTRY(debug)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -984,7 +985,8 @@ ENTRY(nmi)
 	movl	%esp, %eax			# pt_regs pointer
 
 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b61f0242f9d0..84558b611ad3 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP;
  */
 struct cpu_entry_area {
 	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below cpu_tss and thus serves (on x86_64) as a
+	 * a read-only guard page for the SYSENTER stack at the bottom
+	 * of the TSS region.
+	 */
+	struct tss_struct tss;
 };
 
 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index b275863128eb..55858b277cf6 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,4 +98,7 @@ void common(void) {
 	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
 	/* Size of SYSENTER_stack */
 	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+
+	/* Layout info for cpu_entry_area */
+	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 60b2dfd2a58b..e5837bd6c672 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -466,6 +466,22 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
+static void set_percpu_fixmap_pages(int fixmap_index, void *ptr,
+				    int pages, pgprot_t prot)
+{
+	int i;
+
+	for (i = 0; i < pages; i++) {
+		__set_fixmap(fixmap_index - i,
+			     per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot);
+	}
+}
+
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
 /* Setup the fixmap mappings only once per-processor */
 static inline void setup_cpu_entry_area(int cpu)
 {
@@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu)
 	 */
 	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
 		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+				&per_cpu(cpu_tss, cpu),
+				sizeof(struct tss_struct) / PAGE_SIZE,
+				PAGE_KERNEL);
 
+#ifdef CONFIG_X86_32
+	this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
+#endif
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -1257,7 +1281,8 @@ void enable_sep_cpu(void)
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
 
 	wrmsr(MSR_IA32_SYSENTER_ESP,
-	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+	      (unsigned long)&get_cpu_entry_area(cpu)->tss +
+	      offsetofend(struct tss_struct, SYSENTER_stack),
 	      0);
 
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
@@ -1370,6 +1395,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+	int cpu = smp_processor_id();
+
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
 	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
@@ -1383,7 +1410,7 @@ void syscall_init(void)
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
-		    (unsigned long)this_cpu_ptr(&cpu_tss) +
+		    (unsigned long)&get_cpu_entry_area(cpu)->tss +
 		    offsetofend(struct tss_struct, SYSENTER_stack));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
@@ -1593,11 +1620,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);
 
+	setup_cpu_entry_area(cpu);
+
 	/*
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
 	 */
-	set_tss_desc(cpu, &t->x86_tss);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
 
 	load_mm_ldt(&init_mm);
@@ -1610,7 +1639,6 @@ void cpu_init(void)
 	if (is_uv_system())
 		uv_cpu_init();
 
-	setup_cpu_entry_area(cpu);
 	load_fixmap_gdt(cpu);
 }
 
@@ -1651,11 +1679,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);
 
+	setup_cpu_entry_area(cpu);
+
 	/*
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
 	 */
-	set_tss_desc(cpu, &t->x86_tss);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
 
 	load_mm_ldt(&init_mm);
@@ -1672,7 +1702,6 @@ void cpu_init(void)
 
 	fpu__init_cpu();
 
-	setup_cpu_entry_area(cpu);
 	load_fixmap_gdt(cpu);
 }
 #endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 64f8ed2a4827..60267850125e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 
 bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
 {
-	struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
+	int cpu = smp_processor_id();
+	struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
 
 	/* Treat the canary as part of the stack for unwinding purposes. */
 	void *begin = &tss->SYSENTER_stack_canary;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2abe0073b573..62ee4362e1c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.  See 22.2.4.
 		 */
 		vmcs_writel(HOST_TR_BASE,
-			    (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss));
+			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
 		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 
 		/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 50593e138281..04d5157fe7f8 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -160,18 +160,19 @@ static void do_fpu_end(void)
 static void fix_processor_context(void)
 {
 	int cpu = smp_processor_id();
-	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
 	struct desc_struct *desc = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
 #endif
 
 	/*
-	 * This just modifies memory; should not be necessary. But... This is
-	 * necessary, because 386 hardware has concept of busy TSS or some
-	 * similar stupidity.
+	 * We need to reload TR, which requires that we change the
+	 * GDT entry to indicate "available" first.
+	 *
+	 * XXX: This could probably all be replaced by a call to
+	 * force_reload_TR().
 	 */
-	set_tss_desc(cpu, &t->x86_tss);
+	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 
 #ifdef CONFIG_X86_64
 	memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));

From 9aaefe7b59ae00605256a7d6bd1c1456432495fc Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:21 +0100
Subject: [PATCH 178/808] x86/entry/64: Separate cpu_current_top_of_stack from
 TSS.sp0

On 64-bit kernels, we used to assume that TSS.sp0 was the current
top of stack.  With the addition of an entry trampoline, this will
no longer be the case.  Store the current top of stack in TSS.sp1,
which is otherwise unused but shares the same cacheline.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h   | 18 +++++++++++++-----
 arch/x86/include/asm/thread_info.h |  2 +-
 arch/x86/kernel/asm-offsets_64.c   |  1 +
 arch/x86/kernel/process.c          | 10 ++++++++++
 arch/x86/kernel/process_64.c       |  1 +
 5 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 759051251664..b0cf0612a454 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -309,7 +309,13 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
+
+	/*
+	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+	 * Linux does not use ring 1, so sp1 is not otherwise needed.
+	 */
 	u64			sp1;
+
 	u64			sp2;
 	u64			reserved2;
 	u64			ist[7];
@@ -368,6 +374,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
 #endif
 
 /*
@@ -539,12 +547,12 @@ static inline void native_swapgs(void)
 
 static inline unsigned long current_top_of_stack(void)
 {
-#ifdef CONFIG_X86_64
-	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
-	/* sp0 on x86_32 is special in and around vm86 mode. */
+	/*
+	 *  We can't read directly from tss.sp0: sp0 on x86_32 is special in
+	 *  and around vm86 mode and sp0 on x86_64 is special because of the
+	 *  entry trampoline.
+	 */
 	return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
 }
 
 static inline bool on_thread_stack(void)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 70f425947dc5..44a04999791e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
 #endif
 
 #endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e3a5175a444b..bf51e51d808d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -66,6 +66,7 @@ int main(void)
 
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	BLANK();
 
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 35d674157fda..86e83762e3b3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 		 * Poison it.
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+		/*
+		 * .sp1 is cpu_current_top_of_stack.  The init task never
+		 * runs user code, but cpu_current_top_of_stack should still
+		 * be well defined before the first context switch.
+		 */
+		.sp1 = TOP_OF_INIT_STACK,
+#endif
+
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 01b119bebb68..157f81816915 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Switch the PDA and FPU contexts.
 	 */
 	this_cpu_write(current_task, next_p);
+	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
 	/* Reload sp0. */
 	update_sp0(next_p);

From 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:22 +0100
Subject: [PATCH 179/808] x86/espfix/64: Stop assuming that pt_regs is on the
 entry stack

When we start using an entry trampoline, a #GP from userspace will
be delivered on the entry stack, not on the task stack.  Fix the
espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than
assuming that pt_regs + 1 == SP0.  This won't change anything
without an entry stack, but it will make the code continue to work
when an entry stack is added.

While we're at it, improve the comments to explain what's actually
going on.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d3e3bbd5d3a0..f0029d17b14b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 	/*
 	 * If IRET takes a non-IST fault on the espfix64 stack, then we
-	 * end up promoting it to a doublefault.  In that case, modify
-	 * the stack to make it look like we just entered the #GP
-	 * handler from user space, similar to bad_iret.
+	 * end up promoting it to a doublefault.  In that case, take
+	 * advantage of the fact that we're not using the normal (TSS.sp0)
+	 * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
+	 * and then modify our own IRET frame so that, when we return,
+	 * we land directly at the #GP(0) vector with the stack already
+	 * set up according to its expectations.
+	 *
+	 * The net result is that our #GP handler will think that we
+	 * entered from usermode with the bad user context.
 	 *
 	 * No need for ist_enter here because we don't use RCU.
 	 */
@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{
-		struct pt_regs *normal_regs = task_pt_regs(current);
+		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
 
-		/* Fake a #GP(0) from userspace. */
-		memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
-		normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
+		/*
+		 * regs->sp points to the failing IRET frame on the
+		 * ESPFIX64 stack.  Copy it to the entry stack.  This fills
+		 * in gpregs->ss through gpregs->ip.
+		 *
+		 */
+		memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+		gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */
+
+		/*
+		 * Adjust our frame so that we return straight to the #GP
+		 * vector with the expected RSP value.  This is safe because
+		 * we won't enable interupts or schedule before we invoke
+		 * general_protection, so nothing will clobber the stack
+		 * frame we just set up.
+		 */
 		regs->ip = (unsigned long)general_protection;
-		regs->sp = (unsigned long)&normal_regs->orig_ax;
+		regs->sp = (unsigned long)&gpregs->orig_ax;
 
 		return;
 	}
@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	 *
 	 *   Processors update CR2 whenever a page fault is detected. If a
 	 *   second page fault occurs while an earlier page fault is being
-	 *   deliv- ered, the faulting linear address of the second fault will
+	 *   delivered, the faulting linear address of the second fault will
 	 *   overwrite the contents of CR2 (replacing the previous
 	 *   address). These updates to CR2 occur even if the page fault
 	 *   results in a double fault or occurs during the delivery of a

From 7f2590a110b837af5679d08fc25c6227c5a8c497 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:23 +0100
Subject: [PATCH 180/808] x86/entry/64: Use a per-CPU trampoline stack for IDT
 entries

Historically, IDT entries from usermode have always gone directly
to the running task's kernel stack.  Rearrange it so that we enter on
a per-CPU trampoline stack and then manually switch to the task's stack.
This touches a couple of extra cachelines, but it gives us a chance
to run some code before we touch the kernel stack.

The asm isn't exactly beautiful, but I think that fully refactoring
it can wait.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S        | 67 ++++++++++++++++++++++++--------
 arch/x86/entry/entry_64_compat.S |  5 ++-
 arch/x86/include/asm/switch_to.h |  4 +-
 arch/x86/include/asm/traps.h     |  1 -
 arch/x86/kernel/cpu/common.c     |  6 ++-
 arch/x86/kernel/traps.c          | 21 +++++-----
 6 files changed, 72 insertions(+), 32 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 32306788821c..35b8e949ac2f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -560,6 +560,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	cld
+
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jz	1f
+	SWAPGS
+	call	switch_to_thread_stack
+1:
+
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
@@ -569,12 +576,8 @@ END(irq_entries_start)
 	jz	1f
 
 	/*
-	 * IRQ from user mode.  Switch to kernel gsbase and inform context
-	 * tracking that we're in kernel mode.
-	 */
-	SWAPGS
-
-	/*
+	 * IRQ from user mode.
+	 *
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
@@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
 
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+	UNWIND_HINT_FUNC
+
+	pushq	%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+	pushq	7*8(%rdi)		/* regs->ss */
+	pushq	6*8(%rdi)		/* regs->rsp */
+	pushq	5*8(%rdi)		/* regs->eflags */
+	pushq	4*8(%rdi)		/* regs->cs */
+	pushq	3*8(%rdi)		/* regs->ip */
+	pushq	2*8(%rdi)		/* regs->orig_ax */
+	pushq	8(%rdi)			/* return address */
+	UNWIND_HINT_FUNC
+
+	movq	(%rdi), %rdi
+	ret
+END(switch_to_thread_stack)
+
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -845,11 +874,12 @@ ENTRY(\sym)
 
 	ALLOC_PT_GPREGS_ON_STACK
 
-	.if \paranoid
-	.if \paranoid == 1
+	.if \paranoid < 2
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
-	jnz	1f
+	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
+
+	.if \paranoid
 	call	paranoid_entry
 	.else
 	call	error_entry
@@ -891,20 +921,15 @@ ENTRY(\sym)
 	jmp	error_exit
 	.endif
 
-	.if \paranoid == 1
+	.if \paranoid < 2
 	/*
-	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * Entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 */
-1:
+.Lfrom_usermode_switch_stack_\@:
 	call	error_entry
 
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-	call	sync_regs
-	movq	%rax, %rsp			/* switch stack */
-
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
@@ -1165,6 +1190,14 @@ ENTRY(error_entry)
 	SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+	/* Put us onto the real thread stack. */
+	popq	%r12				/* save return addr in %12 */
+	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
+	ENCODE_FRAME_POINTER
+	pushq	%r12
+
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index dcc6987f9bae..95ad40eb7eff 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
 	 */
 	movl	%eax, %eax
 
-	/* Construct struct pt_regs on stack (iret frame is already on stack) */
 	pushq	%rax			/* pt_regs->orig_ax */
+
+	/* switch to thread stack expects orig_ax to be pushed */
+	call	switch_to_thread_stack
+
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..cbc71e73bd32 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
+	/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
 	load_sp0(task->thread.sp0);
 #else
-	load_sp0(task_top_of_stack(task));
+	if (static_cpu_has(X86_FEATURE_XENPV))
+		load_sp0(task_top_of_stack(task));
 #endif
 }
 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 1fadd310ff68..31051f35cbb7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e5837bd6c672..57968880e39b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1623,11 +1623,13 @@ void cpu_init(void)
 	setup_cpu_entry_area(cpu);
 
 	/*
-	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
-	 * task never enters user mode.
+	 * Initialize the TSS.  sp0 points to the entry trampoline stack
+	 * regardless of what task is running.
 	 */
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
+	load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss +
+		 offsetofend(struct tss_struct, SYSENTER_stack));
 
 	load_mm_ldt(&init_mm);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f0029d17b14b..ee9ca0ad4388 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
  */
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-	struct pt_regs *regs = task_pt_regs(current);
-	*regs = *eregs;
+	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+	if (regs != eregs)
+		*regs = *eregs;
 	return regs;
 }
 NOKPROBE_SYMBOL(sync_regs);
@@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 	/*
 	 * This is called from entry_64.S early in handling a fault
 	 * caused by a bad iret to user mode.  To handle the fault
-	 * correctly, we want move our stack frame to task_pt_regs
-	 * and we want to pretend that the exception came from the
-	 * iret target.
+	 * correctly, we want to move our stack frame to where it would
+	 * be had we entered directly on the entry stack (rather than
+	 * just below the IRET frame) and we want to pretend that the
+	 * exception came from the IRET target.
 	 */
 	struct bad_iret_stack *new_stack =
-		container_of(task_pt_regs(current),
-			     struct bad_iret_stack, regs);
+		(struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
 
 	/* Copy the IRET target to the new stack. */
 	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);

From 3e3b9293d392c577b62e24e4bc9982320438e749 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:24 +0100
Subject: [PATCH 181/808] x86/entry/64: Return to userspace from the trampoline
 stack

By itself, this is useless.  It gives us the ability to run some final code
before exit that cannnot run on the kernel stack.  This could include a CR3
switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for
example.  (Or even weird things like *changing* which kernel stack gets
used as an ASLR-strengthening mechanism.)

The SYSRET32 path is not covered yet.  It could be in the future or
we could just ignore it and force the slow path if needed.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 55 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 35b8e949ac2f..42a9379f7acb 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -326,8 +326,24 @@ syscall_return_via_sysret:
 	popq	%rsi	/* skip rcx */
 	popq	%rdx
 	popq	%rsi
+
+	/*
+	 * Now all regs are restored except RSP and RDI.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+
+	pushq	RSP-RDI(%rdi)	/* RSP */
+	pushq	(%rdi)		/* RDI */
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
 	popq	%rdi
-	movq	RSP-ORIG_RAX(%rsp), %rsp
+	popq	%rsp
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
@@ -630,10 +646,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 1:
 #endif
-	SWAPGS
 	POP_EXTRA_REGS
-	POP_C_REGS
-	addq	$8, %rsp	/* skip regs->orig_ax */
+	popq	%r11
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rcx
+	popq	%rdx
+	popq	%rsi
+
+	/*
+	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+
+	/* Copy the IRET frame to the trampoline stack. */
+	pushq	6*8(%rdi)	/* SS */
+	pushq	5*8(%rdi)	/* RSP */
+	pushq	4*8(%rdi)	/* EFLAGS */
+	pushq	3*8(%rdi)	/* CS */
+	pushq	2*8(%rdi)	/* RIP */
+
+	/* Push user RDI on the trampoline stack. */
+	pushq	(%rdi)
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
+	/* Restore RDI. */
+	popq	%rdi
+	SWAPGS
 	INTERRUPT_RETURN
 
 
From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:25 +0100
Subject: [PATCH 182/808] x86/entry/64: Create a per-CPU SYSCALL entry
 trampoline

Handling SYSCALL is tricky: the SYSCALL handler is entered with every
single register (except FLAGS), including RSP, live.  It somehow needs
to set RSP to point to a valid stack, which means it needs to save the
user RSP somewhere and find its own stack pointer.  The canonical way
to do this is with SWAPGS, which lets us access percpu data using the
%gs prefix.

With PAGE_TABLE_ISOLATION-like pagetable switching, this is
problematic.  Without a scratch register, switching CR3 is impossible, so
%gs-based percpu memory would need to be mapped in the user pagetables.
Doing that without information leaks is difficult or impossible.

Instead, use a different sneaky trick.  Map a copy of the first part
of the SYSCALL asm at a different address for each CPU.  Now RIP
varies depending on the CPU, so we can use RIP-relative memory access
to access percpu memory.  By putting the relevant information (one
scratch slot and the stack address) at a constant offset relative to
RIP, we can make SYSCALL work without relying on %gs.

A nice thing about this approach is that we can easily switch it on
and off if we want pagetable switching to be configurable.

The compat variant of SYSCALL doesn't have this problem in the first
place -- there are plenty of scratch registers, since we don't care
about preserving r8-r15.  This patch therefore doesn't touch SYSCALL32
at all.

This patch actually seems to be a small speedup.  With this patch,
SYSCALL touches an extra cache line and an extra virtual page, but
the pipeline no longer stalls waiting for SWAPGS.  It seems that, at
least in a tight loop, the latter outweights the former.

Thanks to David Laight for an optimization tip.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S     | 58 +++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/fixmap.h |  2 ++
 arch/x86/kernel/asm-offsets.c |  1 +
 arch/x86/kernel/cpu/common.c  | 15 ++++++++-
 arch/x86/kernel/vmlinux.lds.S |  9 ++++++
 5 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 42a9379f7acb..2582984ffb4b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -136,6 +136,64 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
+	.pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
+			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+	UNWIND_HINT_EMPTY
+	swapgs
+
+	/* Stash the user RSP. */
+	movq	%rsp, RSP_SCRATCH
+
+	/* Load the top of the task stack into RSP */
+	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+	/* Start building the simulated IRET frame. */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	RSP_SCRATCH			/* pt_regs->sp */
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+
+	/*
+	 * x86 lacks a near absolute jump, and we can't jump to the real
+	 * entry text with a relative jump.  We could push the target
+	 * address and then use retq, but this destroys the pipeline on
+	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+	 * spill RDI and restore it in a second-stage trampoline.
+	 */
+	pushq	%rdi
+	movq	$entry_SYSCALL_64_stage2, %rdi
+	jmp	*%rdi
+END(entry_SYSCALL_64_trampoline)
+
+	.popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+	UNWIND_HINT_EMPTY
+	popq	%rdi
+	jmp	entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 84558b611ad3..6a699474c2c7 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -61,6 +61,8 @@ struct cpu_entry_area {
 	 * of the TSS region.
 	 */
 	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
 };
 
 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 55858b277cf6..61b1af88ac07 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -101,4 +101,5 @@ void common(void) {
 
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 57968880e39b..430f950b0b7f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 static inline void setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
+	extern char _entry_trampoline[];
+
 	/* On 64-bit systems, we use a read-only fixmap GDT. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 #else
@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu)
 #ifdef CONFIG_X86_32
 	this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
 #endif
+
+#ifdef CONFIG_X86_64
+	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+	extern char _entry_trampoline[];
+	extern char entry_SYSCALL_64_trampoline[];
+
 	int cpu = smp_processor_id();
+	unsigned long SYSCALL64_entry_trampoline =
+		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a4009fb9be87..d2a8b5a24a44 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -107,6 +107,15 @@ SECTIONS
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+		. = ALIGN(PAGE_SIZE);
+		_entry_trampoline = .;
+		*(.entry_trampoline)
+		. = ALIGN(PAGE_SIZE);
+		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
 		/* End of text section */
 		_etext = .;
 	} :text = 0x9090

From 40e7f949e0d9a33968ebde5d67f7e3a47c97742a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:26 +0100
Subject: [PATCH 183/808] x86/entry/64: Move the IST stacks into struct
 cpu_entry_area

The IST stacks are needed when an IST exception occurs and are accessed
before any kernel code at all runs.  Move them into struct cpu_entry_area.

The IST stacks are unlike the rest of cpu_entry_area: they're used even for
entries from kernel mode.  This means that they should be set up before we
load the final IDT.  Move cpu_entry_area setup to trap_init() for the boot
CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fixmap.h | 12 ++++++
 arch/x86/kernel/cpu/common.c  | 74 ++++++++++++++++++++---------------
 arch/x86/kernel/traps.c       |  3 ++
 3 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 6a699474c2c7..451da7d9a502 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -63,10 +63,22 @@ struct cpu_entry_area {
 	struct tss_struct tss;
 
 	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
 };
 
 #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
 
+extern void setup_cpu_entry_areas(void);
+
 /*
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 430f950b0b7f..fb01a8e5e9b7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -466,24 +466,36 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-static void set_percpu_fixmap_pages(int fixmap_index, void *ptr,
-				    int pages, pgprot_t prot)
-{
-	int i;
-
-	for (i = 0; i < pages; i++) {
-		__set_fixmap(fixmap_index - i,
-			     per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot);
-	}
-}
-
 #ifdef CONFIG_X86_32
 /* The 32-bit entry code needs to find cpu_entry_area. */
 DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 #endif
 
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
+	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
 /* Setup the fixmap mappings only once per-processor */
-static inline void setup_cpu_entry_area(int cpu)
+static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
 	extern char _entry_trampoline[];
@@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu)
 				PAGE_KERNEL);
 
 #ifdef CONFIG_X86_32
-	this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu));
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
 #endif
 
 #ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+				&per_cpu(exception_stacks, cpu),
+				sizeof(exception_stacks) / PAGE_SIZE,
+				PAGE_KERNEL);
+
 	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
 }
 
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
+}
+
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
@@ -1385,20 +1413,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
-	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
-	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
@@ -1607,7 +1621,7 @@ void cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!oist->ist[0]) {
-		char *estacks = per_cpu(exception_stacks, cpu);
+		char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
 
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 			estacks += exception_stack_sizes[v];
@@ -1633,8 +1647,6 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);
 
-	setup_cpu_entry_area(cpu);
-
 	/*
 	 * Initialize the TSS.  sp0 points to the entry trampoline stack
 	 * regardless of what task is running.
@@ -1694,8 +1706,6 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);
 
-	setup_cpu_entry_area(cpu);
-
 	/*
 	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
 	 * task never enters user mode.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ee9ca0ad4388..3e29aad5c7cc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -947,6 +947,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 
 void __init trap_init(void)
 {
+	/* Init cpu_entry_area before IST entries are set up */
+	setup_cpu_entry_areas();
+
 	idt_setup_traps();
 
 	/*

From 7fbbd5cbebf118a9e09f5453f686656a167c3d1c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:27 +0100
Subject: [PATCH 184/808] x86/entry/64: Remove the SYSENTER stack canary

Now that the SYSENTER stack has a guard page, there's no need for a canary
to detect overflow after the fact.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 1 -
 arch/x86/kernel/dumpstack.c      | 3 +--
 arch/x86/kernel/process.c        | 1 -
 arch/x86/kernel/traps.c          | 7 -------
 4 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b0cf0612a454..d34ac13c5866 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -341,7 +341,6 @@ struct tss_struct {
 	 * Space for the temporary SYSENTER stack, used for SYSENTER
 	 * and the entry trampoline as well.
 	 */
-	unsigned long		SYSENTER_stack_canary;
 	unsigned long		SYSENTER_stack[64];
 
 	/*
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 60267850125e..ae1ce2e3f132 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
 
-	/* Treat the canary as part of the stack for unwinding purposes. */
-	void *begin = &tss->SYSENTER_stack_canary;
+	void *begin = &tss->SYSENTER_stack;
 	void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
 
 	if ((void *)stack < begin || (void *)stack >= end)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 86e83762e3b3..6a04287f222b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	  */
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
-	.SYSENTER_stack_canary	= STACK_END_MAGIC,
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3e29aad5c7cc..5ade4f89a6d1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -814,13 +814,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 
 exit:
-	/*
-	 * This is the most likely code path that involves non-trivial use
-	 * of the SYSENTER stack.  Check that we haven't overrun it.
-	 */
-	WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
-	     "Overran or corrupted SYSENTER stack\n");
-
 	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);

From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:28 +0100
Subject: [PATCH 185/808] x86/entry: Clean up the SYSENTER_stack code

The existing code was a mess, mainly because C arrays are nasty.  Turn
SYSENTER_stack into a struct, add a helper to find it, and do all the
obvious cleanups this enables.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S        |  4 ++--
 arch/x86/entry/entry_64.S        |  2 +-
 arch/x86/include/asm/fixmap.h    |  5 +++++
 arch/x86/include/asm/processor.h |  6 +++++-
 arch/x86/kernel/asm-offsets.c    |  6 ++----
 arch/x86/kernel/cpu/common.c     | 14 +++-----------
 arch/x86/kernel/dumpstack.c      |  7 +++----
 7 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 0ab316c46806..3629bcbf85a2 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -942,7 +942,7 @@ ENTRY(debug)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -986,7 +986,7 @@ ENTRY(nmi)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2582984ffb4b..575b184f377f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -154,7 +154,7 @@ END(native_usergs_sysret64)
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \
+#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
 			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 451da7d9a502..cc5d98bdca37 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
 	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
 }
 
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d34ac13c5866..f933869470b8 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -336,12 +336,16 @@ struct x86_hw_tss {
 #define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET	0x8000
 
+struct SYSENTER_stack {
+	unsigned long		words[64];
+};
+
 struct tss_struct {
 	/*
 	 * Space for the temporary SYSENTER stack, used for SYSENTER
 	 * and the entry trampoline as well.
 	 */
-	unsigned long		SYSENTER_stack[64];
+	struct SYSENTER_stack	SYSENTER_stack;
 
 	/*
 	 * The fixed hardware portion.  This must not cross a page boundary
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 61b1af88ac07..46c0995344aa 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -94,10 +94,8 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
-	/* Offset from cpu_tss to SYSENTER_stack */
-	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	/* Size of SYSENTER_stack */
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+	OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fb01a8e5e9b7..3de7480e4f32 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1314,12 +1314,7 @@ void enable_sep_cpu(void)
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
-	wrmsr(MSR_IA32_SYSENTER_ESP,
-	      (unsigned long)&get_cpu_entry_area(cpu)->tss +
-	      offsetofend(struct tss_struct, SYSENTER_stack),
-	      0);
-
+	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 	put_cpu();
@@ -1436,9 +1431,7 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
-		    (unsigned long)&get_cpu_entry_area(cpu)->tss +
-		    offsetofend(struct tss_struct, SYSENTER_stack));
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1653,8 +1646,7 @@ void cpu_init(void)
 	 */
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
-	load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss +
-		 offsetofend(struct tss_struct, SYSENTER_stack));
+	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae1ce2e3f132..bbd6d986e2d0 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 
 bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
 {
-	int cpu = smp_processor_id();
-	struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss;
+	struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
 
-	void *begin = &tss->SYSENTER_stack;
-	void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack);
+	void *begin = ss;
+	void *end = ss + 1;
 
 	if ((void *)stack < begin || (void *)stack >= end)
 		return false;

From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:29 +0100
Subject: [PATCH 186/808] x86/entry/64: Make cpu_entry_area.tss read-only

The TSS is a fairly juicy target for exploits, and, now that the TSS
is in the cpu_entry_area, it's no longer protected by kASLR.  Make it
read-only on x86_64.

On x86_32, it can't be RO because it's written by the CPU during task
switches, and we use a task gate for double faults.  I'd also be
nervous about errata if we tried to make it RO even on configurations
without double fault handling.

[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO.  So
  	it's probably safe to assume that it's a non issue, though Intel
  	might have been creative in that area. Still waiting for
  	confirmation. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bpetkov@suse.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S          |  4 ++--
 arch/x86/entry/entry_64.S          |  8 ++++----
 arch/x86/include/asm/fixmap.h      | 13 +++++++++----
 arch/x86/include/asm/processor.h   | 17 ++++++++---------
 arch/x86/include/asm/switch_to.h   |  4 ++--
 arch/x86/include/asm/thread_info.h |  2 +-
 arch/x86/kernel/asm-offsets.c      |  5 ++---
 arch/x86/kernel/asm-offsets_32.c   |  4 ++--
 arch/x86/kernel/cpu/common.c       | 29 +++++++++++++++++++----------
 arch/x86/kernel/ioport.c           |  2 +-
 arch/x86/kernel/process.c          |  6 +++---
 arch/x86/kernel/process_32.c       |  2 +-
 arch/x86/kernel/process_64.c       |  2 +-
 arch/x86/kernel/traps.c            |  4 ++--
 arch/x86/lib/delay.c               |  4 ++--
 arch/x86/xen/enlighten_pv.c        |  2 +-
 16 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3629bcbf85a2..bd8b57a5c874 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -942,7 +942,7 @@ ENTRY(debug)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
@@ -986,7 +986,7 @@ ENTRY(nmi)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
 	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
 	cmpl	$SIZEOF_SYSENTER_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 575b184f377f..2812ce043a7a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -154,7 +154,7 @@ END(native_usergs_sysret64)
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
+#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
 			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
@@ -390,7 +390,7 @@ syscall_return_via_sysret:
 	 * Save old stack pointer and switch to trampoline stack.
 	 */
 	movq	%rsp, %rdi
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
 	pushq	RSP-RDI(%rdi)	/* RSP */
 	pushq	(%rdi)		/* RDI */
@@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * Save old stack pointer and switch to trampoline stack.
 	 */
 	movq	%rsp, %rdi
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
 	/* Copy the IRET frame to the trampoline stack. */
 	pushq	6*8(%rdi)	/* SS */
@@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
 
 /*
  * Switch to the thread stack.  This is called with the IRET frame and
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index cc5d98bdca37..94fc4fa14127 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -56,9 +56,14 @@ struct cpu_entry_area {
 	char gdt[PAGE_SIZE];
 
 	/*
-	 * The GDT is just below cpu_tss and thus serves (on x86_64) as a
-	 * a read-only guard page for the SYSENTER stack at the bottom
-	 * of the TSS region.
+	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct SYSENTER_stack_page SYSENTER_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
 	 */
 	struct tss_struct tss;
 
@@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
 
 static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
 {
-	return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
+	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
 }
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f933869470b8..e8991d7f7034 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -340,13 +340,11 @@ struct SYSENTER_stack {
 	unsigned long		words[64];
 };
 
-struct tss_struct {
-	/*
-	 * Space for the temporary SYSENTER stack, used for SYSENTER
-	 * and the entry trampoline as well.
-	 */
-	struct SYSENTER_stack	SYSENTER_stack;
+struct SYSENTER_stack_page {
+	struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
 
+struct tss_struct {
 	/*
 	 * The fixed hardware portion.  This must not cross a page boundary
 	 * at risk of violating the SDM's advice and potentially triggering
@@ -363,7 +361,7 @@ struct tss_struct {
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
 } __aligned(PAGE_SIZE);
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
 #else
-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index cbc71e73bd32..9b6df68d8fd1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,10 +79,10 @@ do {									\
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
 	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
 		return;
 
-	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
 	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 44a04999791e..00223333821a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 46c0995344aa..cd360a5e0dca 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -94,10 +94,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
-	OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
-
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 52ce4ea16e53..7d20d9c0b3d6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,8 +47,8 @@ void foo(void)
 	BLANK();
 
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-	       offsetofend(struct tss_struct, SYSENTER_stack));
+	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3de7480e4f32..c2eada1056de 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+				   SYSENTER_stack_storage);
+
 static void __init
 set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
 {
@@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu)
 #ifdef CONFIG_X86_64
 	extern char _entry_trampoline[];
 
-	/* On 64-bit systems, we use a read-only fixmap GDT. */
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
 	/*
 	 * On native 32-bit systems, the GDT cannot be read-only because
 	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy.  If the GDT
-	 * is read-only, that will triple fault.
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
 	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor requires
-	 * it.
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
 	 */
 	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
 		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
 	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+				PAGE_KERNEL);
 
 	/*
 	 * The Intel SDM says (Volume 3, 7.2.1):
@@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu)
 		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
 	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
 	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss, cpu),
+				&per_cpu(cpu_tss_rw, cpu),
 				sizeof(struct tss_struct) / PAGE_SIZE,
-				PAGE_KERNEL);
+				tss_prot);
 
 #ifdef CONFIG_X86_32
 	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
@@ -1305,7 +1314,7 @@ void enable_sep_cpu(void)
 		return;
 
 	cpu = get_cpu();
-	tss = &per_cpu(cpu_tss, cpu);
+	tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/*
 	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1575,7 +1584,7 @@ void cpu_init(void)
 	if (cpu)
 		load_ucode_ap();
 
-	t = &per_cpu(cpu_tss, cpu);
+	t = &per_cpu(cpu_tss_rw, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1667,7 +1676,7 @@ void cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
 	wait_for_master_cpu(cpu);
 
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 3feb648781c4..2f723301eb58 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	 * because the ->io_bitmap_max value must match the bitmap
 	 * contents:
 	 */
-	tss = &per_cpu(cpu_tss, get_cpu());
+	tss = &per_cpu(cpu_tss_rw, get_cpu());
 
 	if (turn_on)
 		bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6a04287f222b..517415978409 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 	.x86_tss = {
 		/*
 		 * .sp0 is only used when entering ring 0 from a lower
@@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
 	struct fpu *fpu = &t->fpu;
 
 	if (bp) {
-		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
 
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 45bf0c5f93e1..5224c6099184 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *prev_fpu = &prev->fpu;
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 157f81816915..c75466232016 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *prev_fpu = &prev->fpu;
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 		     this_cpu_read(irq_count) != -1);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5ade4f89a6d1..74136fd16f49 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{
-		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
 		/*
 		 * regs->sp points to the failing IRET frame on the
@@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 	 * exception came from the IRET target.
 	 */
 	struct bad_iret_stack *new_stack =
-		(struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+		(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
 	/* Copy the IRET target to the new stack. */
 	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 553f8fd23cc4..4846eff7e4c8 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
 		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
 
 		/*
-		 * Use cpu_tss as a cacheline-aligned, seldomly
+		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
 		 * accessed per-cpu variable as the monitor target.
 		 */
-		__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
 
 		/*
 		 * AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index fbd054d6ac97..ae3a071e1d0f 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0)
 	mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
-	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)

From a035795499ca1c2bd1928808d1a156eda1420383 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:30 +0100
Subject: [PATCH 187/808] x86/paravirt: Dont patch flush_tlb_single

native_flush_tlb_single() will be changed with the upcoming
PAGE_TABLE_ISOLATION feature. This requires to have more code in
there than INVLPG.

Remove the paravirt patching for it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Cc: michael.schwarz@iaik.tugraz.at
Cc: moritz.lipp@iaik.tugraz.at
Cc: richard.fellner@student.tugraz.at
Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/paravirt_patch_64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index ac0be8283325..9edadabf04f6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
-		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

From 79cc74155218316b9a5d28577c7077b2adba8e58 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:31 +0100
Subject: [PATCH 188/808] x86/paravirt: Provide a way to check for hypervisors

There is no generic way to test whether a kernel is running on a specific
hypervisor. But that's required to prevent the upcoming user address space
separation feature in certain guest modes.

Make the hypervisor type enum unconditionally available and provide a
helper function which allows to test for a specific type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 1b0a5abcd8ae..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,16 +20,7 @@
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types  */
 enum x86_hypervisor_type {
 	X86_HYPER_NATIVE = 0,
 	X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
 	X86_HYPER_KVM,
 };
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
 	/* Hypervisor name */
 	const char	*name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return x86_hyper_type == type;
+}
 #else
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */

From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:32 +0100
Subject: [PATCH 189/808] x86/cpufeatures: Make CPU bugs sticky

There is currently no way to force CPU bug bits like CPU feature bits. That
makes it impossible to set a bug bit once at boot and have it stick for all
upcoming CPUs.

Extend the force set/clear arrays to handle bug bits as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h | 2 ++
 arch/x86/include/asm/processor.h  | 4 ++--
 arch/x86/kernel/cpu/common.c      | 6 +++---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bf6a76202a77..ea9a7dde62e5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e8991d7f7034..da943411d3d8 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -163,8 +163,8 @@ extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
 extern struct x86_hw_tss	doublefault_tss;
-extern __u32			cpu_caps_cleared[NCAPINTS];
-extern __u32			cpu_caps_set[NCAPINTS];
+extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c2eada1056de..034900623adf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
 	return NULL;		/* Not found */
 }
 
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
 void load_percpu_segment(int cpu)
 {
@@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
 {
 	int i;
 
-	for (i = 0; i < NCAPINTS; i++) {
+	for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
 		c->x86_capability[i] &= ~cpu_caps_cleared[i];
 		c->x86_capability[i] |= cpu_caps_set[i];
 	}

From 203c110b39a89b48156c7450504e454fedb7f7f6 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 12 Dec 2017 21:32:16 +0100
Subject: [PATCH 190/808] parisc: Fix indenting in puts()

Static analysis tools complain that we intended to have curly braces
around this indent block. In this case this assumption is wrong, so fix
the indenting.

Fixes: 2f3c7b8137ef ("parisc: Add core code for self-extracting kernel")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Helge Deller <deller@gmx.de>
Cc: <stable@vger.kernel.org> # v4.14+
---
 arch/parisc/boot/compressed/misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c
index 9345b44b86f0..f57118e1f6b4 100644
--- a/arch/parisc/boot/compressed/misc.c
+++ b/arch/parisc/boot/compressed/misc.c
@@ -123,8 +123,8 @@ int puts(const char *s)
 	while ((nuline = strchr(s, '\n')) != NULL) {
 		if (nuline != s)
 			pdc_iodc_print(s, nuline - s);
-			pdc_iodc_print("\r\n", 2);
-			s = nuline + 1;
+		pdc_iodc_print("\r\n", 2);
+		s = nuline + 1;
 	}
 	if (*s != '\0')
 		pdc_iodc_print(s, strlen(s));

From 0ed9d3de5f8f97e6efd5ca0e3377cab5f0451ead Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 12 Dec 2017 21:25:41 +0100
Subject: [PATCH 191/808] parisc: Align os_hpmc_size on word boundary

The os_hpmc_size variable sometimes wasn't aligned at word boundary and thus
triggered the unaligned fault handler at startup.
Fix it by aligning it properly.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: <stable@vger.kernel.org> # v4.14+
---
 arch/parisc/kernel/hpmc.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S
index e3a8e5e4d5de..8d072c44f300 100644
--- a/arch/parisc/kernel/hpmc.S
+++ b/arch/parisc/kernel/hpmc.S
@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)
 
 
 	__INITRODATA
+	.align 4
 	.export os_hpmc_size
 os_hpmc_size:
 	.word .os_hpmc_end-.os_hpmc

From bcf3f1752a622f1372d3252d0fea8855d89812e7 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 12 Dec 2017 21:52:26 +0100
Subject: [PATCH 192/808] parisc: Hide Diva-built-in serial aux and graphics
 card

Diva GSP card has built-in serial AUX port and ATI graphic card which simply
don't work and which both don't have external connectors.  User Guides even
mention that those devices shouldn't be used.
So, prevent that Linux drivers try to enable those devices.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: <stable@vger.kernel.org> # v3.0+
---
 drivers/parisc/lba_pci.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index a25fed52f7e9..41b740aed3a3 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -1692,3 +1692,36 @@ void lba_set_iregs(struct parisc_device *lba, u32 ibase, u32 imask)
 	iounmap(base_addr);
 }
 
+
+/*
+ * The design of the Diva management card in rp34x0 machines (rp3410, rp3440)
+ * seems rushed, so that many built-in components simply don't work.
+ * The following quirks disable the serial AUX port and the built-in ATI RV100
+ * Radeon 7000 graphics card which both don't have any external connectors and
+ * thus are useless, and even worse, e.g. the AUX port occupies ttyS0 and as
+ * such makes those machines the only PARISC machines on which we can't use
+ * ttyS0 as boot console.
+ */
+static void quirk_diva_ati_card(struct pci_dev *dev)
+{
+	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP ||
+	    dev->subsystem_device != 0x1292)
+		return;
+
+	dev_info(&dev->dev, "Hiding Diva built-in ATI card");
+	dev->device = 0;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RADEON_QY,
+	quirk_diva_ati_card);
+
+static void quirk_diva_aux_disable(struct pci_dev *dev)
+{
+	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP ||
+	    dev->subsystem_device != 0x1291)
+		return;
+
+	dev_info(&dev->dev, "Hiding Diva built-in AUX serial device");
+	dev->device = 0;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_DIVA_AUX,
+	quirk_diva_aux_disable);

From 6a16fc322085bb3163d7d6e44856adfda06a8001 Mon Sep 17 00:00:00 2001
From: Pravin Shedge <pravin.shedge4linux@gmail.com>
Date: Sun, 10 Dec 2017 23:54:33 +0530
Subject: [PATCH 193/808] parisc: remove duplicate includes

These duplicate includes have been found with scripts/checkincludes.pl
but they have been removed manually to avoid removing false positives.

Signed-off-by: Pravin Shedge <pravin.shedge4linux@gmail.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/unwind.c | 1 -
 arch/parisc/lib/delay.c     | 2 --
 2 files changed, 3 deletions(-)

diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c
index 5a657986ebbf..143f90e2f9f3 100644
--- a/arch/parisc/kernel/unwind.c
+++ b/arch/parisc/kernel/unwind.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/kallsyms.h>
 #include <linux/sort.h>
-#include <linux/sched.h>
 
 #include <linux/uaccess.h>
 #include <asm/assembly.h>
diff --git a/arch/parisc/lib/delay.c b/arch/parisc/lib/delay.c
index 7eab4bb8abe6..66e506520505 100644
--- a/arch/parisc/lib/delay.c
+++ b/arch/parisc/lib/delay.c
@@ -16,9 +16,7 @@
 #include <linux/preempt.h>
 #include <linux/init.h>
 
-#include <asm/processor.h>
 #include <asm/delay.h>
-
 #include <asm/special_insns.h>    /* for mfctl() */
 #include <asm/processor.h> /* for boot_cpu_data */
 

From 9352aeada4d8d8753fc0e414fbfe8fdfcb68a12c Mon Sep 17 00:00:00 2001
From: John David Anglin <dave.anglin@bell.net>
Date: Mon, 13 Nov 2017 19:35:33 -0500
Subject: [PATCH 194/808] Revert "parisc: Re-enable interrupts early"

This reverts commit 5c38602d83e584047906b41b162ababd4db4106d.

Interrupts can't be enabled early because the register saves are done on
the thread stack prior to switching to the IRQ stack.  This caused stack
overflows and the thread stack needed increasing to 32k.  Even then,
stack overflows still occasionally occurred.

Background:
Even with a 32 kB thread stack, I have seen instances where the thread
stack overflowed on the mx3210 buildd.  Detection of stack overflow only
occurs when we have an external interrupt.  When an external interrupt
occurs, we switch to the thread stack if we are not already on a kernel
stack.  Then, registers and specials are saved to the kernel stack.

The bug occurs in intr_return where interrupts are reenabled prior to
returning from the interrupt.  This was done incase we need to schedule
or deliver signals.  However, it introduces the possibility that
multiple external interrupts may occur on the thread stack and cause a
stack overflow.  These might not be detected and cause the kernel to
misbehave in random ways.

This patch changes the code back to only reenable interrupts when we are
going to schedule or deliver signals.  As a result, we generally return
from an interrupt before reenabling interrupts.  This minimizes the
growth of the thread stack.

Fixes: 5c38602d83e5 ("parisc: Re-enable interrupts early")
Signed-off-by: John David Anglin <dave.anglin@bell.net>
Cc: <stable@vger.kernel.org> # v4.10+
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/entry.S | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index a4fd296c958e..f3cecf5117cf 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
 	STREG   %r19,PT_SR7(%r16)
 
 intr_return:
-	/* NOTE: Need to enable interrupts incase we schedule. */
-	ssm     PSW_SM_I, %r0
-
 	/* check for reschedule */
 	mfctl   %cr30,%r1
 	LDREG   TI_FLAGS(%r1),%r19	/* sched.h: TIF_NEED_RESCHED */
@@ -907,6 +904,11 @@ intr_check_sig:
 	LDREG	PT_IASQ1(%r16), %r20
 	cmpib,COND(=),n 0,%r20,intr_restore /* backward */
 
+	/* NOTE: We need to enable interrupts if we have to deliver
+	 * signals. We used to do this earlier but it caused kernel
+	 * stack overflows. */
+	ssm     PSW_SM_I, %r0
+
 	copy	%r0, %r25			/* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29			/* Reference param save area */
@@ -958,6 +960,10 @@ intr_do_resched:
 	cmpib,COND(=)	0, %r20, intr_do_preempt
 	nop
 
+	/* NOTE: We need to enable interrupts if we schedule.  We used
+	 * to do this earlier but it caused kernel stack overflows. */
+	ssm     PSW_SM_I, %r0
+
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29		/* Reference param save area */
 #endif

From da57c5414f49ef9e4bcb9ae0bbafd1d650b31411 Mon Sep 17 00:00:00 2001
From: John David Anglin <dave.anglin@bell.net>
Date: Mon, 13 Nov 2017 19:35:33 -0500
Subject: [PATCH 195/808] parisc: Reduce thread stack to 16 kb

In testing, I found that the thread stack can be 16 kB when using an irq
stack.  Without it, the thread stack needs to be 32 kB. Currently, the irq
stack is 32 kB. While it probably could be 16 kB, I would prefer to leave it
as is for safety.

Signed-off-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/include/asm/thread_info.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index c980a02a52bc..598c8d60fa5e 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -35,7 +35,12 @@ struct thread_info {
 
 /* thread information allocation */
 
+#ifdef CONFIG_IRQSTACKS
+#define THREAD_SIZE_ORDER	2 /* PA-RISC requires at least 16k stack */
+#else
 #define THREAD_SIZE_ORDER	3 /* PA-RISC requires at least 32k stack */
+#endif
+
 /* Be sure to hunt all references to this down when you change the size of
  * the kernel stack */
 #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER)

From 36b0cb84ee858f02c256d26f0cb4229c78e3399e Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhang.lyra@gmail.com>
Date: Fri, 1 Dec 2017 03:51:04 +0100
Subject: [PATCH 196/808] ARM: 8731/1: Fix csum_partial_copy_from_user() stack
 mismatch

An additional 'ip' will be pushed to the stack, for restoring the
DACR later, if CONFIG_CPU_SW_DOMAIN_PAN defined.

However, the fixup still get the err_ptr by add #8*4 to sp, which
results in the fact that the code area pointed by the LR will be
overwritten, or the kernel will crash if CONFIG_DEBUG_RODATA is enabled.

This patch fixes the stack mismatch.

Fixes: a5e090acbf54 ("ARM: software-based priviledged-no-access support")
Signed-off-by: Lvqiang Huang <Lvqiang.Huang@spreadtrum.com>
Signed-off-by: Chunyan Zhang <zhang.lyra@gmail.com>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 arch/arm/lib/csumpartialcopyuser.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S
index 1712f132b80d..b83fdc06286a 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -85,7 +85,11 @@
 		.pushsection .text.fixup,"ax"
 		.align	4
 9001:		mov	r4, #-EFAULT
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+		ldr	r5, [sp, #9*4]		@ *err_ptr
+#else
 		ldr	r5, [sp, #8*4]		@ *err_ptr
+#endif
 		str	r4, [r5]
 		ldmia	sp, {r1, r2}		@ retrieve dst, len
 		add	r2, r2, r1

From d82c3682168431d29ba1741d0cd5ef45c68bf8e0 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@free-electrons.com>
Date: Mon, 18 Dec 2017 08:26:28 +0100
Subject: [PATCH 197/808] mtd: Fix mtd_check_oob_ops()

The mtd_check_oob_ops() helper verifies if the operation defined by the
user is correct.

Fix the check that verifies if the entire requested area exists. This
check is too restrictive and will fail anytime the last data byte of the
very last page is included in an operation.

Fixes: 5cdd929da53d ("mtd: Add sanity checks in mtd_write/read_oob()")
Signed-off-by: Miquel Raynal <miquel.raynal@free-electrons.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/mtdcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index f80e911b8843..73b605577447 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1114,7 +1114,7 @@ static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs,
 	if (!ops->oobbuf)
 		ops->ooblen = 0;
 
-	if (offs < 0 || offs + ops->len >= mtd->size)
+	if (offs < 0 || offs + ops->len > mtd->size)
 		return -EINVAL;
 
 	if (ops->ooblen) {

From bfe766cf65fb65e68c4764f76158718560bdcee5 Mon Sep 17 00:00:00 2001
From: Julien Thierry <julien.thierry@arm.com>
Date: Wed, 6 Dec 2017 17:09:49 +0000
Subject: [PATCH 198/808] arm64: kvm: Prevent restoring stale PMSCR_EL1 for
 vcpu

When VHE is not present, KVM needs to save and restores PMSCR_EL1 when
possible. If SPE is used by the host, value of PMSCR_EL1 cannot be saved
for the guest.
If the host starts using SPE between two save+restore on the same vcpu,
restore will write the value of PMSCR_EL1 read during the first save.

Make sure __debug_save_spe_nvhe clears the value of the saved PMSCR_EL1
when the guest cannot use SPE.

Signed-off-by: Julien Thierry <julien.thierry@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kvm/hyp/debug-sr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 321c9c05dd9e..f4363d40e2cd 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
 {
 	u64 reg;
 
+	/* Clear pmscr in case of early return */
+	*pmscr_el1 = 0;
+
 	/* SPE present on this CPU? */
 	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
 						  ID_AA64DFR0_PMSVER_SHIFT))

From 7839c672e58bf62da8f2f0197fefb442c02ba1dd Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 7 Dec 2017 11:45:45 +0000
Subject: [PATCH 199/808] KVM: arm/arm64: Fix HYP unmapping going off limits

When we unmap the HYP memory, we try to be clever and unmap one
PGD at a time. If we start with a non-PGD aligned address and try
to unmap a whole PGD, things go horribly wrong in unmap_hyp_range
(addr and end can never match, and it all goes really badly as we
keep incrementing pgd and parse random memory as page tables...).

The obvious fix is to let unmap_hyp_range do what it does best,
which is to iterate over a range.

The size of the linear mapping, which begins at PAGE_OFFSET, can be
easily calculated by subtracting PAGE_OFFSET form high_memory, because
high_memory is defined as the linear map address of the last byte of
DRAM, plus one.

The size of the vmalloc region is given trivially by VMALLOC_END -
VMALLOC_START.

Cc: stable@vger.kernel.org
Reported-by: Andre Przywara <andre.przywara@arm.com>
Tested-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/mmu.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b36945d49986..b4b69c2d1012 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -509,8 +509,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
  */
 void free_hyp_pgds(void)
 {
-	unsigned long addr;
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
 
 	if (boot_hyp_pgd) {
@@ -521,10 +519,10 @@ void free_hyp_pgds(void)
 
 	if (hyp_pgd) {
 		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
-		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
+		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
+				(uintptr_t)high_memory - PAGE_OFFSET);
+		unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START),
+				VMALLOC_END - VMALLOC_START);
 
 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 		hyp_pgd = NULL;

From f384dcfe4d918c1d80477d290c22ce0093823771 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 7 Dec 2017 11:46:15 +0000
Subject: [PATCH 200/808] KVM: arm/arm64: timer: Don't set irq as forwarded if
 no usable GIC

If we don't have a usable GIC, do not try to set the vcpu affinity
as this is guaranteed to fail.

Reported-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Tested-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/kvm/arm_arch_timer.h |  2 +-
 virt/kvm/arm/arch_timer.c    | 13 ++++++++-----
 virt/kvm/arm/arm.c           |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 6e45608b2399..9da6ce22803f 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -62,7 +62,7 @@ struct arch_timer_cpu {
 	bool			enabled;
 };
 
-int kvm_timer_hyp_init(void);
+int kvm_timer_hyp_init(bool);
 int kvm_timer_enable(struct kvm_vcpu *vcpu);
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index f9555b1e7f15..aa9adfafe12b 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -720,7 +720,7 @@ static int kvm_timer_dying_cpu(unsigned int cpu)
 	return 0;
 }
 
-int kvm_timer_hyp_init(void)
+int kvm_timer_hyp_init(bool has_gic)
 {
 	struct arch_timer_kvm_info *info;
 	int err;
@@ -756,10 +756,13 @@ int kvm_timer_hyp_init(void)
 		return err;
 	}
 
-	err = irq_set_vcpu_affinity(host_vtimer_irq, kvm_get_running_vcpus());
-	if (err) {
-		kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-		goto out_free_irq;
+	if (has_gic) {
+		err = irq_set_vcpu_affinity(host_vtimer_irq,
+					    kvm_get_running_vcpus());
+		if (err) {
+			kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+			goto out_free_irq;
+		}
 	}
 
 	kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 6b60c98a6e22..2e43f9d42bd5 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1326,7 +1326,7 @@ static int init_subsystems(void)
 	/*
 	 * Init HYP architected timer support
 	 */
-	err = kvm_timer_hyp_init();
+	err = kvm_timer_hyp_init(vgic_present);
 	if (err)
 		goto out;
 

From 36e5cfd410ad6060b527e51d1b4bc174a8068cfd Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Thu, 14 Dec 2017 19:54:50 +0100
Subject: [PATCH 201/808] KVM: arm/arm64: Properly handle arch-timer IRQs after
 vtimer_save_state

The recent timer rework was assuming that once the timer was disabled,
we should no longer see any interrupts from the timer.  This assumption
turns out to not be true, and instead we have to handle the case when
the timer ISR runs even after the timer has been disabled.

This requires a couple of changes:

First, we should never overwrite the cached guest state of the timer
control register when the ISR runs, because KVM may have disabled its
timers when doing vcpu_put(), even though the guest still had the timer
enabled.

Second, we shouldn't assume that the timer is actually firing just
because we see an interrupt, but we should check the actual state of the
timer in the timer control register to understand if the hardware timer
is really firing or not.

We also add an ISB to vtimer_save_state() to ensure the timer is
actually disabled once we enable interrupts, which should clarify the
intention of the implementation, and reduce the risk of unwanted
interrupts.

Fixes: b103cc3f10c0 ("KVM: arm/arm64: Avoid timer save/restore in vcpu entry/exit")
Reported-by: Marc Zyngier <marc.zyngier@arm.com>
Reported-by: Jia He <hejianet@gmail.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arch_timer.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index aa9adfafe12b..14c018f990a7 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -92,16 +92,23 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
 	struct arch_timer_context *vtimer;
+	u32 cnt_ctl;
+
+	/*
+	 * We may see a timer interrupt after vcpu_put() has been called which
+	 * sets the CPU's vcpu pointer to NULL, because even though the timer
+	 * has been disabled in vtimer_save_state(), the hardware interrupt
+	 * signal may not have been retired from the interrupt controller yet.
+	 */
+	if (!vcpu)
+		return IRQ_HANDLED;
 
-	if (!vcpu) {
-		pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n");
-		return IRQ_NONE;
-	}
 	vtimer = vcpu_vtimer(vcpu);
-
 	if (!vtimer->irq.level) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		if (kvm_timer_irq_can_fire(vtimer))
+		cnt_ctl = read_sysreg_el0(cntv_ctl);
+		cnt_ctl &= ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT |
+			   ARCH_TIMER_CTRL_IT_MASK;
+		if (cnt_ctl == (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))
 			kvm_timer_update_irq(vcpu, true, vtimer);
 	}
 
@@ -355,6 +362,7 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
 
 	/* Disable the virtual timer */
 	write_sysreg_el0(0, cntv_ctl);
+	isb();
 
 	vtimer->loaded = false;
 out:

From 0eb7c33cadf6b2f1a94e58ded8b0eb89b4eba382 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 15 Dec 2017 00:30:12 +0100
Subject: [PATCH 202/808] KVM: arm/arm64: Fix timer enable flow

When enabling the timer on the first run, we fail to ever restore the
state and mark it as loaded.  That means, that in the initial entry to
the VCPU ioctl, unless we exit to userspace for some reason such as a
pending signal, if the guest programs a timer and blocks, we will wait
forever, because we never read back the hardware state (the loaded flag
is not set), and so we think the timer is disabled, and we never
schedule a background soft timer.

The end result?  The VCPU blocks forever, and the only solution is to
kill the thread.

Fixes: 4a2c4da1250d ("arm/arm64: KVM: Load the timer state when enabling the timer")
Reported-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arch_timer.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 14c018f990a7..cc29a8148328 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -846,10 +846,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 no_vgic:
 	preempt_disable();
 	timer->enabled = 1;
-	if (!irqchip_in_kernel(vcpu->kvm))
-		kvm_timer_vcpu_load_user(vcpu);
-	else
-		kvm_timer_vcpu_load_vgic(vcpu);
+	kvm_timer_vcpu_load(vcpu);
 	preempt_enable();
 
 	return 0;

From 9226665159f0367ad08bc7d5dd194aeadb90316f Mon Sep 17 00:00:00 2001
From: Kailang Yang <kailang@realtek.com>
Date: Thu, 14 Dec 2017 15:28:58 +0800
Subject: [PATCH 203/808] ALSA: hda/realtek - Fix Dell AIO LineOut issue

Dell AIO had LineOut jack.
Add LineOut verb into this patch.

[ Additional notes:
  the ALC274 codec seems requiring the fixed pin / DAC connections for
  HP / line-out pins for enabling EQ for speakers; i.e. the HP / LO
  pins expect to be connected with NID 0x03 while keeping the speaker
  with NID 0x02.  However, by adding a new line-out pin, the
  auto-parser assigns the NID 0x02 for HP/LO pins as primary outputs.
  As an easy workaround, we provide the preferred_pairs[] to map
  forcibly for these pins. -- tiwai ]

Fixes: 75ee94b20b46 ("ALSA: hda - fix headset mic problem for Dell machines with alc274")
Signed-off-by: Kailang Yang <kailang@realtek.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 4b21f71d685c..6a4db00511ab 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -5185,6 +5185,22 @@ static void alc233_alc662_fixup_lenovo_dual_codecs(struct hda_codec *codec,
 	}
 }
 
+/* Forcibly assign NID 0x03 to HP/LO while NID 0x02 to SPK for EQ */
+static void alc274_fixup_bind_dacs(struct hda_codec *codec,
+				    const struct hda_fixup *fix, int action)
+{
+	struct alc_spec *spec = codec->spec;
+	static hda_nid_t preferred_pairs[] = {
+		0x21, 0x03, 0x1b, 0x03, 0x16, 0x02,
+		0
+	};
+
+	if (action != HDA_FIXUP_ACT_PRE_PROBE)
+		return;
+
+	spec->gen.preferred_dacs = preferred_pairs;
+}
+
 /* for hda_fixup_thinkpad_acpi() */
 #include "thinkpad_helper.c"
 
@@ -5302,6 +5318,8 @@ enum {
 	ALC233_FIXUP_LENOVO_MULTI_CODECS,
 	ALC294_FIXUP_LENOVO_MIC_LOCATION,
 	ALC700_FIXUP_INTEL_REFERENCE,
+	ALC274_FIXUP_DELL_BIND_DACS,
+	ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -6112,6 +6130,21 @@ static const struct hda_fixup alc269_fixups[] = {
 			{}
 		}
 	},
+	[ALC274_FIXUP_DELL_BIND_DACS] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = alc274_fixup_bind_dacs,
+		.chained = true,
+		.chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE
+	},
+	[ALC274_FIXUP_DELL_AIO_LINEOUT_VERB] = {
+		.type = HDA_FIXUP_PINS,
+		.v.pins = (const struct hda_pintbl[]) {
+			{ 0x1b, 0x0401102f },
+			{ }
+		},
+		.chained = true,
+		.chain_id = ALC274_FIXUP_DELL_BIND_DACS
+	},
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -6578,7 +6611,7 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
 		{0x14, 0x90170110},
 		{0x1b, 0x90a70130},
 		{0x21, 0x03211020}),
-	SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE,
+	SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
 		{0x12, 0xb7a60130},
 		{0x13, 0xb8a61140},
 		{0x16, 0x90170110},

From 5839ee7389e893a31e4e3c9cf17b50d14103c902 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 15 Dec 2017 03:07:18 +0100
Subject: [PATCH 204/808] PCI / PM: Force devices to D0 in pci_pm_thaw_noirq()

It is incorrect to call pci_restore_state() for devices in low-power
states (D1-D3), as that involves the restoration of MSI setup which
requires MMIO to be operational and that is only the case in D0.

However, pci_pm_thaw_noirq() may do that if the driver's "freeze"
callbacks put the device into a low-power state, so fix it by making
it force devices into D0 via pci_set_power_state() instead of trying
to "update" their power state which is pointless.

Fixes: e60514bd4485 (PCI/PM: Restore the status of PCI devices across hibernation)
Cc: 4.13+ <stable@vger.kernel.org> # 4.13+
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Maarten Lankhorst <dev@mblankhorst.nl>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Maarten Lankhorst <dev@mblankhorst.nl>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci-driver.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 945099d49f8f..14fd865a5120 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1012,7 +1012,12 @@ static int pci_pm_thaw_noirq(struct device *dev)
 	if (pci_has_legacy_pm_support(pci_dev))
 		return pci_legacy_resume_early(dev);
 
-	pci_update_current_state(pci_dev, PCI_D0);
+	/*
+	 * pci_restore_state() requires the device to be in D0 (because of MSI
+	 * restoration among other things), so force it into D0 in case the
+	 * driver's "freeze" callbacks put it into a low-power state directly.
+	 */
+	pci_set_power_state(pci_dev, PCI_D0);
 	pci_restore_state(pci_dev);
 
 	if (drv && drv->pm && drv->pm->thaw_noirq)

From ccc153a6de1f7741b5ef7c996f9be133772b2092 Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Mon, 11 Dec 2017 14:19:00 +0100
Subject: [PATCH 205/808] cpufreq: imx6q: fix speed grading regression on i.MX6
 QuadPlus

The commit moving the speed grading check to the cpufreq driver introduced
some additional checks, so the OPP disable is only attempted on SoCs where
those OPPs are present. The compatible checks are missing the QuadPlus
compatible, so invalid OPPs are not correctly disabled there.

Move both checks to a single condition, so we don't need to sprinkle even
more calls to of_machine_is_compatible().

Fixes: 2b3d58a3adca (cpufreq: imx6q: Move speed grading check to cpufreq driver)
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 628fe899cb48..d9b2c2de49c4 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -226,17 +226,18 @@ static void imx6q_opp_check_speed_grading(struct device *dev)
 	val >>= OCOTP_CFG3_SPEED_SHIFT;
 	val &= 0x3;
 
-	if ((val != OCOTP_CFG3_SPEED_1P2GHZ) &&
-	     of_machine_is_compatible("fsl,imx6q"))
-		if (dev_pm_opp_disable(dev, 1200000000))
-			dev_warn(dev, "failed to disable 1.2GHz OPP\n");
 	if (val < OCOTP_CFG3_SPEED_996MHZ)
 		if (dev_pm_opp_disable(dev, 996000000))
 			dev_warn(dev, "failed to disable 996MHz OPP\n");
-	if (of_machine_is_compatible("fsl,imx6q")) {
+
+	if (of_machine_is_compatible("fsl,imx6q") ||
+	    of_machine_is_compatible("fsl,imx6qp")) {
 		if (val != OCOTP_CFG3_SPEED_852MHZ)
 			if (dev_pm_opp_disable(dev, 852000000))
 				dev_warn(dev, "failed to disable 852MHz OPP\n");
+		if (val != OCOTP_CFG3_SPEED_1P2GHZ)
+			if (dev_pm_opp_disable(dev, 1200000000))
+				dev_warn(dev, "failed to disable 1.2GHz OPP\n");
 	}
 	iounmap(base);
 put_node:

From 56026645e2b6f11ede34a5e6ab69d3eb56f9c8fc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 18 Dec 2017 02:15:32 +0100
Subject: [PATCH 206/808] cpufreq: governor: Ensure sufficiently large sampling
 intervals

After commit aa7519af450d (cpufreq: Use transition_delay_us for legacy
governors as well) the sampling_rate field of struct dbs_data may be
less than the tick period which causes dbs_update() to produce
incorrect results, so make the code ensure that the value of that
field will always be sufficiently large.

Fixes: aa7519af450d (cpufreq: Use transition_delay_us for legacy governors as well)
Reported-by: Andy Tang <andy.tang@nxp.com>
Reported-by: Doug Smythies <dsmythies@telus.net>
Tested-by: Andy Tang <andy.tang@nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq_governor.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 58d4f4e1ad6a..ca38229b045a 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -22,6 +22,8 @@
 
 #include "cpufreq_governor.h"
 
+#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL	(2 * TICK_NSEC / NSEC_PER_USEC)
+
 static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
 
 static DEFINE_MUTEX(gov_dbs_data_mutex);
@@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
 {
 	struct dbs_data *dbs_data = to_dbs_data(attr_set);
 	struct policy_dbs_info *policy_dbs;
+	unsigned int sampling_interval;
 	int ret;
-	ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
-	if (ret != 1)
+
+	ret = sscanf(buf, "%u", &sampling_interval);
+	if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
 		return -EINVAL;
 
+	dbs_data->sampling_rate = sampling_interval;
+
 	/*
 	 * We are operating under dbs_data->mutex and so the list and its
 	 * entries can't be freed concurrently.
@@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto free_policy_dbs_info;
 
-	dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
+	/*
+	 * The sampling interval should not be less than the transition latency
+	 * of the CPU and it also cannot be too small for dbs_update() to work
+	 * correctly.
+	 */
+	dbs_data->sampling_rate = max_t(unsigned int,
+					CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
+					cpufreq_policy_transition_delay_us(policy));
 
 	if (!have_governor_per_policy())
 		gov->gdbs_data = dbs_data;

From 951ef0e19f0736b45d1c4d81f4dfa04a43f87df5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 8 Dec 2017 23:59:49 +0000
Subject: [PATCH 207/808] ACPI: CPPC: remove initial assignment of pcc_ss_data

The initialization of pcc_ss_data from pcc_data[pcc_ss_id] before
pcc_ss_id is being range checked could lead to an out-of-bounds array
read.  This very same initialization is also being performed after
the range check on pcc_ss_id, so we can just remove this problematic
and also redundant assignment to fix the issue.

Detected by cppcheck:
warning: Value stored to 'pcc_ss_data' during its initialization is never
read

Fixes: 85b1407bf6d2 (ACPI / CPPC: Make CPPC ACPI driver aware of PCC subspace IDs)
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 30e84cc600ae..06ea4749ebd9 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1171,7 +1171,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
 	struct cpc_register_resource *desired_reg;
 	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
-	struct cppc_pcc_data *pcc_ss_data = pcc_data[pcc_ss_id];
+	struct cppc_pcc_data *pcc_ss_data;
 	int ret = 0;
 
 	if (!cpc_desc || pcc_ss_id < 0) {

From bb82e0b4a7e96494f0c1004ce50cec3d7b5fb3d1 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 14 Dec 2017 13:31:16 +0100
Subject: [PATCH 208/808] ACPI: APEI / ERST: Fix missing error handling in
 erst_reader()

The commit f6f828513290 ("pstore: pass allocated memory region back to
caller") changed the check of the return value from erst_read() in
erst_reader() in the following way:

        if (len == -ENOENT)
                goto skip;
-       else if (len < 0) {
-               rc = -1;
+       else if (len < sizeof(*rcd)) {
+               rc = -EIO;
                goto out;

This introduced another bug: since the comparison with sizeof() is
cast to unsigned, a negative len value doesn't hit any longer.
As a result, when an error is returned from erst_read(), the code
falls through, and it may eventually lead to some weird thing like
memory corruption.

This patch adds the negative error value check more explicitly for
addressing the issue.

Fixes: f6f828513290 (pstore: pass allocated memory region back to caller)
Cc: All applicable <stable@vger.kernel.org>
Tested-by: Jerry Tang <jtang@suse.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/apei/erst.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6742f6c68034..9bff853e85f3 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -1007,7 +1007,7 @@ skip:
 	/* The record may be cleared by others, try read next record */
 	if (len == -ENOENT)
 		goto skip;
-	else if (len < sizeof(*rcd)) {
+	else if (len < 0 || len < sizeof(*rcd)) {
 		rc = -EIO;
 		goto out;
 	}

From e39d200fa5bf5b94a0948db0dae44c1b73b84a56 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 14 Dec 2017 17:40:50 -0800
Subject: [PATCH 209/808] KVM: Fix stack-out-of-bounds read in write_mmio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported by syzkaller:

  BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm]
  Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298

  CPU: 6 PID: 32298 Comm: syz-executor Tainted: G           OE    4.15.0-rc2+ #18
  Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016
  Call Trace:
   dump_stack+0xab/0xe1
   print_address_description+0x6b/0x290
   kasan_report+0x28a/0x370
   write_mmio+0x11e/0x270 [kvm]
   emulator_read_write_onepage+0x311/0x600 [kvm]
   emulator_read_write+0xef/0x240 [kvm]
   emulator_fix_hypercall+0x105/0x150 [kvm]
   em_hypercall+0x2b/0x80 [kvm]
   x86_emulate_insn+0x2b1/0x1640 [kvm]
   x86_emulate_instruction+0x39a/0xb90 [kvm]
   handle_exception+0x1b4/0x4d0 [kvm_intel]
   vcpu_enter_guest+0x15a0/0x2640 [kvm]
   kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm]
   kvm_vcpu_ioctl+0x479/0x880 [kvm]
   do_vfs_ioctl+0x142/0x9a0
   SyS_ioctl+0x74/0x80
   entry_SYSCALL_64_fastpath+0x23/0x9a

The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall)
to the guest memory, however, write_mmio tracepoint always prints 8 bytes
through *(u64 *)val since kvm splits the mmio access into 8 bytes. This
leaks 5 bytes from the kernel stack (CVE-2017-17741).  This patch fixes
it by just accessing the bytes which we operate on.

Before patch:

syz-executor-5567  [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f

After patch:

syz-executor-13416 [002] .... 51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c         | 8 ++++----
 include/trace/events/kvm.h | 7 +++++--
 virt/kvm/arm/mmio.c        | 6 +++---
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a82f2d4333b..1cec2c62a0b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 					 addr, n, v))
 		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
 			break;
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
 		handled += n;
 		addr += n;
 		len -= n;
@@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
 	if (vcpu->mmio_read_completed) {
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-			       vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+			       vcpu->mmio_fragments[0].gpa, val);
 		vcpu->mmio_read_completed = 0;
 		return 1;
 	}
@@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
 {
-	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
 	return vcpu_mmio_write(vcpu, gpa, bytes, val);
 }
 
 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  void *val, int bytes)
 {
-	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
 	return X86EMUL_IO_NEEDED;
 }
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index e4b0b8e09932..2c735a3e6613 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq,
 	{ KVM_TRACE_MMIO_WRITE, "write" }
 
 TRACE_EVENT(kvm_mmio,
-	TP_PROTO(int type, int len, u64 gpa, u64 val),
+	TP_PROTO(int type, int len, u64 gpa, void *val),
 	TP_ARGS(type, len, gpa, val),
 
 	TP_STRUCT__entry(
@@ -225,7 +225,10 @@ TRACE_EVENT(kvm_mmio,
 		__entry->type		= type;
 		__entry->len		= len;
 		__entry->gpa		= gpa;
-		__entry->val		= val;
+		__entry->val		= 0;
+		if (val)
+			memcpy(&__entry->val, val,
+			       min_t(u32, sizeof(__entry->val), len));
 	),
 
 	TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index b6e715fd3c90..dac7ceb1a677 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -112,7 +112,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		}
 
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-			       data);
+			       &data);
 		data = vcpu_data_host_to_guest(vcpu, data, len);
 		vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
 	}
@@ -182,14 +182,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
 					       len);
 
-		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
+		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
 		kvm_mmio_write_buf(data_buf, len, data);
 
 		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
 				       data_buf);
 	} else {
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
-			       fault_ipa, 0);
+			       fault_ipa, NULL);
 
 		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
 				      data_buf);

From 9d5f38ba6c82359b7cec31fb27fb78ecc02f3946 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 15 Dec 2017 10:20:12 -0600
Subject: [PATCH 210/808] x86/mm: Unbreak modules that use the DMA API

Commit d8aa7eea78a1 ("x86/mm: Add Secure Encrypted Virtualization (SEV)
support") changed sme_active() from an inline function that referenced
sme_me_mask to a non-inlined function in order to make the sev_enabled
variable a static variable.  This function was marked EXPORT_SYMBOL_GPL
because at the time the patch was submitted, sme_me_mask was marked
EXPORT_SYMBOL_GPL.

Commit 87df26175e67 ("x86/mm: Unbreak modules that rely on external
PAGE_KERNEL availability") changed sme_me_mask variable from
EXPORT_SYMBOL_GPL to EXPORT_SYMBOL, allowing external modules the ability
to build with CONFIG_AMD_MEM_ENCRYPT=y.  Now, however, with sev_active()
no longer an inline function and marked as EXPORT_SYMBOL_GPL, external
modules that use the DMA API are once again broken in 4.15. Since the DMA
API is meant to be used by external modules, this needs to be changed.

Change the sme_active() and sev_active() functions from EXPORT_SYMBOL_GPL
to EXPORT_SYMBOL.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Link: https://lkml.kernel.org/r/20171215162011.14125.7113.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/mm/mem_encrypt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index d9a9e9fc75dd..391b13402e40 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -405,13 +405,13 @@ bool sme_active(void)
 {
 	return sme_me_mask && !sev_enabled;
 }
-EXPORT_SYMBOL_GPL(sme_active);
+EXPORT_SYMBOL(sme_active);
 
 bool sev_active(void)
 {
 	return sme_me_mask && sev_enabled;
 }
-EXPORT_SYMBOL_GPL(sev_active);
+EXPORT_SYMBOL(sev_active);
 
 static const struct dma_map_ops sev_dma_ops = {
 	.alloc                  = sev_alloc,

From bf29cb238dc0656e6564b6a94bb82e11d2129437 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2017 19:18:25 +0100
Subject: [PATCH 211/808] sched/isolation: Make CONFIG_NO_HZ_FULL select
 CONFIG_CPU_ISOLATION

CONFIG_NO_HZ_FULL doesn't make sense without CONFIG_CPU_ISOLATION. In
fact enabling the first without the second is a regression as nohz_full=
boot parameter gets silently ignored.

Besides this unnatural combination hangs RCU gp kthread when running
rcutorture for reasons that are not yet fully understood:

	rcu_preempt kthread starved for 9974 jiffies! g4294967208
	+c4294967207 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=0
	rcu_preempt     I 7464     8      2 0x80000000
	Call Trace:
		__schedule+0x493/0x620
		schedule+0x24/0x40
		schedule_timeout+0x330/0x3b0
		? preempt_count_sub+0xea/0x140
		? collect_expired_timers+0xb0/0xb0
		rcu_gp_kthread+0x6bf/0xef0

This commit therefore makes NO_HZ_FULL select CPU_ISOLATION, which
prevents all these bad behaviours.

Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Fixes: 5c4991e24c69 ("sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL")
Link: http://lkml.kernel.org/r/1513275507-29200-2-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
 	select RCU_NOCB_CPU
 	select VIRT_CPU_ACCOUNTING_GEN
 	select IRQ_WORK
+	select CPU_ISOLATION
 	help
 	 Adaptively try to shutdown the tick whenever possible, even when
 	 the CPU is running tasks. Typically this requires running a single

From 2c43838c99d9d23f17eb2bdadafcb2879cca6995 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 14 Dec 2017 19:18:26 +0100
Subject: [PATCH 212/808] sched/isolation: Enable CONFIG_CPU_ISOLATION=y by
 default

The "isolcpus=" boot parameter support was always built-in before we
moved the related code under CONFIG_CPU_ISOLATION. Having it disabled by
default is very confusing for people accustomed to use this parameter.

So enable it by dafault to keep the previous behaviour but keep it
optable for those who want to tinify their kernels.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: kernel test robot <xiaolong.ye@intel.com>
Link: http://lkml.kernel.org/r/1513275507-29200-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 2934249fba46..690a381adee0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,10 +461,14 @@ endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
 	bool "CPU isolation"
+	default y
 	help
 	  Make sure that CPUs running critical tasks are not disturbed by
 	  any source of "noise" such as unbound workqueues, timers, kthreads...
-	  Unbound jobs get offloaded to housekeeping CPUs.
+	  Unbound jobs get offloaded to housekeeping CPUs. This is driven by
+	  the "isolcpus=" boot parameter.
+
+	  Say Y if unsure.
 
 source "kernel/rcu/Kconfig"
 

From d94d105329e4a8a874853b5bd854b6587c41adda Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 14 Dec 2017 19:18:27 +0100
Subject: [PATCH 213/808] sched/isolation: Document boot parameters dependency
 on CONFIG_CPU_ISOLATION=y

The "isolcpus=" and "nohz_full=" boot parameters depend on CPU Isolation
support. Let's document that.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: kernel test robot <xiaolong.ye@intel.com>
Link: http://lkml.kernel.org/r/1513275507-29200-4-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.rst | 1 +
 Documentation/admin-guide/kernel-parameters.txt | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index b2598cc9834c..7242cbda15dd 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -109,6 +109,7 @@ parameter is applicable::
 	IPV6	IPv6 support is enabled.
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
+	ISOL	CPU Isolation is enabled.
 	JOY	Appropriate joystick support is enabled.
 	KGDB	Kernel debugger support is enabled.
 	KVM	Kernel Virtual Machine support is enabled.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..168310707ec2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1737,7 +1737,7 @@
 	isapnp=		[ISAPNP]
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
 
-	isolcpus=	[KNL,SMP] Isolate a given set of CPUs from disturbance.
+	isolcpus=	[KNL,SMP,ISOL] Isolate a given set of CPUs from disturbance.
 			[Deprecated - use cpusets instead]
 			Format: [flag-list,]<cpu-list>
 
@@ -2662,7 +2662,7 @@
 			Valid arguments: on, off
 			Default: on
 
-	nohz_full=	[KNL,BOOT]
+	nohz_full=	[KNL,BOOT,SMP,ISOL]
 			The argument is a cpu list, as described above.
 			In kernels built with CONFIG_NO_HZ_FULL=y, set
 			the specified list of CPUs whose tick will be stopped

From 869b5567e12f63ea7407f81728ca87f8c0abbfdb Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Tue, 14 Nov 2017 06:53:32 -0700
Subject: [PATCH 214/808] vmbus: unregister device_obj->channels_kset

Without the patch, a device can't be thoroughly destroyed, because
vmbus_device_register() -> kset_create_and_add() still holds a reference
to the hv_device's device.kobj.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Fixes: c2e5df616e1a ("vmbus: add per-channel sysfs info")
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/vmbus_drv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 76ed9a216f10..610223f0e945 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1378,6 +1378,8 @@ void vmbus_device_unregister(struct hv_device *device_obj)
 	pr_debug("child device %s unregistered\n",
 		dev_name(&device_obj->device));
 
+	kset_unregister(device_obj->channels_kset);
+
 	/*
 	 * Kick off the process of unregistering the device.
 	 * This will call vmbus_remove() and eventually vmbus_device_release()

From 7f3dc0088b98533f17128058fac73cd8b2752ef1 Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@android.com>
Date: Mon, 27 Nov 2017 09:32:33 -0800
Subject: [PATCH 215/808] binder: fix proc->files use-after-free

proc->files cleanup is initiated by binder_vma_close. Therefore
a reference on the binder_proc is not enough to prevent the
files_struct from being released while the binder_proc still has
a reference. This can lead to an attempt to dereference the
stale pointer obtained from proc->files prior to proc->files
cleanup. This has been seen once in task_get_unused_fd_flags()
when __alloc_fd() is called with a stale "files".

The fix is to protect proc->files with a mutex to prevent cleanup
while in use.

Signed-off-by: Todd Kjos <tkjos@google.com>
Cc: stable <stable@vger.kernel.org> # 4.14
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c | 44 ++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index bccec9de0533..a7ecfde66b7b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -482,7 +482,8 @@ enum binder_deferred_state {
  * @tsk                   task_struct for group_leader of process
  *                        (invariant after initialized)
  * @files                 files_struct for process
- *                        (invariant after initialized)
+ *                        (protected by @files_lock)
+ * @files_lock            mutex to protect @files
  * @deferred_work_node:   element for binder_deferred_list
  *                        (protected by binder_deferred_lock)
  * @deferred_work:        bitmap of deferred work to perform
@@ -530,6 +531,7 @@ struct binder_proc {
 	int pid;
 	struct task_struct *tsk;
 	struct files_struct *files;
+	struct mutex files_lock;
 	struct hlist_node deferred_work_node;
 	int deferred_work;
 	bool is_dead;
@@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
 
 static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 {
-	struct files_struct *files = proc->files;
 	unsigned long rlim_cur;
 	unsigned long irqs;
+	int ret;
 
-	if (files == NULL)
-		return -ESRCH;
-
-	if (!lock_task_sighand(proc->tsk, &irqs))
-		return -EMFILE;
-
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		ret = -ESRCH;
+		goto err;
+	}
+	if (!lock_task_sighand(proc->tsk, &irqs)) {
+		ret = -EMFILE;
+		goto err;
+	}
 	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
 	unlock_task_sighand(proc->tsk, &irqs);
 
-	return __alloc_fd(files, 0, rlim_cur, flags);
+	ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
+err:
+	mutex_unlock(&proc->files_lock);
+	return ret;
 }
 
 /*
@@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 static void task_fd_install(
 	struct binder_proc *proc, unsigned int fd, struct file *file)
 {
+	mutex_lock(&proc->files_lock);
 	if (proc->files)
 		__fd_install(proc->files, fd, file);
+	mutex_unlock(&proc->files_lock);
 }
 
 /*
@@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 {
 	int retval;
 
-	if (proc->files == NULL)
-		return -ESRCH;
-
+	mutex_lock(&proc->files_lock);
+	if (proc->files == NULL) {
+		retval = -ESRCH;
+		goto err;
+	}
 	retval = __close_fd(proc->files, fd);
 	/* can't restart close syscall because file table entry was cleared */
 	if (unlikely(retval == -ERESTARTSYS ||
@@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 		     retval == -ERESTARTNOHAND ||
 		     retval == -ERESTART_RESTARTBLOCK))
 		retval = -EINTR;
-
+err:
+	mutex_unlock(&proc->files_lock);
 	return retval;
 }
 
@@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
 	ret = binder_alloc_mmap_handler(&proc->alloc, vma);
 	if (ret)
 		return ret;
+	mutex_lock(&proc->files_lock);
 	proc->files = get_files_struct(current);
+	mutex_unlock(&proc->files_lock);
 	return 0;
 
 err_bad_arg:
@@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
 	spin_lock_init(&proc->outer_lock);
 	get_task_struct(current->group_leader);
 	proc->tsk = current->group_leader;
+	mutex_init(&proc->files_lock);
 	INIT_LIST_HEAD(&proc->todo);
 	proc->default_priority = task_nice(current);
 	binder_dev = container_of(filp->private_data, struct binder_device,
@@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
 
 		files = NULL;
 		if (defer & BINDER_DEFERRED_PUT_FILES) {
+			mutex_lock(&proc->files_lock);
 			files = proc->files;
 			if (files)
 				proc->files = NULL;
+			mutex_unlock(&proc->files_lock);
 		}
 
 		if (defer & BINDER_DEFERRED_FLUSH)

From 5cfee7a357f60675cae32b494bb2096d7203efd3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 8 Nov 2017 11:27:37 +0100
Subject: [PATCH 216/808] perf tools: Use shell function for perl cflags
 retrieval

Using the shell function for perl CFLAGS retrieval instead of back
quotes (``). Both execute shell with the command, but the latter is more
explicit and seems to be the preferred way.

Also we don't have any other use of the back quotes in perf Makefiles.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171108102739.30338-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index ed65e82f034e..710623ddb8af 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -583,7 +583,7 @@ else
   PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
   PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
-  PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+  PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
   FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
   ifneq ($(feature-libperl), 1)

From 61fb26a6a23c0f1a07a0f8a11b54bafb1ac2398b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 4 Dec 2017 12:23:08 -0300
Subject: [PATCH 217/808] perf tools: Fix up build in hardened environments

On Fedora systems the perl and python CFLAGS/LDFLAGS include the
hardened specs from redhat-rpm-config package. We apply them only for
perl/python objects, which makes them not compatible with the rest of
the objects and the build fails with:

  /usr/bin/ld: perf-in.o: relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a shared object; recompile with -f
+PIC
  /usr/bin/ld: libperf.a(libperf-in.o): relocation R_X86_64_32S against `.text' can not be used when making a shared object; recompile w
+ith -fPIC
  /usr/bin/ld: final link failed: Nonrepresentable section on output
  collect2: error: ld returned 1 exit status
  make[2]: *** [Makefile.perf:507: perf] Error 1
  make[1]: *** [Makefile.perf:210: sub-make] Error 2
  make: *** [Makefile:69: all] Error 2

Mainly it's caused by perl/python objects being compiled with:

  -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1

which prevent the final link impossible, because it will check
for 'proper' objects with following option:

  -specs=/usr/lib/rpm/redhat/redhat-hardened-ld

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20171204082437.GC30564@krava
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 710623ddb8af..0294bfb6c5f8 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -188,9 +188,7 @@ ifdef PYTHON_CONFIG
   PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
   PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
   PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-  ifeq ($(CC_NO_CLANG), 1)
-    PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
-  endif
+  PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
   FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 endif
 
@@ -576,7 +574,6 @@ ifndef NO_GTK2
   endif
 endif
 
-
 ifdef NO_LIBPERL
   CFLAGS += -DNO_LIBPERL
 else
@@ -584,6 +581,8 @@ else
   PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
   PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
+  PERL_EMBED_CCOPTS := $(filter-out -specs=%,$(PERL_EMBED_CCOPTS))
+  PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS))
   FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
   ifneq ($(feature-libperl), 1)

From ca58d7e64bdfc54f7dfe46713c1e2acc68d7522d Mon Sep 17 00:00:00 2001
From: Ben Gainey <ben.gainey@arm.com>
Date: Wed, 22 Nov 2017 18:25:41 -0600
Subject: [PATCH 218/808] perf jvmti: Generate correct debug information for
 inlined code

tools/perf/jvmti is broken in so far as it generates incorrect debug
information. Specifically it attributes all debug lines to the original
method being output even in the case that some code is being inlined
from elsewhere.  This patch fixes the issue.

To test (from within linux/tools/perf):

export JDIR=/usr/lib/jvm/java-8-openjdk-amd64/
make
cat << __EOF > Test.java
public class Test
{
    private StringBuilder b = new StringBuilder();

    private void loop(int i, String... args)
    {
        for (String a : args)
            b.append(a);

        long hc = b.hashCode() * System.nanoTime();

        b = new StringBuilder();
        b.append(hc);

        System.out.printf("Iteration %d = %d\n", i, hc);
    }

    public void run(String... args)
    {
        for (int i = 0; i < 10000; ++i)
        {
            loop(i, args);
        }
    }

    public static void main(String... args)
    {
        Test t = new Test();
        t.run(args);
    }
}
__EOF
$JDIR/bin/javac Test.java
./perf record -F 10000 -g -k mono $JDIR/bin/java -agentpath:`pwd`/libperf-jvmti.so Test
./perf inject --jit -i perf.data -o perf.data.jitted
./perf annotate -i perf.data.jitted --stdio | grep Test\.java: | sort -u

Before this patch, Test.java line numbers get reported that are greater
than the number of lines in the Test.java file.  They come from the
source file of the inlined function, e.g. java/lang/String.java:1085.
For further validation one can examine those lines in the JDK source
distribution and confirm that they map to inlined functions called by
Test.java.

After this patch, the filename of the inlined function is output
rather than the incorrect original source filename.

Signed-off-by: Ben Gainey <ben.gainey@arm.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Colin King <colin.king@canonical.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 598b7c6919c7 ("perf jit: add source line info support")
Link: http://lkml.kernel.org/r/20171122182541.d25599a3eb1ada3480d142fa@arm.com
Signed-off-by: Kim Phillips <kim.phillips@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/jvmti/jvmti_agent.c |  16 ++--
 tools/perf/jvmti/jvmti_agent.h |   7 +-
 tools/perf/jvmti/libjvmti.c    | 147 +++++++++++++++++++++++++++------
 3 files changed, 134 insertions(+), 36 deletions(-)

diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index cf36de7ea255..0c6d1002b524 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -384,13 +384,13 @@ jvmti_write_code(void *agent, char const *sym,
 }
 
 int
-jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
-		       jvmti_line_info_t *li, int nr_lines)
+jvmti_write_debug_info(void *agent, uint64_t code,
+    int nr_lines, jvmti_line_info_t *li,
+    const char * const * file_names)
 {
 	struct jr_code_debug_info rec;
-	size_t sret, len, size, flen;
+	size_t sret, len, size, flen = 0;
 	uint64_t addr;
-	const char *fn = file;
 	FILE *fp = agent;
 	int i;
 
@@ -405,7 +405,9 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 		return -1;
 	}
 
-	flen = strlen(file) + 1;
+	for (i = 0; i < nr_lines; ++i) {
+	    flen += strlen(file_names[i]) + 1;
+	}
 
 	rec.p.id        = JIT_CODE_DEBUG_INFO;
 	size            = sizeof(rec);
@@ -421,7 +423,7 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 	 * file[]   : source file name
 	 */
 	size += nr_lines * sizeof(struct debug_entry);
-	size += flen * nr_lines;
+	size += flen;
 	rec.p.total_size = size;
 
 	/*
@@ -452,7 +454,7 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
 		if (sret != 1)
 			goto error;
 
-		sret = fwrite_unlocked(fn, flen, 1, fp);
+		sret = fwrite_unlocked(file_names[i], strlen(file_names[i]) + 1, 1, fp);
 		if (sret != 1)
 			goto error;
 	}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h
index fe32d8344a82..6ed82f6c06dd 100644
--- a/tools/perf/jvmti/jvmti_agent.h
+++ b/tools/perf/jvmti/jvmti_agent.h
@@ -14,6 +14,7 @@ typedef struct {
 	unsigned long	pc;
 	int		line_number;
 	int		discrim; /* discriminator -- 0 for now */
+	jmethodID	methodID;
 } jvmti_line_info_t;
 
 void *jvmti_open(void);
@@ -22,11 +23,9 @@ int   jvmti_write_code(void *agent, char const *symbol_name,
 		       uint64_t vma, void const *code,
 		       const unsigned int code_size);
 
-int   jvmti_write_debug_info(void *agent,
-		             uint64_t code,
-			     const char *file,
+int   jvmti_write_debug_info(void *agent, uint64_t code, int nr_lines,
 			     jvmti_line_info_t *li,
-			     int nr_lines);
+			     const char * const * file_names);
 
 #if defined(__cplusplus)
 }
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index c62c9fc9a525..6add3e982614 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -47,6 +47,7 @@ do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci,
 			tab[lines].pc = (unsigned long)pc;
 			tab[lines].line_number = loc_tab[i].line_number;
 			tab[lines].discrim = 0; /* not yet used */
+			tab[lines].methodID = m;
 			lines++;
 		} else {
 			break;
@@ -125,6 +126,99 @@ get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **
 	return JVMTI_ERROR_NONE;
 }
 
+static void
+copy_class_filename(const char * class_sign, const char * file_name, char * result, size_t max_length)
+{
+	/*
+	* Assume path name is class hierarchy, this is a common practice with Java programs
+	*/
+	if (*class_sign == 'L') {
+		int j, i = 0;
+		char *p = strrchr(class_sign, '/');
+		if (p) {
+			/* drop the 'L' prefix and copy up to the final '/' */
+			for (i = 0; i < (p - class_sign); i++)
+				result[i] = class_sign[i+1];
+		}
+		/*
+		* append file name, we use loops and not string ops to avoid modifying
+		* class_sign which is used later for the symbol name
+		*/
+		for (j = 0; i < (max_length - 1) && file_name && j < strlen(file_name); j++, i++)
+			result[i] = file_name[j];
+
+		result[i] = '\0';
+	} else {
+		/* fallback case */
+		size_t file_name_len = strlen(file_name);
+		strncpy(result, file_name, file_name_len < max_length ? file_name_len : max_length);
+	}
+}
+
+static jvmtiError
+get_source_filename(jvmtiEnv *jvmti, jmethodID methodID, char ** buffer)
+{
+	jvmtiError ret;
+	jclass decl_class;
+	char *file_name = NULL;
+	char *class_sign = NULL;
+	char fn[PATH_MAX];
+	size_t len;
+
+	ret = (*jvmti)->GetMethodDeclaringClass(jvmti, methodID, &decl_class);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetMethodDeclaringClass", ret);
+		return ret;
+	}
+
+	ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetSourceFileName", ret);
+		return ret;
+	}
+
+	ret = (*jvmti)->GetClassSignature(jvmti, decl_class, &class_sign, NULL);
+	if (ret != JVMTI_ERROR_NONE) {
+		print_error(jvmti, "GetClassSignature", ret);
+		goto free_file_name_error;
+	}
+
+	copy_class_filename(class_sign, file_name, fn, PATH_MAX);
+	len = strlen(fn);
+	*buffer = malloc((len + 1) * sizeof(char));
+	if (!*buffer) {
+		print_error(jvmti, "GetClassSignature", ret);
+		ret = JVMTI_ERROR_OUT_OF_MEMORY;
+		goto free_class_sign_error;
+	}
+	strcpy(*buffer, fn);
+	ret = JVMTI_ERROR_NONE;
+
+free_class_sign_error:
+	(*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
+free_file_name_error:
+	(*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
+
+	return ret;
+}
+
+static jvmtiError
+fill_source_filenames(jvmtiEnv *jvmti, int nr_lines,
+		      const jvmti_line_info_t * line_tab,
+		      char ** file_names)
+{
+	int index;
+	jvmtiError ret;
+
+	for (index = 0; index < nr_lines; ++index) {
+		ret = get_source_filename(jvmti, line_tab[index].methodID, &(file_names[index]));
+		if (ret != JVMTI_ERROR_NONE)
+			return ret;
+	}
+
+	return JVMTI_ERROR_NONE;
+}
+
 static void JNICALL
 compiled_method_load_cb(jvmtiEnv *jvmti,
 			jmethodID method,
@@ -135,16 +229,18 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 			const void *compile_info)
 {
 	jvmti_line_info_t *line_tab = NULL;
+	char ** line_file_names = NULL;
 	jclass decl_class;
 	char *class_sign = NULL;
 	char *func_name = NULL;
 	char *func_sign = NULL;
-	char *file_name= NULL;
+	char *file_name = NULL;
 	char fn[PATH_MAX];
 	uint64_t addr = (uint64_t)(uintptr_t)code_addr;
 	jvmtiError ret;
 	int nr_lines = 0; /* in line_tab[] */
 	size_t len;
+	int output_debug_info = 0;
 
 	ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method,
 						&decl_class);
@@ -158,6 +254,19 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 		if (ret != JVMTI_ERROR_NONE) {
 			warnx("jvmti: cannot get line table for method");
 			nr_lines = 0;
+		} else if (nr_lines > 0) {
+			line_file_names = malloc(sizeof(char*) * nr_lines);
+			if (!line_file_names) {
+				warnx("jvmti: cannot allocate space for line table method names");
+			} else {
+				memset(line_file_names, 0, sizeof(char*) * nr_lines);
+				ret = fill_source_filenames(jvmti, nr_lines, line_tab, line_file_names);
+				if (ret != JVMTI_ERROR_NONE) {
+					warnx("jvmti: fill_source_filenames failed");
+				} else {
+					output_debug_info = 1;
+				}
+			}
 		}
 	}
 
@@ -181,33 +290,14 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 		goto error;
 	}
 
-	/*
-	 * Assume path name is class hierarchy, this is a common practice with Java programs
-	 */
-	if (*class_sign == 'L') {
-		int j, i = 0;
-		char *p = strrchr(class_sign, '/');
-		if (p) {
-			/* drop the 'L' prefix and copy up to the final '/' */
-			for (i = 0; i < (p - class_sign); i++)
-				fn[i] = class_sign[i+1];
-		}
-		/*
-		 * append file name, we use loops and not string ops to avoid modifying
-		 * class_sign which is used later for the symbol name
-		 */
-		for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++)
-			fn[i] = file_name[j];
-		fn[i] = '\0';
-	} else {
-		/* fallback case */
-		strcpy(fn, file_name);
-	}
+	copy_class_filename(class_sign, file_name, fn, PATH_MAX);
+
 	/*
 	 * write source line info record if we have it
 	 */
-	if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines))
-		warnx("jvmti: write_debug_info() failed");
+	if (output_debug_info)
+		if (jvmti_write_debug_info(jvmti_agent, addr, nr_lines, line_tab, (const char * const *) line_file_names))
+			warnx("jvmti: write_debug_info() failed");
 
 	len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2;
 	{
@@ -223,6 +313,13 @@ error:
 	(*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
 	(*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
 	free(line_tab);
+	while (line_file_names && (nr_lines > 0)) {
+	    if (line_file_names[nr_lines - 1]) {
+	        free(line_file_names[nr_lines - 1]);
+	    }
+	    nr_lines -= 1;
+	}
+	free(line_file_names);
 }
 
 static void JNICALL

From 10b9baa701d5023897f70a4acb3bf0235da3dc4f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 28 Nov 2017 11:08:41 -0300
Subject: [PATCH 219/808] tools arch s390: Do not include header files from the
 kernel sources

Long ago we decided to be verbotten including files in the kernel git
sources from tools/ living source code, to avoid disturbing kernel
development (and perf's and other tools/) when, say, a kernel hacker
adds something, tests everything but tools/ and have tools/ build
broken.

This got broken recently by s/390, fix it by copying
arch/s390/include/uapi/asm/perf_regs.h to tools/arch/s390/include/uapi/asm/,
making this one be used by means of <asm/perf_regs.h> and updating
tools/perf/check_headers.sh to make sure we are notified when the
original changes, so that we can check if anything is needed on the
tooling side.

This would have been caught by the 'tarkpg' test entry in:

$ make -C tools/perf build-test

When run on a s/390 build system or container.

Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: f704ef44602f ("s390/perf: add support for perf_regs and libdw")
Link: https://lkml.kernel.org/n/tip-n57139ic0v9uffx8wdqi3d8a@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/s390/include/uapi/asm/perf_regs.h | 44 ++++++++++++++++++++
 tools/perf/arch/s390/include/perf_regs.h     |  2 +-
 tools/perf/check-headers.sh                  |  1 +
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 tools/arch/s390/include/uapi/asm/perf_regs.h

diff --git a/tools/arch/s390/include/uapi/asm/perf_regs.h b/tools/arch/s390/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..d17dd9e5d516
--- /dev/null
+++ b/tools/arch/s390/include/uapi/asm/perf_regs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_S390_PERF_REGS_H
+#define _ASM_S390_PERF_REGS_H
+
+enum perf_event_s390_regs {
+	PERF_REG_S390_R0,
+	PERF_REG_S390_R1,
+	PERF_REG_S390_R2,
+	PERF_REG_S390_R3,
+	PERF_REG_S390_R4,
+	PERF_REG_S390_R5,
+	PERF_REG_S390_R6,
+	PERF_REG_S390_R7,
+	PERF_REG_S390_R8,
+	PERF_REG_S390_R9,
+	PERF_REG_S390_R10,
+	PERF_REG_S390_R11,
+	PERF_REG_S390_R12,
+	PERF_REG_S390_R13,
+	PERF_REG_S390_R14,
+	PERF_REG_S390_R15,
+	PERF_REG_S390_FP0,
+	PERF_REG_S390_FP1,
+	PERF_REG_S390_FP2,
+	PERF_REG_S390_FP3,
+	PERF_REG_S390_FP4,
+	PERF_REG_S390_FP5,
+	PERF_REG_S390_FP6,
+	PERF_REG_S390_FP7,
+	PERF_REG_S390_FP8,
+	PERF_REG_S390_FP9,
+	PERF_REG_S390_FP10,
+	PERF_REG_S390_FP11,
+	PERF_REG_S390_FP12,
+	PERF_REG_S390_FP13,
+	PERF_REG_S390_FP14,
+	PERF_REG_S390_FP15,
+	PERF_REG_S390_MASK,
+	PERF_REG_S390_PC,
+
+	PERF_REG_S390_MAX
+};
+
+#endif /* _ASM_S390_PERF_REGS_H */
diff --git a/tools/perf/arch/s390/include/perf_regs.h b/tools/perf/arch/s390/include/perf_regs.h
index d2df54a6bc5a..bcfbaed78cc2 100644
--- a/tools/perf/arch/s390/include/perf_regs.h
+++ b/tools/perf/arch/s390/include/perf_regs.h
@@ -3,7 +3,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <../../../../arch/s390/include/uapi/asm/perf_regs.h>
+#include <asm/perf_regs.h>
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 6db9d809fe97..3e64f10b6d66 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -21,6 +21,7 @@ arch/x86/include/asm/cpufeatures.h
 arch/arm/include/uapi/asm/perf_regs.h
 arch/arm64/include/uapi/asm/perf_regs.h
 arch/powerpc/include/uapi/asm/perf_regs.h
+arch/s390/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/kvm.h
 arch/x86/include/uapi/asm/kvm_perf.h

From ca26cffa4e4aaeb09bb9e308f95c7835cb149248 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 4 Dec 2017 13:08:47 -0300
Subject: [PATCH 220/808] x86/asm: Allow again using asm.h when building for
 the 'bpf' clang target

Up to f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")
we were able to use x86 headers to build to the 'bpf' clang target, as
done by the BPF code in tools/perf/.

With that commit, we ended up with following failure for 'perf test LLVM', this
is because "clang ... -target bpf ..." fails since 4.0 does not have bpf inline
asm support and 6.0 does not recognize the register 'esp', fix it by guarding
that part with an #ifndef __BPF__, that is defined by clang when building to
the "bpf" target.

  # perf test -v LLVM
  37: LLVM search and compile                               :
  37.1: Basic BPF llvm compile                              :
  --- start ---
  test child forked, pid 25526
  Kernel build dir is set to /lib/modules/4.14.0+/build
  set env: KBUILD_DIR=/lib/modules/4.14.0+/build
  unset env: KBUILD_OPTS
  include option is set to  -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: NR_CPUS=4
  set env: LINUX_VERSION_CODE=0x40e00
  set env: CLANG_EXEC=/usr/local/bin/clang
  set env: CLANG_OPTIONS=-xc
  set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: WORKING_DIR=/lib/modules/4.14.0+/build
  set env: CLANG_SOURCE=-
  llvm compiling command template: echo '/*
   * bpf-script-example.c
   * Test basic LLVM building
   */
  #ifndef LINUX_VERSION_CODE
  # error Need LINUX_VERSION_CODE
  # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
  #endif
  #define BPF_ANY 0
  #define BPF_MAP_TYPE_ARRAY 2
  #define BPF_FUNC_map_lookup_elem 1
  #define BPF_FUNC_map_update_elem 2

  static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	  (void *) BPF_FUNC_map_lookup_elem;
  static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) =
	  (void *) BPF_FUNC_map_update_elem;

  struct bpf_map_def {
	  unsigned int type;
	  unsigned int key_size;
	  unsigned int value_size;
	  unsigned int max_entries;
  };

  #define SEC(NAME) __attribute__((section(NAME), used))
  struct bpf_map_def SEC("maps") flip_table = {
	  .type = BPF_MAP_TYPE_ARRAY,
	  .key_size = sizeof(int),
	  .value_size = sizeof(int),
	  .max_entries = 1,
  };

  SEC("func=SyS_epoll_wait")
  int bpf_func__SyS_epoll_wait(void *ctx)
  {
	  int ind =0;
	  int *flag = bpf_map_lookup_elem(&flip_table, &ind);
	  int new_flag;
	  if (!flag)
		  return 0;
	  /* flip flag and store back */
	  new_flag = !*flag;
	  bpf_map_update_elem(&flip_table, &ind, &new_flag, BPF_ANY);
	  return new_flag;
  }
  char _license[] SEC("license") = "GPL";
  int _version SEC("version") = LINUX_VERSION_CODE;
  ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o -
  test child finished with 0
  ---- end ----
  LLVM search and compile subtest 0: Ok
  37.2: kbuild searching                                    :
  --- start ---
  test child forked, pid 25950
  Kernel build dir is set to /lib/modules/4.14.0+/build
  set env: KBUILD_DIR=/lib/modules/4.14.0+/build
  unset env: KBUILD_OPTS
  include option is set to  -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: NR_CPUS=4
  set env: LINUX_VERSION_CODE=0x40e00
  set env: CLANG_EXEC=/usr/local/bin/clang
  set env: CLANG_OPTIONS=-xc
  set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated  -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
  set env: WORKING_DIR=/lib/modules/4.14.0+/build
  set env: CLANG_SOURCE=-
  llvm compiling command template: echo '/*
   * bpf-script-test-kbuild.c
   * Test include from kernel header
   */
  #ifndef LINUX_VERSION_CODE
  # error Need LINUX_VERSION_CODE
  # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
  #endif
  #define SEC(NAME) __attribute__((section(NAME), used))

  #include <uapi/linux/fs.h>
  #include <uapi/asm/ptrace.h>

  SEC("func=vfs_llseek")
  int bpf_func__vfs_llseek(void *ctx)
  {
	  return 0;
  }

  char _license[] SEC("license") = "GPL";
  int _version SEC("version") = LINUX_VERSION_CODE;
  ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o -
  In file included from <stdin>:12:
  In file included from /home/acme/git/linux/arch/x86/include/uapi/asm/ptrace.h:5:
  In file included from /home/acme/git/linux/include/linux/compiler.h:242:
  In file included from /home/acme/git/linux/arch/x86/include/asm/barrier.h:5:
  In file included from /home/acme/git/linux/arch/x86/include/asm/alternative.h:10:
  /home/acme/git/linux/arch/x86/include/asm/asm.h:145:50: error: unknown register name 'esp' in asm
  register unsigned long current_stack_pointer asm(_ASM_SP);
                                                   ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:44:18: note: expanded from macro '_ASM_SP'
  #define _ASM_SP         __ASM_REG(sp)
                          ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:27:32: note: expanded from macro '__ASM_REG'
  #define __ASM_REG(reg)         __ASM_SEL_RAW(e##reg, r##reg)
                                 ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:18:29: note: expanded from macro '__ASM_SEL_RAW'
  # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
                              ^
  /home/acme/git/linux/arch/x86/include/asm/asm.h:11:32: note: expanded from macro '__ASM_FORM_RAW'
  # define __ASM_FORM_RAW(x)     #x
                                 ^
  <scratch space>:4:1: note: expanded from here
  "esp"
  ^
  1 error generated.
  ERROR:	unable to compile -
  Hint:	Check error message shown above.
  Hint:	You can also pre-compile it into .o using:
     		  clang -target bpf -O2 -c -
     	  with proper -I and -D options.
  Failed to compile test case: 'kbuild searching'
  test child finished with -1
  ---- end ----
  LLVM search and compile subtest 1: FAILED!

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Yonghong Song <yhs@fb.com>
Link: https://lkml.kernel.org/r/20171128175948.GL3298@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/x86/include/asm/asm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 219faaec51df..386a6900e206 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,6 +136,7 @@
 #endif
 
 #ifndef __ASSEMBLY__
+#ifndef __BPF__
 /*
  * This output constraint should be used for any inline asm which has a "call"
  * instruction.  Otherwise the asm may be inserted before the frame pointer
@@ -145,5 +146,6 @@
 register unsigned long current_stack_pointer asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
+#endif
 
 #endif /* _ASM_X86_ASM_H */

From 234833991e14681f61cbfd93e65a5c976089cf11 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 18 Dec 2017 17:34:16 +0100
Subject: [PATCH 221/808] tipc: fix lost member events bug

Group messages are not supposed to be returned to sender when the
destination socket disappears. This is done correctly for regular
traffic messages, by setting the 'dest_droppable' bit in the header.
But we forget to do that in group protocol messages. This has the effect
that such messages may sometimes bounce back to the sender, be perceived
as a legitimate peer message, and wreak general havoc for the rest of
the session. In particular, we have seen that a member in state LEAVING
may go back to state RECLAIMED or REMITTED, hence causing suppression
of an otherwise expected 'member down' event to the user.

We fix this by setting the 'dest_droppable' bit even in group protocol
messages.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 95fec2c057d6..efb5714e7a85 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -648,6 +648,7 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 	} else if (mtyp == GRP_REMIT_MSG) {
 		msg_set_grp_remitted(hdr, m->window);
 	}
+	msg_set_dest_droppable(hdr, true);
 	__skb_queue_tail(xmitq, skb);
 }
 

From 3f42f5fe31c8715a34064bfd7b788488d1ea2f7c Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 18 Dec 2017 18:13:34 +0100
Subject: [PATCH 222/808] tipc: remove leaving group member from all lists

A group member going into state LEAVING should never go back to any
other state before it is finally deleted. However, this might happen
if the socket needs to send out a RECLAIM message during this interval.
Since we forget to remove the leaving member from the group's 'active'
or 'pending' list, the member might be selected for reclaiming, change
state to RECLAIMING, and get stuck in this state instead of being
deleted. This might lead to suppression of the expected 'member down'
event to the receiver.

We fix this by removing the member from all lists, except the RB tree,
at the moment it goes into state LEAVING.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index efb5714e7a85..b96ec429bb9b 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -699,6 +699,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		if (!m)
 			return;
 		m->bc_syncpt = msg_grp_bc_syncpt(hdr);
+		list_del_init(&m->list);
+		list_del_init(&m->congested);
+		*usr_wakeup = true;
 
 		/* Wait until WITHDRAW event is received */
 		if (m->state != MBR_LEAVING) {
@@ -710,8 +713,6 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		ehdr = buf_msg(m->event_msg);
 		msg_set_grp_bc_seqno(ehdr, m->bc_syncpt);
 		__skb_queue_tail(inputq, m->event_msg);
-		*usr_wakeup = true;
-		list_del_init(&m->congested);
 		return;
 	case GRP_ADV_MSG:
 		if (!m)
@@ -863,6 +864,7 @@ void tipc_group_member_evt(struct tipc_group *grp,
 				msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt);
 			__skb_queue_tail(inputq, skb);
 		}
+		list_del_init(&m->list);
 		list_del_init(&m->congested);
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);

From c505873eaece2b4aefd07d339dc7e1400e0235ac Mon Sep 17 00:00:00 2001
From: Zhao Qiang <qiang.zhao@nxp.com>
Date: Mon, 18 Dec 2017 10:26:43 +0800
Subject: [PATCH 223/808] net: phy: marvell: Limit 88m1101 autoneg errata to
 88E1145 as well.

88E1145 also need this autoneg errata.

Fixes: f2899788353c ("net: phy: marvell: Limit errata to 88m1101")
Signed-off-by: Zhao Qiang <qiang.zhao@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index b5a8f750e433..26c9a11220ca 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2073,7 +2073,7 @@ static struct phy_driver marvell_drivers[] = {
 		.flags = PHY_HAS_INTERRUPT,
 		.probe = marvell_probe,
 		.config_init = &m88e1145_config_init,
-		.config_aneg = &marvell_config_aneg,
+		.config_aneg = &m88e1101_config_aneg,
 		.read_status = &genphy_read_status,
 		.ack_interrupt = &marvell_ack_interrupt,
 		.config_intr = &marvell_config_intr,

From ac3241d5c81bf6e85095481435f29a4627ff820e Mon Sep 17 00:00:00 2001
From: Hemanth Puranik <hpuranik@codeaurora.org>
Date: Mon, 18 Dec 2017 11:27:47 +0530
Subject: [PATCH 224/808] net: qcom/emac: Change the order of mac up and sgmii
 open

This patch fixes the order of mac_up and sgmii_open for the
reasons noted below:

- If open takes more time(if the SGMII block is not responding or
  if we want to do some delay based task) in this situation we
  will hit NETDEV watchdog
- The main reason : We should signal to upper layers that we are
  ready to receive packets "only" when the entire path is initialized
  not the other way around, this is followed in the reset path where
  we do mac_down, sgmii_reset and mac_up. This also makes the driver
  uniform across the reset and open paths.
- In the future there may be need for delay based tasks to be done in
  sgmii open which will result in NETDEV watchdog
- As per the documentation the order of init should be sgmii, mac, rings
  and DMA

Signed-off-by: Hemanth Puranik <hpuranik@codeaurora.org>
Acked-by: Timur Tabi <timur@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 70c92b649b29..38c924bdd32e 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -253,18 +253,18 @@ static int emac_open(struct net_device *netdev)
 		return ret;
 	}
 
-	ret = emac_mac_up(adpt);
+	ret = adpt->phy.open(adpt);
 	if (ret) {
 		emac_mac_rx_tx_rings_free_all(adpt);
 		free_irq(irq->irq, irq);
 		return ret;
 	}
 
-	ret = adpt->phy.open(adpt);
+	ret = emac_mac_up(adpt);
 	if (ret) {
-		emac_mac_down(adpt);
 		emac_mac_rx_tx_rings_free_all(adpt);
 		free_irq(irq->irq, irq);
+		adpt->phy.close(adpt);
 		return ret;
 	}
 

From 5c468674d17056148da06218d4da5d04baf22eac Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:07:25 +0800
Subject: [PATCH 225/808] sctp: fix the issue that a __u16 variable may
 overflow in sctp_ulpq_renege

Now when reneging events in sctp_ulpq_renege(), the variable freed
could be increased by a __u16 value twice while freed is of __u16
type. It means freed may overflow at the second addition.

This patch is to fix it by using __u32 type for 'freed', while at
it, also to remove 'if (chunk)' check, as all renege commands are
generated in sctp_eat_data and it can't be NULL.

Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ulpqueue.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index a71be33f3afe..e36ec5dd64c6 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -1084,29 +1084,21 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
 void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 		      gfp_t gfp)
 {
-	struct sctp_association *asoc;
-	__u16 needed, freed;
+	struct sctp_association *asoc = ulpq->asoc;
+	__u32 freed = 0;
+	__u16 needed;
 
-	asoc = ulpq->asoc;
-
-	if (chunk) {
-		needed = ntohs(chunk->chunk_hdr->length);
-		needed -= sizeof(struct sctp_data_chunk);
-	} else
-		needed = SCTP_DEFAULT_MAXWINDOW;
-
-	freed = 0;
+	needed = ntohs(chunk->chunk_hdr->length) -
+		 sizeof(struct sctp_data_chunk);
 
 	if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
 		freed = sctp_ulpq_renege_order(ulpq, needed);
-		if (freed < needed) {
+		if (freed < needed)
 			freed += sctp_ulpq_renege_frags(ulpq, needed - freed);
-		}
 	}
 	/* If able to free enough room, accept this chunk. */
-	if (chunk && (freed >= needed)) {
-		int retval;
-		retval = sctp_ulpq_tail_data(ulpq, chunk, gfp);
+	if (freed >= needed) {
+		int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp);
 		/*
 		 * Enter partial delivery if chunk has not been
 		 * delivered; otherwise, drain the reassembly queue.

From d196975905b2bb227dc54547c03b3d9d0013805c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:13:17 +0800
Subject: [PATCH 226/808] sctp: add SCTP_CID_RECONF conversion in sctp_cname

Whenever a new type of chunk is added, the corresp conversion in
sctp_cname should be added. Otherwise, in some places, pr_debug
will print it as "unknown chunk".

Fixes: cc16f00f6529 ("sctp: add support for generating stream reconf ssn reset request chunk")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo R. Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/debug.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 3f619fdcbf0a..291c97b07058 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -78,6 +78,9 @@ const char *sctp_cname(const union sctp_subtype cid)
 	case SCTP_CID_AUTH:
 		return "AUTH";
 
+	case SCTP_CID_RECONF:
+		return "RECONF";
+
 	default:
 		break;
 	}

From 84aeb437ab98a2bce3d4b2111c79723aedfceb33 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 18 Dec 2017 17:35:09 +0200
Subject: [PATCH 227/808] net: bridge: fix early call to
 br_stp_change_bridge_id and plug newlink leaks

The early call to br_stp_change_bridge_id in bridge's newlink can cause
a memory leak if an error occurs during the newlink because the fdb
entries are not cleaned up if a different lladdr was specified, also
another minor issue is that it generates fdb notifications with
ifindex = 0. Another unrelated memory leak is the bridge sysfs entries
which get added on NETDEV_REGISTER event, but are not cleaned up in the
newlink error path. To remove this special case the call to
br_stp_change_bridge_id is done after netdev register and we cleanup the
bridge on changelink error via br_dev_delete to plug all leaks.

This patch makes netlink bridge destruction on newlink error the same as
dellink and ioctl del which is necessary since at that point we have a
fully initialized bridge device.

To reproduce the issue:
$ ip l add br0 address 00:11:22:33:44:55 type bridge group_fwd_mask 1
RTNETLINK answers: Invalid argument

$ rmmod bridge
[ 1822.142525] =============================================================================
[ 1822.143640] BUG bridge_fdb_cache (Tainted: G           O    ): Objects remaining in bridge_fdb_cache on __kmem_cache_shutdown()
[ 1822.144821] -----------------------------------------------------------------------------

[ 1822.145990] Disabling lock debugging due to kernel taint
[ 1822.146732] INFO: Slab 0x0000000092a844b2 objects=32 used=2 fp=0x00000000fef011b0 flags=0x1ffff8000000100
[ 1822.147700] CPU: 2 PID: 13584 Comm: rmmod Tainted: G    B      O     4.15.0-rc2+ #87
[ 1822.148578] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[ 1822.150008] Call Trace:
[ 1822.150510]  dump_stack+0x78/0xa9
[ 1822.151156]  slab_err+0xb1/0xd3
[ 1822.151834]  ? __kmalloc+0x1bb/0x1ce
[ 1822.152546]  __kmem_cache_shutdown+0x151/0x28b
[ 1822.153395]  shutdown_cache+0x13/0x144
[ 1822.154126]  kmem_cache_destroy+0x1c0/0x1fb
[ 1822.154669]  SyS_delete_module+0x194/0x244
[ 1822.155199]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 1822.155773]  entry_SYSCALL_64_fastpath+0x23/0x9a
[ 1822.156343] RIP: 0033:0x7f929bd38b17
[ 1822.156859] RSP: 002b:00007ffd160e9a98 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0
[ 1822.157728] RAX: ffffffffffffffda RBX: 00005578316ba090 RCX: 00007f929bd38b17
[ 1822.158422] RDX: 00007f929bd9ec60 RSI: 0000000000000800 RDI: 00005578316ba0f0
[ 1822.159114] RBP: 0000000000000003 R08: 00007f929bff5f20 R09: 00007ffd160e8a11
[ 1822.159808] R10: 00007ffd160e9860 R11: 0000000000000202 R12: 00007ffd160e8a80
[ 1822.160513] R13: 0000000000000000 R14: 0000000000000000 R15: 00005578316ba090
[ 1822.161278] INFO: Object 0x000000007645de29 @offset=0
[ 1822.161666] INFO: Object 0x00000000d5df2ab5 @offset=128

Fixes: 30313a3d5794 ("bridge: Handle IFLA_ADDRESS correctly when creating bridge device")
Fixes: 5b8d5429daa0 ("bridge: netlink: register netdevice before executing changelink")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index d0ef0a8e8831..015f465c514b 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1262,19 +1262,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
 	struct net_bridge *br = netdev_priv(dev);
 	int err;
 
+	err = register_netdevice(dev);
+	if (err)
+		return err;
+
 	if (tb[IFLA_ADDRESS]) {
 		spin_lock_bh(&br->lock);
 		br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
 		spin_unlock_bh(&br->lock);
 	}
 
-	err = register_netdevice(dev);
-	if (err)
-		return err;
-
 	err = br_changelink(dev, tb, data, extack);
 	if (err)
-		unregister_netdevice(dev);
+		br_dev_delete(dev, NULL);
+
 	return err;
 }
 

From bb422a738f6566f7439cd347d54e321e4fe92a9f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 18 Dec 2017 20:31:41 +0900
Subject: [PATCH 228/808] mm,vmscan: Make unregister_shrinker() no-op if
 register_shrinker() failed.

Syzbot caught an oops at unregister_shrinker() because combination of
commit 1d3d4437eae1bb29 ("vmscan: per-node deferred work") and fault
injection made register_shrinker() fail and the caller of
register_shrinker() did not check for failure.

----------
[  554.881422] FAULT_INJECTION: forcing a failure.
[  554.881422] name failslab, interval 1, probability 0, space 0, times 0
[  554.881438] CPU: 1 PID: 13231 Comm: syz-executor1 Not tainted 4.14.0-rc8+ #82
[  554.881443] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  554.881445] Call Trace:
[  554.881459]  dump_stack+0x194/0x257
[  554.881474]  ? arch_local_irq_restore+0x53/0x53
[  554.881486]  ? find_held_lock+0x35/0x1d0
[  554.881507]  should_fail+0x8c0/0xa40
[  554.881522]  ? fault_create_debugfs_attr+0x1f0/0x1f0
[  554.881537]  ? check_noncircular+0x20/0x20
[  554.881546]  ? find_next_zero_bit+0x2c/0x40
[  554.881560]  ? ida_get_new_above+0x421/0x9d0
[  554.881577]  ? find_held_lock+0x35/0x1d0
[  554.881594]  ? __lock_is_held+0xb6/0x140
[  554.881628]  ? check_same_owner+0x320/0x320
[  554.881634]  ? lock_downgrade+0x990/0x990
[  554.881649]  ? find_held_lock+0x35/0x1d0
[  554.881672]  should_failslab+0xec/0x120
[  554.881684]  __kmalloc+0x63/0x760
[  554.881692]  ? lock_downgrade+0x990/0x990
[  554.881712]  ? register_shrinker+0x10e/0x2d0
[  554.881721]  ? trace_event_raw_event_module_request+0x320/0x320
[  554.881737]  register_shrinker+0x10e/0x2d0
[  554.881747]  ? prepare_kswapd_sleep+0x1f0/0x1f0
[  554.881755]  ? _down_write_nest_lock+0x120/0x120
[  554.881765]  ? memcpy+0x45/0x50
[  554.881785]  sget_userns+0xbcd/0xe20
(...snipped...)
[  554.898693] kasan: CONFIG_KASAN_INLINE enabled
[  554.898724] kasan: GPF could be caused by NULL-ptr deref or user memory access
[  554.898732] general protection fault: 0000 [#1] SMP KASAN
[  554.898737] Dumping ftrace buffer:
[  554.898741]    (ftrace buffer empty)
[  554.898743] Modules linked in:
[  554.898752] CPU: 1 PID: 13231 Comm: syz-executor1 Not tainted 4.14.0-rc8+ #82
[  554.898755] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  554.898760] task: ffff8801d1dbe5c0 task.stack: ffff8801c9e38000
[  554.898772] RIP: 0010:__list_del_entry_valid+0x7e/0x150
[  554.898775] RSP: 0018:ffff8801c9e3f108 EFLAGS: 00010246
[  554.898780] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  554.898784] RDX: 0000000000000000 RSI: ffff8801c53c6f98 RDI: ffff8801c53c6fa0
[  554.898788] RBP: ffff8801c9e3f120 R08: 1ffff100393c7d55 R09: 0000000000000004
[  554.898791] R10: ffff8801c9e3ef70 R11: 0000000000000000 R12: 0000000000000000
[  554.898795] R13: dffffc0000000000 R14: 1ffff100393c7e45 R15: ffff8801c53c6f98
[  554.898800] FS:  0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
[  554.898804] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
[  554.898807] CR2: 00000000dbc23000 CR3: 00000001c7269000 CR4: 00000000001406e0
[  554.898813] DR0: 0000000020000000 DR1: 0000000020000000 DR2: 0000000000000000
[  554.898816] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600
[  554.898818] Call Trace:
[  554.898828]  unregister_shrinker+0x79/0x300
[  554.898837]  ? perf_trace_mm_vmscan_writepage+0x750/0x750
[  554.898844]  ? down_write+0x87/0x120
[  554.898851]  ? deactivate_super+0x139/0x1b0
[  554.898857]  ? down_read+0x150/0x150
[  554.898864]  ? check_same_owner+0x320/0x320
[  554.898875]  deactivate_locked_super+0x64/0xd0
[  554.898883]  deactivate_super+0x141/0x1b0
----------

Since allowing register_shrinker() callers to call unregister_shrinker()
when register_shrinker() failed can simplify error recovery path, this
patch makes unregister_shrinker() no-op when register_shrinker() failed.
Also, reset shrinker->nr_deferred in case unregister_shrinker() was
by error called twice.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Glauber Costa <glauber@scylladb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/vmscan.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c02c850ea349..47d5ced51f2d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -297,10 +297,13 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
+	if (!shrinker->nr_deferred)
+		return;
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
 	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
 }
 EXPORT_SYMBOL(unregister_shrinker);
 

From 6623c0fba10ef45b64ca213ad5dec926f37fa9a0 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 15 Dec 2017 16:10:20 +0000
Subject: [PATCH 229/808] net: phy: marvell: avoid pause mode on
 SGMII-to-Copper for 88e151x

Observed on the 88e1512 in SGMII-to-Copper mode, negotiating pause
is unreliable.  While the pause bits can be set in the advertisment
register, they clear shortly after negotiation with a link partner
commences irrespective of the cause of the negotiation.

While these bits may be correctly conveyed to the link partner on the
first negotiation, a subsequent negotiation (eg, due to negotiation
restart by the link partner, or reconnection of the cable) will result
in the link partner seeing these bits as zero, while the kernel
believes that it has advertised pause modes.

This leads to the local kernel evaluating (eg) symmetric pause mode,
while the remote end evaluates that we have no pause mode capability.

Since we can't guarantee the advertisment, disable pause mode support
with this PHY when used in SGMII-to-Copper mode.

The 88e1510 in RGMII-to-Copper mode appears to behave correctly.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 26c9a11220ca..82104edca393 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -879,6 +879,8 @@ static int m88e1510_config_init(struct phy_device *phydev)
 
 	/* SGMII-to-Copper mode initialization */
 	if (phydev->interface == PHY_INTERFACE_MODE_SGMII) {
+		u32 pause;
+
 		/* Select page 18 */
 		err = marvell_set_page(phydev, 18);
 		if (err < 0)
@@ -902,6 +904,16 @@ static int m88e1510_config_init(struct phy_device *phydev)
 		err = marvell_set_page(phydev, MII_MARVELL_COPPER_PAGE);
 		if (err < 0)
 			return err;
+
+		/* There appears to be a bug in the 88e1512 when used in
+		 * SGMII to copper mode, where the AN advertisment register
+		 * clears the pause bits each time a negotiation occurs.
+		 * This means we can never be truely sure what was advertised,
+		 * so disable Pause support.
+		 */
+		pause = SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+		phydev->supported &= ~pause;
+		phydev->advertising &= ~pause;
 	}
 
 	return m88e1121_config_init(phydev);

From 9ee332d99e4d5a97548943b81c54668450ce641b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 18 Dec 2017 15:05:07 -0500
Subject: [PATCH 230/808] sget(): handle failures of register_shrinker()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index 7ff1349609e4..06bd25d90ba5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -517,7 +517,11 @@ retry:
 	hlist_add_head(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
-	register_shrinker(&s->s_shrink);
+	err = register_shrinker(&s->s_shrink);
+	if (err) {
+		deactivate_locked_super(s);
+		s = ERR_PTR(err);
+	}
 	return s;
 }
 

From ab14436065c8066c265540312742390d6d07ddd2 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Sat, 16 Dec 2017 00:52:39 +0300
Subject: [PATCH 231/808] net: phy: xgene: disable clk on error paths

There are several error paths in xgene_mdio_probe(),
where clk is left undisabled. The patch fixes them.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-xgene.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/mdio-xgene.c b/drivers/net/phy/mdio-xgene.c
index bfd3090fb055..07c6048200c6 100644
--- a/drivers/net/phy/mdio-xgene.c
+++ b/drivers/net/phy/mdio-xgene.c
@@ -194,8 +194,11 @@ static int xgene_mdio_reset(struct xgene_mdio_pdata *pdata)
 	}
 
 	ret = xgene_enet_ecc_init(pdata);
-	if (ret)
+	if (ret) {
+		if (pdata->dev->of_node)
+			clk_disable_unprepare(pdata->clk);
 		return ret;
+	}
 	xgene_gmac_reset(pdata);
 
 	return 0;
@@ -388,8 +391,10 @@ static int xgene_mdio_probe(struct platform_device *pdev)
 		return ret;
 
 	mdio_bus = mdiobus_alloc();
-	if (!mdio_bus)
-		return -ENOMEM;
+	if (!mdio_bus) {
+		ret = -ENOMEM;
+		goto out_clk;
+	}
 
 	mdio_bus->name = "APM X-Gene MDIO bus";
 
@@ -418,7 +423,7 @@ static int xgene_mdio_probe(struct platform_device *pdev)
 		mdio_bus->phy_mask = ~0;
 		ret = mdiobus_register(mdio_bus);
 		if (ret)
-			goto out;
+			goto out_mdiobus;
 
 		acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_HANDLE(dev), 1,
 				    acpi_register_phy, NULL, mdio_bus, NULL);
@@ -426,16 +431,20 @@ static int xgene_mdio_probe(struct platform_device *pdev)
 	}
 
 	if (ret)
-		goto out;
+		goto out_mdiobus;
 
 	pdata->mdio_bus = mdio_bus;
 	xgene_mdio_status = true;
 
 	return 0;
 
-out:
+out_mdiobus:
 	mdiobus_free(mdio_bus);
 
+out_clk:
+	if (dev->of_node)
+		clk_disable_unprepare(pdata->clk);
+
 	return ret;
 }
 

From 14cb0dc6479dc5ebc63b3a459a5d89a2f1b39fed Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 18 Dec 2017 15:40:43 +0800
Subject: [PATCH 232/808] block: don't let passthrough IO go into
 .make_request_fn()

Commit a8821f3f3("block: Improvements to bounce-buffer handling") tries
to make sure that the bio to .make_request_fn won't exceed BIO_MAX_PAGES,
but ignores that passthrough I/O can use blk_queue_bounce() too.
Especially, passthrough IO may not be sector-aligned, and the check
of 'sectors < bio_sectors(*bio_orig)' inside __blk_queue_bounce() may
become true even though the max bvec number doesn't exceed BIO_MAX_PAGES,
then cause the bio splitted, and the original passthrough bio is submited
to generic_make_request().

This patch fixes this issue by checking if the bio is passthrough IO,
and use bio_kmalloc() to allocate the cloned passthrough bio.

Cc: NeilBrown <neilb@suse.com>
Fixes: a8821f3f3("block: Improvements to bounce-buffer handling")
Tested-by: Michele Ballabio <barra_cuda@katamail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bounce.c         |  6 ++++--
 include/linux/blkdev.h | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/block/bounce.c b/block/bounce.c
index fceb1a96480b..1d05c422c932 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	unsigned i = 0;
 	bool bounce = false;
 	int sectors = 0;
+	bool passthrough = bio_is_passthrough(*bio_orig);
 
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_PAGES)
@@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	if (!bounce)
 		return;
 
-	if (sectors < bio_sectors(*bio_orig)) {
+	if (!passthrough && sectors < bio_sectors(*bio_orig)) {
 		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+			bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8089ca17db9a..abd06f540863 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -241,14 +241,24 @@ struct request {
 	struct request *next_rq;
 };
 
+static inline bool blk_op_is_scsi(unsigned int op)
+{
+	return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
+}
+
+static inline bool blk_op_is_private(unsigned int op)
+{
+	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
+}
+
 static inline bool blk_rq_is_scsi(struct request *rq)
 {
-	return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
+	return blk_op_is_scsi(req_op(rq));
 }
 
 static inline bool blk_rq_is_private(struct request *rq)
 {
-	return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
+	return blk_op_is_private(req_op(rq));
 }
 
 static inline bool blk_rq_is_passthrough(struct request *rq)
@@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
 	return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
 }
 
+static inline bool bio_is_passthrough(struct bio *bio)
+{
+	unsigned op = bio_op(bio);
+
+	return blk_op_is_scsi(op) || blk_op_is_private(op);
+}
+
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	return req->ioprio;

From 0abc2a10389f0c9070f76ca906c7382788036b93 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Dec 2017 15:40:44 +0800
Subject: [PATCH 233/808] block: fix blk_rq_append_bio

Commit caa4b02476e3(blk-map: call blk_queue_bounce from blk_rq_append_bio)
moves blk_queue_bounce() into blk_rq_append_bio(), but don't consider
the fact that the bounced bio becomes invisible to caller since the
parameter type is 'struct bio *'. Make it a pointer to a pointer to
a bio, so the caller sees the right bio also after a bounce.

Fixes: caa4b02476e3 ("blk-map: call blk_queue_bounce from blk_rq_append_bio")
Cc: Christoph Hellwig <hch@lst.de>
Reported-by: Michele Ballabio <barra_cuda@katamail.com>
(handling failure of blk_rq_append_bio(), only call bio_get() after
blk_rq_append_bio() returns OK)
Tested-by: Michele Ballabio <barra_cuda@katamail.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c                    | 38 +++++++++++++++++-------------
 drivers/scsi/osd/osd_initiator.c   |  4 +++-
 drivers/target/target_core_pscsi.c |  4 ++--
 include/linux/blkdev.h             |  2 +-
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index b21f8e86f120..d3a94719f03f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,22 +12,29 @@
 #include "blk.h"
 
 /*
- * Append a bio to a passthrough request.  Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request.  Only works if the bio can be merged
+ * into the request based on the driver constraints.
  */
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
 {
-	blk_queue_bounce(rq->q, &bio);
+	struct bio *orig_bio = *bio;
+
+	blk_queue_bounce(rq->q, bio);
 
 	if (!rq->bio) {
-		blk_rq_bio_prep(rq->q, rq, bio);
+		blk_rq_bio_prep(rq->q, rq, *bio);
 	} else {
-		if (!ll_back_merge_fn(rq->q, rq, bio))
+		if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+			if (orig_bio != *bio) {
+				bio_put(*bio);
+				*bio = orig_bio;
+			}
 			return -EINVAL;
+		}
 
-		rq->biotail->bi_next = bio;
-		rq->biotail = bio;
-		rq->__data_len += bio->bi_iter.bi_size;
+		rq->biotail->bi_next = *bio;
+		rq->biotail = *bio;
+		rq->__data_len += (*bio)->bi_iter.bi_size;
 	}
 
 	return 0;
@@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
 	 * We link the bounce buffer in and could have to traverse it
 	 * later so we have to get a ref to prevent it from being freed
 	 */
-	ret = blk_rq_append_bio(rq, bio);
-	bio_get(bio);
+	ret = blk_rq_append_bio(rq, &bio);
 	if (ret) {
-		bio_endio(bio);
 		__blk_rq_unmap_user(orig_bio);
-		bio_put(bio);
 		return ret;
 	}
+	bio_get(bio);
 
 	return 0;
 }
@@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	unsigned long addr = (unsigned long) kbuf;
 	int do_copy = 0;
-	struct bio *bio;
+	struct bio *bio, *orig_bio;
 	int ret;
 
 	if (len > (queue_max_hw_sectors(q) << 9))
@@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (do_copy)
 		rq->rq_flags |= RQF_COPY_USER;
 
-	ret = blk_rq_append_bio(rq, bio);
+	orig_bio = bio;
+	ret = blk_rq_append_bio(rq, &bio);
 	if (unlikely(ret)) {
 		/* request is too big */
-		bio_put(bio);
+		bio_put(orig_bio);
 		return ret;
 	}
 
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index a4f28b7e4c65..e18877177f1b 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
 		return req;
 
 	for_each_bio(bio) {
-		ret = blk_rq_append_bio(req, bio);
+		struct bio *bounce_bio = bio;
+
+		ret = blk_rq_append_bio(req, &bounce_bio);
 		if (ret)
 			return ERR_PTR(ret);
 	}
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 7c69b4a9694d..0d99b242e82e 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 					" %d i: %d bio: %p, allocating another"
 					" bio\n", bio->bi_vcnt, i, bio);
 
-				rc = blk_rq_append_bio(req, bio);
+				rc = blk_rq_append_bio(req, &bio);
 				if (rc) {
 					pr_err("pSCSI: failed to append bio\n");
 					goto fail;
@@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	}
 
 	if (bio) {
-		rc = blk_rq_append_bio(req, bio);
+		rc = blk_rq_append_bio(req, &bio);
 		if (rc) {
 			pr_err("pSCSI: failed to append bio\n");
 			goto fail;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index abd06f540863..100d0df38026 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -965,7 +965,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
-extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
+extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);

From 8b7e9d9e2d8b4de6f0d5d7a5fc63f48b1fbcf4d4 Mon Sep 17 00:00:00 2001
From: Anthony Kim <anthony.kim@hideep.com>
Date: Mon, 18 Dec 2017 11:50:48 -0800
Subject: [PATCH 234/808] Input: hideep - fix compile error due to missing
 include file

gpiod_() API requires including "linux/gpio/consumer.h". Also, we are not
using the legacy API nor the static board files descriptions, so no need to
include gpio.h nor gpio/machine.h.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Anthony Kim <anthony.kim@hideep.com>
Patchwork-Id: 10094831
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/hideep.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/input/touchscreen/hideep.c b/drivers/input/touchscreen/hideep.c
index fc080a7c2e1f..f1cd4dd9a4a3 100644
--- a/drivers/input/touchscreen/hideep.c
+++ b/drivers/input/touchscreen/hideep.c
@@ -10,8 +10,7 @@
 #include <linux/of.h>
 #include <linux/firmware.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
-#include <linux/gpio/machine.h>
+#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
 #include <linux/acpi.h>
 #include <linux/interrupt.h>

From 34112bf4935dabe3c1d1fd42842ed771e279bf61 Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Mon, 6 Nov 2017 16:20:33 +0100
Subject: [PATCH 235/808] drm/nouveau/fbcon: fix NULL pointer access in
 nouveau_fbcon_destroy

When the fbcon object is initialized, but nouveau_fbcon_create is not
called, we run into a NULL pointer access within nouveau_fbcon_create when
unloading nouveau.

The call to drm_fb_helper_funcs.fb_probe is deferred until there is a
display for real since 4.14, that's why fbcon->helper.fb is still not set.

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_fbcon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
index c533d8e04afc..be7357bf2246 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
@@ -429,7 +429,7 @@ nouveau_fbcon_destroy(struct drm_device *dev, struct nouveau_fbdev *fbcon)
 	drm_fb_helper_unregister_fbi(&fbcon->helper);
 	drm_fb_helper_fini(&fbcon->helper);
 
-	if (nouveau_fb->nvbo) {
+	if (nouveau_fb && nouveau_fb->nvbo) {
 		nouveau_vma_del(&nouveau_fb->vma);
 		nouveau_bo_unmap(nouveau_fb->nvbo);
 		nouveau_bo_unpin(nouveau_fb->nvbo);

From f60707a69a225f2dd87f42628b44e24ceb219d28 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 7 Dec 2017 10:49:35 +1000
Subject: [PATCH 236/808] drm/nouveau/bios/dp: support DP Info Table 2.0

Reported-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c
index 972370ed36f0..7c7efa4ea0d0 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c
@@ -36,6 +36,7 @@ nvbios_dp_table(struct nvkm_bios *bios, u8 *ver, u8 *hdr, u8 *cnt, u8 *len)
 			if (data) {
 				*ver = nvbios_rd08(bios, data + 0x00);
 				switch (*ver) {
+				case 0x20:
 				case 0x21:
 				case 0x30:
 				case 0x40:
@@ -63,6 +64,7 @@ nvbios_dpout_entry(struct nvkm_bios *bios, u8 idx,
 	if (data && idx < *cnt) {
 		u16 outp = nvbios_rd16(bios, data + *hdr + idx * *len);
 		switch (*ver * !!outp) {
+		case 0x20:
 		case 0x21:
 		case 0x30:
 			*hdr = nvbios_rd08(bios, data + 0x04);
@@ -96,12 +98,16 @@ nvbios_dpout_parse(struct nvkm_bios *bios, u8 idx,
 		info->type = nvbios_rd16(bios, data + 0x00);
 		info->mask = nvbios_rd16(bios, data + 0x02);
 		switch (*ver) {
+		case 0x20:
+			info->mask |= 0x00c0; /* match any link */
+			/* fall-through */
 		case 0x21:
 		case 0x30:
 			info->flags     = nvbios_rd08(bios, data + 0x05);
 			info->script[0] = nvbios_rd16(bios, data + 0x06);
 			info->script[1] = nvbios_rd16(bios, data + 0x08);
-			info->lnkcmp    = nvbios_rd16(bios, data + 0x0a);
+			if (*len >= 0x0c)
+				info->lnkcmp    = nvbios_rd16(bios, data + 0x0a);
 			if (*len >= 0x0f) {
 				info->script[2] = nvbios_rd16(bios, data + 0x0c);
 				info->script[3] = nvbios_rd16(bios, data + 0x0e);
@@ -170,6 +176,7 @@ nvbios_dpcfg_parse(struct nvkm_bios *bios, u16 outp, u8 idx,
 	memset(info, 0x00, sizeof(*info));
 	if (data) {
 		switch (*ver) {
+		case 0x20:
 		case 0x21:
 			info->dc    = nvbios_rd08(bios, data + 0x02);
 			info->pe    = nvbios_rd08(bios, data + 0x03);

From 81a24b9ae8eea95b74337c253059da761043ed06 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 7 Dec 2017 11:08:52 +1000
Subject: [PATCH 237/808] drm/nouveau/imem/nv50: fix refcount_t warning

Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c
index 1ba7289684aa..db48a1daca0c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c
@@ -249,7 +249,7 @@ nv50_instobj_acquire(struct nvkm_memory *memory)
 			iobj->base.memory.ptrs = &nv50_instobj_fast;
 		else
 			iobj->base.memory.ptrs = &nv50_instobj_slow;
-		refcount_inc(&iobj->maps);
+		refcount_set(&iobj->maps, 1);
 	}
 
 	mutex_unlock(&imem->subdev.mutex);

From a121027d2747168df0aac0c3da35509eea39f61c Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Fri, 24 Nov 2017 03:56:26 +0100
Subject: [PATCH 238/808] drm/nouveau/pci: do a msi rearm on init

On my GP107 when I load nouveau after unloading it, for some reason the
GPU stopped sending or the CPU stopped receiving interrupts if MSI was
enabled.

Doing a rearm once before getting any interrupts fixes this.

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
index b1b1f3626b96..deb96de54b00 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
@@ -136,6 +136,13 @@ nvkm_pci_init(struct nvkm_subdev *subdev)
 		return ret;
 
 	pci->irq = pdev->irq;
+
+	/* Ensure MSI interrupts are armed, for the case where there are
+	 * already interrupts pending (for whatever reason) at load time.
+	 */
+	if (pci->msi)
+		pci->func->msi_rearm(pci);
+
 	return ret;
 }
 

From 6cb0f2a39d3b7ccdd7269af4ddadb38e78aee744 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 7 Dec 2017 15:04:32 +1000
Subject: [PATCH 239/808] drm/nouveau/mmu/gp10b: use correct implementation

Reported-by: Mikko Perttunen <cyndis@kapsi.fi>
Fixes: 6359c98224 ("drm/nouveau/mmu/gp10b: fork from gf100")
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Tested-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c
index e14643615698..00eeaaffeae5 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c
@@ -2369,7 +2369,7 @@ nv13b_chipset = {
 	.imem = gk20a_instmem_new,
 	.ltc = gp100_ltc_new,
 	.mc = gp10b_mc_new,
-	.mmu = gf100_mmu_new,
+	.mmu = gp10b_mmu_new,
 	.secboot = gp10b_secboot_new,
 	.pmu = gm20b_pmu_new,
 	.timer = gk20a_timer_new,

From f29f18eb952bc3e71deedf8bd8fc902f66853c48 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 7 Dec 2017 15:25:14 +1000
Subject: [PATCH 240/808] drm/nouveau: avoid GPU page sizes > PAGE_SIZE for
 buffer objects in host memory

While the Tegra (GK20A, GM20B, GP10B) MMUs support large pages in host
memory, we're currently lacking IOMMU support for merging system pages
into large enough chunks to be mapped as such by the GPU.

The core VMM code actually supports automatically determining the best
page size to map with, which is intended for these situations, but for
various complicated reasons the DRM is currently forcing the page size
selection on a per-BO basis.

This should fix breakage reported on Tegra GPUs in the meantime, until
one or both of the above issues are resolved properly.

Reported-by: Mikko Perttunen <cyndis@kapsi.fi>
Fixes: 7dc6a446da7c ("drm/nouveau: improve selection of GPU page size")
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Tested-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 2615912430cc..42c1827bbb8e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -262,7 +262,8 @@ nouveau_bo_new(struct nouveau_cli *cli, u64 size, int align,
 		if (cli->device.info.family > NV_DEVICE_INFO_V0_CURIE &&
 		    (flags & TTM_PL_FLAG_VRAM) && !vmm->page[i].vram)
 			continue;
-		if ((flags & TTM_PL_FLAG_TT  ) && !vmm->page[i].host)
+		if ((flags & TTM_PL_FLAG_TT) &&
+		    (!vmm->page[i].host || vmm->page[i].shift > PAGE_SHIFT))
 			continue;
 
 		/* Select this page size if it's the first that supports

From 74a39954a4900a7dea7010e3063e2bf16b23934b Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Thu, 14 Dec 2017 11:19:27 +1000
Subject: [PATCH 241/808] drm/nouveau: use alternate memory type for
 system-memory buffers with kind != 0

Fixes bug on Tegra where we'd strip kind information from system memory
(ie. all) buffers, resulting in misrendering.

Behaviour on dGPU should be unchanged.

Reported-by: Thierry Reding <treding@nvidia.com>
Fixes: d7722134b8 ("drm/nouveau: switch over to new memory and vmm interfaces")
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Tested-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c  |  2 +-
 drivers/gpu/drm/nouveau/nouveau_drv.h | 11 +++++--
 drivers/gpu/drm/nouveau/nouveau_mem.c |  6 ++--
 drivers/gpu/drm/nouveau/nouveau_ttm.c | 41 ++++++++++++++++++++-------
 4 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 42c1827bbb8e..435ff8662cfa 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -224,7 +224,7 @@ nouveau_bo_new(struct nouveau_cli *cli, u64 size, int align,
 		/* Determine if we can get a cache-coherent map, forcing
 		 * uncached mapping if we can't.
 		 */
-		if (mmu->type[drm->ttm.type_host].type & NVIF_MEM_UNCACHED)
+		if (!nouveau_drm_use_coherent_gpu_mapping(drm))
 			nvbo->force_coherent = true;
 	}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index e86b8220a4bb..6a1b1debe5b8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -156,8 +156,8 @@ struct nouveau_drm {
 		struct nvif_object copy;
 		int mtrr;
 		int type_vram;
-		int type_host;
-		int type_ncoh;
+		int type_host[2];
+		int type_ncoh[2];
 	} ttm;
 
 	/* GEM interface support */
@@ -216,6 +216,13 @@ nouveau_drm(struct drm_device *dev)
 	return dev->dev_private;
 }
 
+static inline bool
+nouveau_drm_use_coherent_gpu_mapping(struct nouveau_drm *drm)
+{
+	struct nvif_mmu *mmu = &drm->client.mmu;
+	return !(mmu->type[drm->ttm.type_host[0]].type & NVIF_MEM_UNCACHED);
+}
+
 int nouveau_pmops_suspend(struct device *);
 int nouveau_pmops_resume(struct device *);
 bool nouveau_pmops_runtime(void);
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 589a9621db76..c002f8968507 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -103,10 +103,10 @@ nouveau_mem_host(struct ttm_mem_reg *reg, struct ttm_dma_tt *tt)
 	u8 type;
 	int ret;
 
-	if (mmu->type[drm->ttm.type_host].type & NVIF_MEM_UNCACHED)
-		type = drm->ttm.type_ncoh;
+	if (!nouveau_drm_use_coherent_gpu_mapping(drm))
+		type = drm->ttm.type_ncoh[!!mem->kind];
 	else
-		type = drm->ttm.type_host;
+		type = drm->ttm.type_host[0];
 
 	if (mem->kind && !(mmu->type[type].type & NVIF_MEM_KIND))
 		mem->comp = mem->kind = 0;
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index 08b974b30482..dff51a0ee028 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -235,6 +235,27 @@ nouveau_ttm_global_release(struct nouveau_drm *drm)
 	drm->ttm.mem_global_ref.release = NULL;
 }
 
+static int
+nouveau_ttm_init_host(struct nouveau_drm *drm, u8 kind)
+{
+	struct nvif_mmu *mmu = &drm->client.mmu;
+	int typei;
+
+	typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE |
+					    kind | NVIF_MEM_COHERENT);
+	if (typei < 0)
+		return -ENOSYS;
+
+	drm->ttm.type_host[!!kind] = typei;
+
+	typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE | kind);
+	if (typei < 0)
+		return -ENOSYS;
+
+	drm->ttm.type_ncoh[!!kind] = typei;
+	return 0;
+}
+
 int
 nouveau_ttm_init(struct nouveau_drm *drm)
 {
@@ -244,18 +265,16 @@ nouveau_ttm_init(struct nouveau_drm *drm)
 	struct drm_device *dev = drm->dev;
 	int typei, ret;
 
-	typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE |
-						   NVIF_MEM_COHERENT);
-	if (typei < 0)
-		return -ENOSYS;
+	ret = nouveau_ttm_init_host(drm, 0);
+	if (ret)
+		return ret;
 
-	drm->ttm.type_host = typei;
-
-	typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE);
-	if (typei < 0)
-		return -ENOSYS;
-
-	drm->ttm.type_ncoh = typei;
+	if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA &&
+	    drm->client.device.info.chipset != 0x50) {
+		ret = nouveau_ttm_init_host(drm, NVIF_MEM_KIND);
+		if (ret)
+			return ret;
+	}
 
 	if (drm->client.device.info.platform != NV_DEVICE_INFO_V0_SOC &&
 	    drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) {

From c682ccc4962a8fab949e1f2d7325b3e825dbf6d1 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 18 Dec 2017 14:09:57 +0100
Subject: [PATCH 242/808] bpf: fix broken BPF selftest build on s390

With 720f228e8d31 ("bpf: fix broken BPF selftest build") the
inclusion of arch-specific header files changed.  Including the
asm/bpf_perf_event.h on s390, correctly includes the s390 specific
header file.  This header file tries then to include the s390
asm/ptrace.h and the build fails with:

cc -Wall -O2 -I../../../include/uapi -I../../../lib -I../../../../include/generated  -I../../../include    test_verifier.c
+/root/git/linux/tools/testing/selftests/bpf/libbpf.a /root/git/linux/tools/testing/selftests/bpf/cgroup_helpers.c -lcap -lelf -o
+/root/git/linux/tools/testing/selftests/bpf/test_verifier
In file included from ../../../include/uapi/asm/bpf_perf_event.h:4:0,
                 from ../../../include/uapi/linux/bpf_perf_event.h:11,
                 from test_verifier.c:29:
../../../include/uapi/../../arch/s390/include/uapi/asm/bpf_perf_event.h:7:9: error: unknown type name 'user_pt_regs'
 typedef user_pt_regs bpf_user_pt_regs_t;
         ^~~~~~~~~~~~
make: *** [../lib.mk:109: /root/git/linux/tools/testing/selftests/bpf/test_verifier] Error 1

This is caused by a recent update to the s390 asm/ptrace.h file
that is not (yet) available in the local installation.  That means,
the s390 asm/ptrace.h must be included from the tools/arch/s390
directory.

Because there is no proper framework to deal with asm specific
includes in tools/, slightly modify the s390 asm/bpf_perf_event.h
to include the local ptrace.h header file.

See also discussion on
https://marc.info/?l=linux-s390&m=151359424420691&w=2

Please note that this needs to be preserved until tools/ is able to
correctly handle asm specific headers.

References: https://marc.info/?l=linux-s390&m=151359424420691&w=2
Fixes: 720f228e8d31 ("bpf: fix broken BPF selftest build")
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/arch/s390/include/uapi/asm/bpf_perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/arch/s390/include/uapi/asm/bpf_perf_event.h b/tools/arch/s390/include/uapi/asm/bpf_perf_event.h
index cefe7c7cd4f6..0a8e37a519f2 100644
--- a/tools/arch/s390/include/uapi/asm/bpf_perf_event.h
+++ b/tools/arch/s390/include/uapi/asm/bpf_perf_event.h
@@ -2,7 +2,7 @@
 #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__
 #define _UAPI__ASM_BPF_PERF_EVENT_H__
 
-#include <asm/ptrace.h>
+#include "ptrace.h"
 
 typedef user_pt_regs bpf_user_pt_regs_t;
 

From 182dc9c7f217146d69d9c0b75c150c0314b9b170 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 18 Dec 2017 16:33:36 +1100
Subject: [PATCH 243/808] powerpc/kernel: Print actual address of regs when
 oopsing

When we oops or otherwise call show_regs() we print the address of the
regs structure. Being able to see the address is fairly useful,
firstly to verify that the regs pointer is not completely bogus, and
secondly it allows you to dump the regs and surrounding memory with a
debugger if you have one.

In the normal case the regs will be located somewhere on the stack, so
printing their location discloses no further information than printing
the stack pointer does already.

So switch to %px and print the actual address, not the hashed value.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5acb5a176dbe..72be0c32e902 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)
 
 	printk("NIP:  "REG" LR: "REG" CTR: "REG"\n",
 	       regs->nip, regs->link, regs->ctr);
-	printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
+	printk("REGS: %px TRAP: %04lx   %s  (%s)\n",
 	       regs, regs->trap, print_tainted(), init_utsname()->release);
 	printk("MSR:  "REG" ", regs->msr);
 	print_msr_bits(regs->msr);

From 81b6c999897919d5a16fedc018fe375dbab091c5 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 13 Dec 2017 14:21:37 +0100
Subject: [PATCH 244/808] scsi: core: check for device state in
 __scsi_remove_target()

As it turned out device_get() doesn't use kref_get_unless_zero(), so we
will be always getting a device pointer.  Consequently, we need to check
for the device state in __scsi_remove_target() to avoid tripping over
deleted objects.

Fixes: fbce4d97fd43 ("scsi: fixup kernel warning during rmmod()")
Reported-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ewan D. Milne <emilne@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_sysfs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index a9996c16f4ae..26ce17178401 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1415,7 +1415,10 @@ static void __scsi_remove_target(struct scsi_target *starget)
 		 * check.
 		 */
 		if (sdev->channel != starget->channel ||
-		    sdev->id != starget->id ||
+		    sdev->id != starget->id)
+			continue;
+		if (sdev->sdev_state == SDEV_DEL ||
+		    sdev->sdev_state == SDEV_CANCEL ||
 		    !get_device(&sdev->sdev_gendev))
 			continue;
 		spin_unlock_irqrestore(shost->host_lock, flags);

From 5a15f289ee87eaf33f13f08a4909ec99d837ec5f Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 18 Dec 2017 23:36:57 +0100
Subject: [PATCH 245/808] ALSA: usb-audio: Fix the missing ctl name suffix at
 parsing SU

The commit 89b89d121ffc ("ALSA: usb-audio: Add check return value for
usb_string()") added the check of the return value from
snd_usb_copy_string_desc(), which is correct per se, but it introduced
a regression.  In the original code, either the "Clock Source",
"Playback Source" or "Capture Source" suffix is added after the
terminal string, while the commit changed it to add the suffix only
when get_term_name() is failing.  It ended up with an incorrect ctl
name like "PCM" instead of "PCM Capture Source".

Also, even the original code has a similar bug: when the ctl name is
generated from snd_usb_copy_string_desc() for the given iSelector, it
also doesn't put the suffix.

This patch addresses these issues: the suffix is added always when no
static mapping is found.  Also the patch tries to put more comments
and cleans up the if/else block for better readability in order to
avoid the same pitfall again.

Fixes: 89b89d121ffc ("ALSA: usb-audio: Add check return value for usb_string()")
Reported-and-tested-by: Mauro Santos <registo.mailling@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/mixer.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index afc208e1c756..60ebc99ae323 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -2173,20 +2173,25 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid,
 	kctl->private_value = (unsigned long)namelist;
 	kctl->private_free = usb_mixer_selector_elem_free;
 
-	nameid = uac_selector_unit_iSelector(desc);
+	/* check the static mapping table at first */
 	len = check_mapped_name(map, kctl->id.name, sizeof(kctl->id.name));
-	if (len)
-		;
-	else if (nameid)
-		len = snd_usb_copy_string_desc(state, nameid, kctl->id.name,
-					 sizeof(kctl->id.name));
-	else
-		len = get_term_name(state, &state->oterm,
-				    kctl->id.name, sizeof(kctl->id.name), 0);
-
 	if (!len) {
-		strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name));
+		/* no mapping ? */
+		/* if iSelector is given, use it */
+		nameid = uac_selector_unit_iSelector(desc);
+		if (nameid)
+			len = snd_usb_copy_string_desc(state, nameid,
+						       kctl->id.name,
+						       sizeof(kctl->id.name));
+		/* ... or pick up the terminal name at next */
+		if (!len)
+			len = get_term_name(state, &state->oterm,
+				    kctl->id.name, sizeof(kctl->id.name), 0);
+		/* ... or use the fixed string "USB" as the last resort */
+		if (!len)
+			strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name));
 
+		/* and add the proper suffix */
 		if (desc->bDescriptorSubtype == UAC2_CLOCK_SELECTOR)
 			append_ctl_name(kctl, " Clock Source");
 		else if ((state->oterm.type & 0xff00) == 0x0100)

From acf568ee859f098279eadf551612f103afdacb4e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 15 Dec 2017 16:40:44 +1100
Subject: [PATCH 246/808] xfrm: Reinject transport-mode packets through tasklet

This is an old bugbear of mine:

https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html

By crafting special packets, it is possible to cause recursion
in our kernel when processing transport-mode packets at levels
that are only limited by packet size.

The easiest one is with DNAT, but an even worse one is where
UDP encapsulation is used in which case you just have to insert
an UDP encapsulation header in between each level of recursion.

This patch avoids this problem by reinjecting tranport-mode packets
through a tasklet.

Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  3 +++
 net/ipv4/xfrm4_input.c | 12 ++++++++-
 net/ipv6/xfrm6_input.c | 10 +++++++-
 net/xfrm/xfrm_input.c  | 57 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dc28a98ce97c..ae35991b5877 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1570,6 +1570,9 @@ int xfrm_init_state(struct xfrm_state *x);
 int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
 int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
+int xfrm_trans_queue(struct sk_buff *skb,
+		     int (*finish)(struct net *, struct sock *,
+				   struct sk_buff *));
 int xfrm_output_resume(struct sk_buff *skb, int err);
 int xfrm_output(struct sock *sk, struct sk_buff *skb);
 int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index e50b7fea57ee..bcfc00e88756 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
 	return xfrm4_extract_header(skb);
 }
 
+static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	return dst_input(skb);
+}
+
 static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
 					 struct sk_buff *skb)
 {
@@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
 					 iph->tos, skb->dev))
 			goto drop;
 	}
-	return dst_input(skb);
+
+	if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2))
+		goto drop;
+
+	return 0;
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index fe04e23af986..841f4a07438e 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
 }
 EXPORT_SYMBOL(xfrm6_rcv_spi);
 
+static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	if (xfrm_trans_queue(skb, ip6_rcv_finish))
+		__kfree_skb(skb);
+	return -1;
+}
+
 int xfrm6_transport_finish(struct sk_buff *skb, int async)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
@@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
-		ip6_rcv_finish);
+		xfrm6_transport_finish2);
 	return -1;
 }
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index da6447389ffb..3f6f6f8c9fa5 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -8,15 +8,29 @@
  *
  */
 
+#include <linux/bottom_half.h>
+#include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/percpu.h>
 #include <net/dst.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/ip_tunnels.h>
 #include <net/ip6_tunnel.h>
 
+struct xfrm_trans_tasklet {
+	struct tasklet_struct tasklet;
+	struct sk_buff_head queue;
+};
+
+struct xfrm_trans_cb {
+	int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
+};
+
+#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
+
 static struct kmem_cache *secpath_cachep __read_mostly;
 
 static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
@@ -25,6 +39,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
 static struct gro_cells gro_cells;
 static struct net_device xfrm_napi_dev;
 
+static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet);
+
 int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
 {
 	int err = 0;
@@ -477,9 +493,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
 }
 EXPORT_SYMBOL(xfrm_input_resume);
 
+static void xfrm_trans_reinject(unsigned long data)
+{
+	struct xfrm_trans_tasklet *trans = (void *)data;
+	struct sk_buff_head queue;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&queue);
+	skb_queue_splice_init(&trans->queue, &queue);
+
+	while ((skb = __skb_dequeue(&queue)))
+		XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb);
+}
+
+int xfrm_trans_queue(struct sk_buff *skb,
+		     int (*finish)(struct net *, struct sock *,
+				   struct sk_buff *))
+{
+	struct xfrm_trans_tasklet *trans;
+
+	trans = this_cpu_ptr(&xfrm_trans_tasklet);
+
+	if (skb_queue_len(&trans->queue) >= netdev_max_backlog)
+		return -ENOBUFS;
+
+	XFRM_TRANS_SKB_CB(skb)->finish = finish;
+	skb_queue_tail(&trans->queue, skb);
+	tasklet_schedule(&trans->tasklet);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_trans_queue);
+
 void __init xfrm_input_init(void)
 {
 	int err;
+	int i;
 
 	init_dummy_netdev(&xfrm_napi_dev);
 	err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
@@ -490,4 +538,13 @@ void __init xfrm_input_init(void)
 					   sizeof(struct sec_path),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 					   NULL);
+
+	for_each_possible_cpu(i) {
+		struct xfrm_trans_tasklet *trans;
+
+		trans = &per_cpu(xfrm_trans_tasklet, i);
+		__skb_queue_head_init(&trans->queue);
+		tasklet_init(&trans->tasklet, xfrm_trans_reinject,
+			     (unsigned long)trans);
+	}
 }

From 6454b3bdd138dfc640deb5e7b9a0668fca2d55dd Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Dec 2017 15:13:44 -0600
Subject: [PATCH 247/808] x86/stacktrace: Make zombie stack traces reliable

Commit:

  1959a60182f4 ("x86/dumpstack: Pin the target stack when dumping it")

changed the behavior of stack traces for zombies.  Before that commit,
/proc/<pid>/stack reported the last execution path of the zombie before
it died:

  [<ffffffff8105b877>] do_exit+0x6f7/0xa80
  [<ffffffff8105bc79>] do_group_exit+0x39/0xa0
  [<ffffffff8105bcf0>] __wake_up_parent+0x0/0x30
  [<ffffffff8152dd09>] system_call_fastpath+0x16/0x1b
  [<00007fd128f9c4f9>] 0x7fd128f9c4f9
  [<ffffffffffffffff>] 0xffffffffffffffff

After the commit, it just reports an empty stack trace.

The new behavior is actually probably more correct.  If the stack
refcount has gone down to zero, then the task has already gone through
do_exit() and isn't going to run anymore.  The stack could be freed at
any time and is basically gone, so reporting an empty stack makes sense.

However, save_stack_trace_tsk_reliable() treats such a missing stack
condition as an error.  That can cause livepatch transition stalls if
there are any unreaped zombies.  Instead, just treat it as a reliable,
empty stack.

Reported-and-tested-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Fixes: af085d9084b4 ("stacktrace/x86: add function for detecting reliable stack traces")
Link: http://lkml.kernel.org/r/e4b09e630e99d0c1080528f0821fc9d9dbaeea82.1513631620.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/stacktrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 77835bc021c7..20161ef53537 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,
 {
 	int ret;
 
+	/*
+	 * If the task doesn't have a stack (e.g., a zombie), the stack is
+	 * "reliably" empty.
+	 */
 	if (!try_get_task_stack(tsk))
-		return -EINVAL;
+		return 0;
 
 	ret = __save_stack_trace_reliable(trace, tsk);
 

From b65c7b8aeac818eb8f80ce825073c12ad081b177 Mon Sep 17 00:00:00 2001
From: Adiel Aloni <adiel.aloni@intel.com>
Date: Mon, 18 Dec 2017 12:14:04 +0200
Subject: [PATCH 248/808] mac80211_hwsim: enable TODS BIT in null data frame

Same as in ieee80211_nullfunc_get, enable the TODS bit, otherwise the
nullfunc packet will not be handled in ap rx path.
(will be dropped in ieee80211_accept_frame()).

Signed-off-by: Adiel Aloni <adiel.aloni@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 10b075a46b26..59b0cedcdf7b 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -684,6 +684,7 @@ static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac,
 	hdr = skb_put(skb, sizeof(*hdr) - ETH_ALEN);
 	hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
 					 IEEE80211_STYPE_NULLFUNC |
+					 IEEE80211_FCTL_TODS |
 					 (ps ? IEEE80211_FCTL_PM : 0));
 	hdr->duration_id = cpu_to_le16(0);
 	memcpy(hdr->addr1, vp->bssid, ETH_ALEN);

From 5d32407396b0433f9b738fcfcb9599bcba7379ae Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 14 Dec 2017 14:33:38 +0100
Subject: [PATCH 249/808] cfg80211: always rewrite generated files from scratch

Currently the certs C code generation appends to the generated files,
which is most likely a leftover from commit 715a12334764 ("wireless:
don't write C files on failures"). This causes duplicate code in the
generated files if the certificates have their timestamps modified
between builds and thereby trigger the generation rules.

Fixes: 715a12334764 ("wireless: don't write C files on failures")
Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index d7d6cb00c47b..b662be3422e1 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -43,7 +43,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509)
 	      echo "$$allf"; \
 	      echo '};'; \
 	      echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
-	  ) >> $@)
+	  ) > $@)
 
 $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 		      $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509)
@@ -66,4 +66,4 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 	      echo "$$allf"; \
 	      echo '};'; \
 	      echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \
-	  ) >> $@)
+	  ) > $@)

From 162bd5e5fd921785077b5862d8f2ffabe2fe11e5 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@163.com>
Date: Tue, 12 Dec 2017 17:26:36 +0800
Subject: [PATCH 250/808] mac80211_hwsim: Fix a possible sleep-in-atomic bug in
 hwsim_get_radio_nl

The driver may sleep under a spinlock.
The function call path is:
hwsim_get_radio_nl (acquire the spinlock)
  nlmsg_new(GFP_KERNEL) --> may sleep

To fix it, GFP_KERNEL is replaced with GFP_ATOMIC.

This bug is found by my static analysis tool(DSAC) and checked by my code review.

Signed-off-by: Jia-Ju Bai <baijiaju1990@163.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 59b0cedcdf7b..e8189c07b41f 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -3216,7 +3216,7 @@ static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info)
 		if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info)))
 			continue;
 
-		skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+		skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
 		if (!skb) {
 			res = -ENOMEM;
 			goto out_err;

From 958a1b5a5ed02a768eb27760268251af93090caf Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 11 Dec 2017 15:37:49 -0700
Subject: [PATCH 251/808] nl80211: Remove obsolete kerneldoc line

Commit ca986ad9bcd3 (nl80211: allow multiple active scheduled scan
requests) removed WIPHY_FLAG_SUPPORTS_SCHED_SCAN but left the kerneldoc
description in place, leading to this docs-build warning:

   ./include/net/cfg80211.h:3278: warning: Excess enum value
           'WIPHY_FLAG_SUPPORTS_SCHED_SCAN' description in 'wiphy_flags'

Remove the line and gain a bit of peace.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8b8118a7fadb..cb4d92b79cd9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3226,7 +3226,6 @@ struct cfg80211_ops {
  * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN.
  * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing
  *	auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH.
- * @WIPHY_FLAG_SUPPORTS_SCHED_SCAN: The device supports scheduled scans.
  * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the
  *	firmware.
  * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP.

From 04a7279ff12fc47b657f70731d401c0064f5838a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Dec 2017 09:26:17 +0100
Subject: [PATCH 252/808] cfg80211: ship certificates as hex files

Not only does this remove the need for the hexdump code in most
normal kernel builds (still there for the extra directory), but
it also removes the need to ship binary files, which apparently
is somewhat problematic, as Randy reported.

While at it, also add the generated files to clean-files.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Makefile            |  29 ++++-------
 net/wireless/certs/sforshee.hex  |  86 +++++++++++++++++++++++++++++++
 net/wireless/certs/sforshee.x509 | Bin 680 -> 0 bytes
 3 files changed, 95 insertions(+), 20 deletions(-)
 create mode 100644 net/wireless/certs/sforshee.hex
 delete mode 100644 net/wireless/certs/sforshee.x509

diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index b662be3422e1..1d84f91bbfb0 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -23,27 +23,14 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),)
 cfg80211-y += extra-certs.o
 endif
 
-$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509)
+$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex)
 	@$(kecho) "  GEN     $@"
-	@(set -e; \
-	  allf=""; \
-	  for f in $^ ; do \
-	      # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \
-	      thisf=$$(od -An -v -tx1 < $$f | \
-	                   sed -e 's/ /\n/g' | \
-	                   sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \
-	                   sed -e 's/^/0x/;s/$$/,/'); \
-	      # file should not be empty - maybe command substitution failed? \
-	      test ! -z "$$thisf";\
-	      allf=$$allf$$thisf;\
-	  done; \
-	  ( \
-	      echo '#include "reg.h"'; \
-	      echo 'const u8 shipped_regdb_certs[] = {'; \
-	      echo "$$allf"; \
-	      echo '};'; \
-	      echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
-	  ) > $@)
+	@(echo '#include "reg.h"'; \
+	  echo 'const u8 shipped_regdb_certs[] = {'; \
+	  cat $^ ; \
+	  echo '};'; \
+	  echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
+	 ) > $@
 
 $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 		      $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509)
@@ -67,3 +54,5 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 	      echo '};'; \
 	      echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \
 	  ) > $@)
+
+clean-files += shipped-certs.c extra-certs.c
diff --git a/net/wireless/certs/sforshee.hex b/net/wireless/certs/sforshee.hex
new file mode 100644
index 000000000000..14ea66643ffa
--- /dev/null
+++ b/net/wireless/certs/sforshee.hex
@@ -0,0 +1,86 @@
+/* Seth Forshee's regdb certificate */
+0x30, 0x82, 0x02, 0xa4, 0x30, 0x82, 0x01, 0x8c,
+0x02, 0x09, 0x00, 0xb2, 0x8d, 0xdf, 0x47, 0xae,
+0xf9, 0xce, 0xa7, 0x30, 0x0d, 0x06, 0x09, 0x2a,
+0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b,
+0x05, 0x00, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f,
+0x06, 0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73,
+0x66, 0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30,
+0x20, 0x17, 0x0d, 0x31, 0x37, 0x31, 0x30, 0x30,
+0x36, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35, 0x5a,
+0x18, 0x0f, 0x32, 0x31, 0x31, 0x37, 0x30, 0x39,
+0x31, 0x32, 0x31, 0x39, 0x34, 0x30, 0x33, 0x35,
+0x5a, 0x30, 0x13, 0x31, 0x11, 0x30, 0x0f, 0x06,
+0x03, 0x55, 0x04, 0x03, 0x0c, 0x08, 0x73, 0x66,
+0x6f, 0x72, 0x73, 0x68, 0x65, 0x65, 0x30, 0x82,
+0x01, 0x22, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86,
+0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05,
+0x00, 0x03, 0x82, 0x01, 0x0f, 0x00, 0x30, 0x82,
+0x01, 0x0a, 0x02, 0x82, 0x01, 0x01, 0x00, 0xb5,
+0x40, 0xe3, 0x9c, 0x28, 0x84, 0x39, 0x03, 0xf2,
+0x39, 0xd7, 0x66, 0x2c, 0x41, 0x38, 0x15, 0xac,
+0x7e, 0xa5, 0x83, 0x71, 0x25, 0x7e, 0x90, 0x7c,
+0x68, 0xdd, 0x6f, 0x3f, 0xd9, 0xd7, 0x59, 0x38,
+0x9f, 0x7c, 0x6a, 0x52, 0xc2, 0x03, 0x2a, 0x2d,
+0x7e, 0x66, 0xf4, 0x1e, 0xb3, 0x12, 0x70, 0x20,
+0x5b, 0xd4, 0x97, 0x32, 0x3d, 0x71, 0x8b, 0x3b,
+0x1b, 0x08, 0x17, 0x14, 0x6b, 0x61, 0xc4, 0x57,
+0x8b, 0x96, 0x16, 0x1c, 0xfd, 0x24, 0xd5, 0x0b,
+0x09, 0xf9, 0x68, 0x11, 0x84, 0xfb, 0xca, 0x51,
+0x0c, 0xd1, 0x45, 0x19, 0xda, 0x10, 0x44, 0x8a,
+0xd9, 0xfe, 0x76, 0xa9, 0xfd, 0x60, 0x2d, 0x18,
+0x0b, 0x28, 0x95, 0xb2, 0x2d, 0xea, 0x88, 0x98,
+0xb8, 0xd1, 0x56, 0x21, 0xf0, 0x53, 0x1f, 0xf1,
+0x02, 0x6f, 0xe9, 0x46, 0x9b, 0x93, 0x5f, 0x28,
+0x90, 0x0f, 0xac, 0x36, 0xfa, 0x68, 0x23, 0x71,
+0x57, 0x56, 0xf6, 0xcc, 0xd3, 0xdf, 0x7d, 0x2a,
+0xd9, 0x1b, 0x73, 0x45, 0xeb, 0xba, 0x27, 0x85,
+0xef, 0x7a, 0x7f, 0xa5, 0xcb, 0x80, 0xc7, 0x30,
+0x36, 0xd2, 0x53, 0xee, 0xec, 0xac, 0x1e, 0xe7,
+0x31, 0xf1, 0x36, 0xa2, 0x9c, 0x63, 0xc6, 0x65,
+0x5b, 0x7f, 0x25, 0x75, 0x68, 0xa1, 0xea, 0xd3,
+0x7e, 0x00, 0x5c, 0x9a, 0x5e, 0xd8, 0x20, 0x18,
+0x32, 0x77, 0x07, 0x29, 0x12, 0x66, 0x1e, 0x36,
+0x73, 0xe7, 0x97, 0x04, 0x41, 0x37, 0xb1, 0xb1,
+0x72, 0x2b, 0xf4, 0xa1, 0x29, 0x20, 0x7c, 0x96,
+0x79, 0x0b, 0x2b, 0xd0, 0xd8, 0xde, 0xc8, 0x6c,
+0x3f, 0x93, 0xfb, 0xc5, 0xee, 0x78, 0x52, 0x11,
+0x15, 0x1b, 0x7a, 0xf6, 0xe2, 0x68, 0x99, 0xe7,
+0xfb, 0x46, 0x16, 0x84, 0xe3, 0xc7, 0xa1, 0xe6,
+0xe0, 0xd2, 0x46, 0xd5, 0xe1, 0xc4, 0x5f, 0xa0,
+0x66, 0xf4, 0xda, 0xc4, 0xff, 0x95, 0x1d, 0x02,
+0x03, 0x01, 0x00, 0x01, 0x30, 0x0d, 0x06, 0x09,
+0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01,
+0x0b, 0x05, 0x00, 0x03, 0x82, 0x01, 0x01, 0x00,
+0x87, 0x03, 0xda, 0xf2, 0x82, 0xc2, 0xdd, 0xaf,
+0x7c, 0x44, 0x2f, 0x86, 0xd3, 0x5f, 0x4c, 0x93,
+0x48, 0xb9, 0xfe, 0x07, 0x17, 0xbb, 0x21, 0xf7,
+0x25, 0x23, 0x4e, 0xaa, 0x22, 0x0c, 0x16, 0xb9,
+0x73, 0xae, 0x9d, 0x46, 0x7c, 0x75, 0xd9, 0xc3,
+0x49, 0x57, 0x47, 0xbf, 0x33, 0xb7, 0x97, 0xec,
+0xf5, 0x40, 0x75, 0xc0, 0x46, 0x22, 0xf0, 0xa0,
+0x5d, 0x9c, 0x79, 0x13, 0xa1, 0xff, 0xb8, 0xa3,
+0x2f, 0x7b, 0x8e, 0x06, 0x3f, 0xc8, 0xb6, 0xe4,
+0x6a, 0x28, 0xf2, 0x34, 0x5c, 0x23, 0x3f, 0x32,
+0xc0, 0xe6, 0xad, 0x0f, 0xac, 0xcf, 0x55, 0x74,
+0x47, 0x73, 0xd3, 0x01, 0x85, 0xb7, 0x0b, 0x22,
+0x56, 0x24, 0x7d, 0x9f, 0x09, 0xa9, 0x0e, 0x86,
+0x9e, 0x37, 0x5b, 0x9c, 0x6d, 0x02, 0xd9, 0x8c,
+0xc8, 0x50, 0x6a, 0xe2, 0x59, 0xf3, 0x16, 0x06,
+0xea, 0xb2, 0x42, 0xb5, 0x58, 0xfe, 0xba, 0xd1,
+0x81, 0x57, 0x1a, 0xef, 0xb2, 0x38, 0x88, 0x58,
+0xf6, 0xaa, 0xc4, 0x2e, 0x8b, 0x5a, 0x27, 0xe4,
+0xa5, 0xe8, 0xa4, 0xca, 0x67, 0x5c, 0xac, 0x72,
+0x67, 0xc3, 0x6f, 0x13, 0xc3, 0x2d, 0x35, 0x79,
+0xd7, 0x8a, 0xe7, 0xf5, 0xd4, 0x21, 0x30, 0x4a,
+0xd5, 0xf6, 0xa3, 0xd9, 0x79, 0x56, 0xf2, 0x0f,
+0x10, 0xf7, 0x7d, 0xd0, 0x51, 0x93, 0x2f, 0x47,
+0xf8, 0x7d, 0x4b, 0x0a, 0x84, 0x55, 0x12, 0x0a,
+0x7d, 0x4e, 0x3b, 0x1f, 0x2b, 0x2f, 0xfc, 0x28,
+0xb3, 0x69, 0x34, 0xe1, 0x80, 0x80, 0xbb, 0xe2,
+0xaf, 0xb9, 0xd6, 0x30, 0xf1, 0x1d, 0x54, 0x87,
+0x23, 0x99, 0x9f, 0x51, 0x03, 0x4c, 0x45, 0x7d,
+0x02, 0x65, 0x73, 0xab, 0xfd, 0xcf, 0x94, 0xcc,
+0x0d, 0x3a, 0x60, 0xfd, 0x3c, 0x14, 0x2f, 0x16,
+0x33, 0xa9, 0x21, 0x1f, 0xcb, 0x50, 0xb1, 0x8f,
+0x03, 0xee, 0xa0, 0x66, 0xa9, 0x16, 0x79, 0x14,
diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509
deleted file mode 100644
index c6f8f9d6b98839048822ebbe27ecb831614ccf3d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 680
zcmXqLVp?L*#Mr~c$*`&SzWchL=aw7rvT<s)d9;1!Wn|=LWiSvn6g1#xV-96u=HVz#
z%P%U<NKG|R5a%^CH#9IXGqf}@FgA^n;5RZfG&is`G=d6X*Ve?Sgls$`D+6;ABR>Ps
z1zb!`jEoFh9UjloXt8AeWO+SJ$I(J`P2JMwLe;tnH5qsF?QdR>w3uI$6?BMMOSdlV
zi`-_R0)^-+(~WEkyRD@;#6_|bkA!zm6O;L?a+RC&XNF+Q?^A(17hNT93Al9K{8zT}
zZ-TA_x5m^>y01EB?6?@F_#s&SBUAoMx7m~9H74+{G5eLFTo@kq?abx-wOTi&i(Oyu
zQg3}<RloFf!*K(%OTq8ntdV<e_|a_9oaAGv(e<jO84F)su49Oq6?a2H!l;~GQz%W&
ztoZqK7Dw}q8;i8REYwt}nO4cIec{Hv6FK&ie;<8U5hN%oUG?oz#?0rx-NafRA7A+F
z!6mn=50AtzNc(c@$p5LbOw5c7jL0Dbi~wK=F*39>-}=;a=<fO&7yY)&@jjD1cK%}*
z->vvvRoQQq5|7x<;&pS~YD#Y&_6&F5Z@hi_o39R~2i%lCEQp;`DZKFij>Y=beQfq8
zwmr$x_+%2JY;Sbn*;@WJ=R-@}i!U>_Zs%4CQ>mTLxstDKo_X|~T&9~nCjzn_MSd1z
zd$q}FYs9}@7aPN+-fyz#i1@bZh+cP;`je$EmYhnDSyPmLIA8d%u4(1<uIFE`C>nTO
z{kHgKW!NWvf$y~!0w?Rc|ETrmY6%tMs`ay$*Vg}|u{qP^VMD|2N9%W9Gx#VQ(ylyn
seju}tYb{f1@#??lr<~!nO89FdqAzB=Qc?bNz{Y;&cMH;1idBjL01XcsegFUf


From eac6a3639decefcc8eb0941dd3cebe79993670ad Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Thu, 7 Dec 2017 16:58:59 +0100
Subject: [PATCH 253/808] ARM: dts: sun8i: a711: Reinstate the PMIC compatible

When we added the regulator support in commit 90c5d7cdae64 ("ARM: dts:
sun8i: a711: Add regulator support"), we also dropped the PMIC's
compatible. Since it's not in the PMIC DTSI, unlike most other PMIC
DTSI, it obviously wasn't probing anymore.

Re-add it so that everything works again.

Fixes: 90c5d7cdae64 ("ARM: dts: sun8i: a711: Add regulator support")
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
---
 arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
index 98715538932f..a021ee6da396 100644
--- a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
+++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts
@@ -146,6 +146,7 @@
 	status = "okay";
 
 	axp81x: pmic@3a3 {
+		compatible = "x-powers,axp813";
 		reg = <0x3a3>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <0 IRQ_TYPE_LEVEL_LOW>;

From 92411f6d7f1afcc95e54295d40e96a75385212ec Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Thu, 7 Dec 2017 16:58:50 +0100
Subject: [PATCH 254/808] drm/sun4i: Fix error path handling

The commit 4c7f16d14a33 ("drm/sun4i: Fix TCON clock and regmap
initialization sequence") moved a bunch of logic around, but forgot to
update the gotos after the introduction of the err_free_dotclock label.

It means that if we fail later that the one introduced in that commit,
we'll just to the old label which isn't free the clock we created. This
will result in a breakage as soon as someone tries to do something with
that clock, since its resources will have been long reclaimed.

Cc: <stable@vger.kernel.org>
Fixes: 4c7f16d14a33 ("drm/sun4i: Fix TCON clock and regmap initialization sequence")
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Link: https://patchwork.freedesktop.org/patch/msgid/f83c1cebc731f0b4251f5ddd7b38c718cd79bb0b.1512662253.git-series.maxime.ripard@free-electrons.com
---
 drivers/gpu/drm/sun4i/sun4i_tcon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c
index e122f5b2a395..f4284b51bdca 100644
--- a/drivers/gpu/drm/sun4i/sun4i_tcon.c
+++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c
@@ -724,12 +724,12 @@ static int sun4i_tcon_bind(struct device *dev, struct device *master,
 	if (IS_ERR(tcon->crtc)) {
 		dev_err(dev, "Couldn't create our CRTC\n");
 		ret = PTR_ERR(tcon->crtc);
-		goto err_free_clocks;
+		goto err_free_dotclock;
 	}
 
 	ret = sun4i_rgb_init(drm, tcon);
 	if (ret < 0)
-		goto err_free_clocks;
+		goto err_free_dotclock;
 
 	if (tcon->quirks->needs_de_be_mux) {
 		/*

From 66e900a3d225575c8b48b59ae1fe74bb6e5a65cc Mon Sep 17 00:00:00 2001
From: Radu Pirea <radu.pirea@microchip.com>
Date: Fri, 15 Dec 2017 17:40:17 +0200
Subject: [PATCH 255/808] spi: atmel: fixed spin_lock usage inside
 atmel_spi_remove

The only part of atmel_spi_remove which needs to be atomic is hardware
reset.

atmel_spi_stop_dma calls dma_terminate_all and this needs interrupts
enabled.
atmel_spi_release_dma calls dma_release_channel and dma_release_channel
locks a mutex inside of spin_lock.

So the call of these functions can't be inside a spin_lock.

Reported-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Signed-off-by: Radu Pirea <radu.pirea@microchip.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-atmel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c
index f95da364c283..669470971023 100644
--- a/drivers/spi/spi-atmel.c
+++ b/drivers/spi/spi-atmel.c
@@ -1661,12 +1661,12 @@ static int atmel_spi_remove(struct platform_device *pdev)
 	pm_runtime_get_sync(&pdev->dev);
 
 	/* reset the hardware and block queue progress */
-	spin_lock_irq(&as->lock);
 	if (as->use_dma) {
 		atmel_spi_stop_dma(master);
 		atmel_spi_release_dma(master);
 	}
 
+	spin_lock_irq(&as->lock);
 	spi_writel(as, CR, SPI_BIT(SWRST));
 	spi_writel(as, CR, SPI_BIT(SWRST)); /* AT91SAM9263 Rev B workaround */
 	spi_readl(as, SR);

From 3920bb713038810f25770e7545b79f204685c8f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZ=20Lin=20=28=E6=9E=97=E4=B8=8A=E6=99=BA=29?=
 <sz.lin@moxa.com>
Date: Tue, 19 Dec 2017 17:40:32 +0800
Subject: [PATCH 256/808] USB: serial: option: adding support for YUGA
 CLM920-NC5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for YUGA CLM920-NC5 PID 0x9625 USB modem to option
driver.

Interface layout:
0: QCDM/DIAG
1: ADB
2: MODEM
3: AT
4: RMNET

Signed-off-by: Taiyi Wu <taiyity.wu@moxa.com>
Signed-off-by: SZ Lin (林上智) <sz.lin@moxa.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/option.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index b02fb576b856..b6320e3be429 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -233,6 +233,8 @@ static void option_instat_callback(struct urb *urb);
 /* These Quectel products use Qualcomm's vendor ID */
 #define QUECTEL_PRODUCT_UC20			0x9003
 #define QUECTEL_PRODUCT_UC15			0x9090
+/* These Yuga products use Qualcomm's vendor ID */
+#define YUGA_PRODUCT_CLM920_NC5			0x9625
 
 #define QUECTEL_VENDOR_ID			0x2c7c
 /* These Quectel products use Quectel's vendor ID */
@@ -680,6 +682,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
 	.reserved = BIT(4) | BIT(5),
 };
 
+static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
+	.reserved = BIT(1) | BIT(4),
+};
+
 static const struct usb_device_id option_ids[] = {
 	{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
 	{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
@@ -1184,6 +1190,9 @@ static const struct usb_device_id option_ids[] = {
 	{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
 	{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
 	  .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
+	/* Yuga products use Qualcomm vendor ID */
+	{ USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
+	  .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
 	/* Quectel products using Quectel vendor ID */
 	{ USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
 	  .driver_info = (kernel_ulong_t)&net_intf4_blacklist },

From 07b9f12864d16c3a861aef4817eb1efccbc5d0e6 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Tue, 19 Dec 2017 11:14:42 +0200
Subject: [PATCH 257/808] USB: Fix off by one in type-specific length check of
 BOS SSP capability

USB 3.1 devices are not detected as 3.1 capable since 4.15-rc3 due to a
off by one in commit 81cf4a45360f ("USB: core: Add type-specific length
check of BOS descriptors")

It uses USB_DT_USB_SSP_CAP_SIZE() to get SSP capability size which takes
the zero based SSAC as argument, not the actual count of sublink speed
attributes.

USB3 spec 9.6.2.5 says "The number of Sublink Speed Attributes = SSAC + 1."

The type-specific length check patch was added to stable and needs to be
fixed there as well

Fixes: 81cf4a45360f ("USB: core: Add type-specific length check of BOS descriptors")
Cc: linux-stable <stable@vger.kernel.org>
CC: Masakazu Mokuno <masakazu.mokuno@gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 78e92d29f8d9..c821b4b9647e 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 		case USB_SSP_CAP_TYPE:
 			ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
 			ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
-				USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+				USB_SSP_SUBLINK_SPEED_ATTRIBS);
 			if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
 				dev->bos->ssp_cap = ssp_cap;
 			break;

From 8272d099d05f7ab2776cf56a2ab9f9443be18907 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Mon, 18 Dec 2017 17:24:22 -0700
Subject: [PATCH 258/808] usbip: vhci: stop printing kernel pointer addresses
 in messages

Remove and/or change debug, info. and error messages to not print
kernel pointer addresses.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/vhci_hcd.c | 10 ----------
 drivers/usb/usbip/vhci_rx.c  | 23 +++++++++++------------
 drivers/usb/usbip/vhci_tx.c  |  3 ++-
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 6b3278c4b72a..9efab3dc3734 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -656,9 +656,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
 	struct vhci_device *vdev;
 	unsigned long flags;
 
-	usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
-			  hcd, urb, mem_flags);
-
 	if (portnum > VHCI_HC_PORTS) {
 		pr_err("invalid port number %d\n", portnum);
 		return -ENODEV;
@@ -822,8 +819,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 	struct vhci_device *vdev;
 	unsigned long flags;
 
-	pr_info("dequeue a urb %p\n", urb);
-
 	spin_lock_irqsave(&vhci->lock, flags);
 
 	priv = urb->hcpriv;
@@ -851,7 +846,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 		/* tcp connection is closed */
 		spin_lock(&vdev->priv_lock);
 
-		pr_info("device %p seems to be disconnected\n", vdev);
 		list_del(&priv->list);
 		kfree(priv);
 		urb->hcpriv = NULL;
@@ -863,8 +857,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 		 * vhci_rx will receive RET_UNLINK and give back the URB.
 		 * Otherwise, we give back it here.
 		 */
-		pr_info("gives back urb %p\n", urb);
-
 		usb_hcd_unlink_urb_from_ep(hcd, urb);
 
 		spin_unlock_irqrestore(&vhci->lock, flags);
@@ -892,8 +884,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 
 		unlink->unlink_seqnum = priv->seqnum;
 
-		pr_info("device %p seems to be still connected\n", vdev);
-
 		/* send cmd_unlink and try to cancel the pending URB in the
 		 * peer */
 		list_add_tail(&unlink->list, &vdev->unlink_tx);
diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
index 90577e8b2282..112ebb90d8c9 100644
--- a/drivers/usb/usbip/vhci_rx.c
+++ b/drivers/usb/usbip/vhci_rx.c
@@ -23,24 +23,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
 		urb = priv->urb;
 		status = urb->status;
 
-		usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
-				urb, priv, seqnum);
+		usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
 
 		switch (status) {
 		case -ENOENT:
 			/* fall through */
 		case -ECONNRESET:
-			dev_info(&urb->dev->dev,
-				 "urb %p was unlinked %ssynchronuously.\n", urb,
-				 status == -ENOENT ? "" : "a");
+			dev_dbg(&urb->dev->dev,
+				 "urb seq# %u was unlinked %ssynchronuously\n",
+				 seqnum, status == -ENOENT ? "" : "a");
 			break;
 		case -EINPROGRESS:
 			/* no info output */
 			break;
 		default:
-			dev_info(&urb->dev->dev,
-				 "urb %p may be in a error, status %d\n", urb,
-				 status);
+			dev_dbg(&urb->dev->dev,
+				 "urb seq# %u may be in a error, status %d\n",
+				 seqnum, status);
 		}
 
 		list_del(&priv->list);
@@ -67,8 +66,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
 	spin_unlock_irqrestore(&vdev->priv_lock, flags);
 
 	if (!urb) {
-		pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
-		pr_info("max seqnum %d\n",
+		pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
+			pdu->base.seqnum,
 			atomic_read(&vhci_hcd->seqnum));
 		usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
 		return;
@@ -91,7 +90,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
 	if (usbip_dbg_flag_vhci_rx)
 		usbip_dump_urb(urb);
 
-	usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+	usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
 
 	spin_lock_irqsave(&vhci->lock, flags);
 	usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
@@ -158,7 +157,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
 		pr_info("the urb (seqnum %d) was already given back\n",
 			pdu->base.seqnum);
 	} else {
-		usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+		usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
 
 		/* If unlink is successful, status is -ECONNRESET */
 		urb->status = pdu->u.ret_unlink.status;
diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
index d625a2ff4b71..9aed15a358b7 100644
--- a/drivers/usb/usbip/vhci_tx.c
+++ b/drivers/usb/usbip/vhci_tx.c
@@ -69,7 +69,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
 		memset(&msg, 0, sizeof(msg));
 		memset(&iov, 0, sizeof(iov));
 
-		usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
+		usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
+				  priv->seqnum);
 
 		/* 1. setup usbip_header */
 		setup_cmd_submit_pdu(&pdu_header, urb);

From 248a22044366f588d46754c54dfe29ffe4f8b4df Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Mon, 18 Dec 2017 17:23:37 -0700
Subject: [PATCH 259/808] usbip: stub: stop printing kernel pointer addresses
 in messages

Remove and/or change debug, info. and error messages to not print
kernel pointer addresses.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_main.c | 5 +++--
 drivers/usb/usbip/stub_rx.c   | 7 ++-----
 drivers/usb/usbip/stub_tx.c   | 6 +++---
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 4f48b306713f..c31c8402a0c5 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -237,11 +237,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
 	struct stub_priv *priv;
 	struct urb *urb;
 
-	dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
+	dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
 
 	while ((priv = stub_priv_pop(sdev))) {
 		urb = priv->urb;
-		dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
+		dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
+			priv->seqnum);
 		usb_kill_urb(urb);
 
 		kmem_cache_free(stub_priv_cache, priv);
diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 493ac2928391..2f29be474098 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -211,9 +211,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
 		if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
 			continue;
 
-		dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
-			 priv->urb);
-
 		/*
 		 * This matched urb is not completed yet (i.e., be in
 		 * flight in usb hcd hardware/driver). Now we are
@@ -252,8 +249,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
 		ret = usb_unlink_urb(priv->urb);
 		if (ret != -EINPROGRESS)
 			dev_err(&priv->urb->dev->dev,
-				"failed to unlink a urb %p, ret %d\n",
-				priv->urb, ret);
+				"failed to unlink a urb # %lu, ret %d\n",
+				priv->seqnum, ret);
 
 		return 0;
 	}
diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
index 53172b1f6257..f0ec41a50cbc 100644
--- a/drivers/usb/usbip/stub_tx.c
+++ b/drivers/usb/usbip/stub_tx.c
@@ -88,7 +88,7 @@ void stub_complete(struct urb *urb)
 	/* link a urb to the queue of tx. */
 	spin_lock_irqsave(&sdev->priv_lock, flags);
 	if (sdev->ud.tcp_socket == NULL) {
-		usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
+		usbip_dbg_stub_tx("ignore urb for closed connection\n");
 		/* It will be freed in stub_device_cleanup_urbs(). */
 	} else if (priv->unlinking) {
 		stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
@@ -190,8 +190,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
 
 		/* 1. setup usbip_header */
 		setup_ret_submit_pdu(&pdu_header, urb);
-		usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
-				  pdu_header.base.seqnum, urb);
+		usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
+				  pdu_header.base.seqnum);
 		usbip_header_correct_endian(&pdu_header, 1);
 
 		iov[iovnum].iov_base = &pdu_header;

From 90120d15f4c397272aaf41077960a157fc4212bf Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 15 Dec 2017 10:50:09 -0700
Subject: [PATCH 260/808] usbip: prevent leaking socket pointer address in
 messages

usbip driver is leaking socket pointer address in messages. Remove
the messages that aren't useful and print sockfd in the ones that
are useful for debugging.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_dev.c     |  3 +--
 drivers/usb/usbip/usbip_common.c | 16 +++++-----------
 drivers/usb/usbip/vhci_hcd.c     |  2 +-
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index a3df8ee82faf..e31a6f204397 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -149,8 +149,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
 	 * step 1?
 	 */
 	if (ud->tcp_socket) {
-		dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
-			ud->tcp_socket);
+		dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
 		kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
 	}
 
diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
index f7978933b402..7b219d9109b4 100644
--- a/drivers/usb/usbip/usbip_common.c
+++ b/drivers/usb/usbip/usbip_common.c
@@ -317,26 +317,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
 	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
 	int total = 0;
 
+	if (!sock || !buf || !size)
+		return -EINVAL;
+
 	iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
 
 	usbip_dbg_xmit("enter\n");
 
-	if (!sock || !buf || !size) {
-		pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
-		       size);
-		return -EINVAL;
-	}
-
 	do {
-		int sz = msg_data_left(&msg);
+		msg_data_left(&msg);
 		sock->sk->sk_allocation = GFP_NOIO;
 
 		result = sock_recvmsg(sock, &msg, MSG_WAITALL);
-		if (result <= 0) {
-			pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
-				 sock, buf + total, sz, result, total);
+		if (result <= 0)
 			goto err;
-		}
 
 		total += result;
 	} while (msg_data_left(&msg));
diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 9efab3dc3734..c3e1008aa491 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -965,7 +965,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
 
 	/* need this? see stub_dev.c */
 	if (ud->tcp_socket) {
-		pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
+		pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
 		kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
 	}
 

From 10c90120930628e8b959bf58d4a0aaef3ae5d945 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 15 Dec 2017 10:05:15 -0700
Subject: [PATCH 261/808] usbip: stub_rx: fix static checker warning on
 unnecessary checks

Fix the following static checker warnings:

The patch c6688ef9f297: "usbip: fix stub_rx: harden CMD_SUBMIT path
to handle malicious input" from Dec 7, 2017, leads to the following
static checker warning:

    drivers/usb/usbip/stub_rx.c:346 get_pipe()
    warn: impossible condition
'(pdu->u.cmd_submit.transfer_buffer_length > ((~0 >> 1))) =>
(s32min-s32max > s32max)'
    drivers/usb/usbip/stub_rx.c:486 stub_recv_cmd_submit()
    warn: always true condition
'(pdu->u.cmd_submit.transfer_buffer_length <= ((~0 >> 1))) =>
(s32min-s32max <= s32max)'

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/stub_rx.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 2f29be474098..6c5a59313999 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -339,14 +339,6 @@ static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu)
 
 	epd = &ep->desc;
 
-	/* validate transfer_buffer_length */
-	if (pdu->u.cmd_submit.transfer_buffer_length > INT_MAX) {
-		dev_err(&sdev->udev->dev,
-			"CMD_SUBMIT: -EMSGSIZE transfer_buffer_length %d\n",
-			pdu->u.cmd_submit.transfer_buffer_length);
-		return -1;
-	}
-
 	if (usb_endpoint_xfer_control(epd)) {
 		if (dir == USBIP_DIR_OUT)
 			return usb_sndctrlpipe(udev, epnum);
@@ -479,8 +471,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev,
 	}
 
 	/* allocate urb transfer buffer, if needed */
-	if (pdu->u.cmd_submit.transfer_buffer_length > 0 &&
-	    pdu->u.cmd_submit.transfer_buffer_length <= INT_MAX) {
+	if (pdu->u.cmd_submit.transfer_buffer_length > 0) {
 		priv->urb->transfer_buffer =
 			kzalloc(pdu->u.cmd_submit.transfer_buffer_length,
 				GFP_KERNEL);

From 544c4605acc5ae4afe7dd5914147947db182f2fb Mon Sep 17 00:00:00 2001
From: Juan Zea <juan.zea@qindel.com>
Date: Fri, 15 Dec 2017 10:21:20 +0100
Subject: [PATCH 262/808] usbip: fix usbip bind writing random string after
 command in match_busid

usbip bind writes commands followed by random string when writing to
match_busid attribute in sysfs, caused by using full variable size
instead of string length.

Signed-off-by: Juan Zea <juan.zea@qindel.com>
Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 tools/usb/usbip/src/utils.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
index 2b3d6d235015..3d7b42e77299 100644
--- a/tools/usb/usbip/src/utils.c
+++ b/tools/usb/usbip/src/utils.c
@@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
 	char command[SYSFS_BUS_ID_SIZE + 4];
 	char match_busid_attr_path[SYSFS_PATH_MAX];
 	int rc;
+	int cmd_size;
 
 	snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
 		 "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
@@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
 		 attr_name);
 
 	if (add)
-		snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
+		cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
+				    busid);
 	else
-		snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
+		cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
+				    busid);
 
 	rc = write_sysfs_attribute(match_busid_attr_path, command,
-				   sizeof(command));
+				   cmd_size);
 	if (rc < 0) {
 		dbg("failed to write match_busid: %s", strerror(errno));
 		return -1;

From b9096d9f15c142574ebebe8fbb137012bb9d99c2 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Tue, 12 Dec 2017 16:11:30 +0100
Subject: [PATCH 263/808] usb: add RESET_RESUME for ELSA MicroLink 56K

This modem needs this quirk to operate. It produces timeouts when
resumed without reset.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
CC: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index a10b346b9777..95812656d9b9 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -149,6 +149,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
 	{ USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
 
+	/* ELSA MicroLink 56K */
+	{ USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
+
 	/* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
 	{ USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
 

From 7f038d256c723dd390d2fca942919573995f4cfd Mon Sep 17 00:00:00 2001
From: Dmitry Fleytman Dmitry Fleytman <dmitry.fleytman@gmail.com>
Date: Tue, 19 Dec 2017 06:02:04 +0200
Subject: [PATCH 264/808] usb: Add device quirk for Logitech HD Pro Webcam
 C925e

Commit e0429362ab15
("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e")
introduced quirk to workaround an issue with some Logitech webcams.

There is one more model that has the same issue - C925e, so applying
the same quirk as well.

See aforementioned commit message for detailed explanation of the problem.

Signed-off-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/quirks.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 95812656d9b9..4024926c1d68 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -52,10 +52,11 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Microsoft LifeCam-VX700 v2.0 */
 	{ USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
 
-	/* Logitech HD Pro Webcams C920, C920-C and C930e */
+	/* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
 	{ USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
 	{ USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
 	{ USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
+	{ USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
 
 	/* Logitech ConferenceCam CC3000e */
 	{ USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },

From a93639090a2743c8e205c1ac25439702702b4ce4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 14 Dec 2017 15:43:43 +1100
Subject: [PATCH 265/808] staging: lustre: lnet: Fix recent breakage from
 list_for_each conversion

Commit 8e55b6fd0660 ("staging: lustre: lnet: replace list_for_each
with list_for_each_entry") was intended to be an idempotent change,
but actually broke the behavior of ksocknal_add_peer() causing mounts to fail.
The fact that it caused an existing "route2 = NULL;" to become
redundant could have been a clue.  The fact that the loop body
set the new loop variable to NULL might also have been a clue

The original code relied on "route2" being NULL if nothing was found.
The new code would always set route2 to a non-NULL value if the list
was empty, and would likely crash if the list was not empty.

Restore correct functionality by using code-flow rather the value of
"route2" to determine whether to use on old route, or to add a new one.

Fixes: 8e55b6fd0660 ("staging: lustre: lnet: replace list_for_each with list_for_each_entry")
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../lustre/lnet/klnds/socklnd/socklnd.c       | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index 986c2a40d978..8267119ccc8e 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -487,21 +487,18 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr,
 			      ksocknal_nid2peerlist(id.nid));
 	}
 
-	route2 = NULL;
 	list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) {
-		if (route2->ksnr_ipaddr == ipaddr)
-			break;
-
-		route2 = NULL;
+		if (route2->ksnr_ipaddr == ipaddr) {
+			/* Route already exists, use the old one */
+			ksocknal_route_decref(route);
+			route2->ksnr_share_count++;
+			goto out;
+		}
 	}
-	if (!route2) {
-		ksocknal_add_route_locked(peer, route);
-		route->ksnr_share_count++;
-	} else {
-		ksocknal_route_decref(route);
-		route2->ksnr_share_count++;
-	}
-
+	/* Route doesn't already exist, add the new one */
+	ksocknal_add_route_locked(peer, route);
+	route->ksnr_share_count++;
+out:
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
 	return 0;

From d070f7c703ef26e3db613f24206823f916272fc6 Mon Sep 17 00:00:00 2001
From: Abhijeet Kumar <abhijeet.kumar@intel.com>
Date: Tue, 12 Dec 2017 00:40:25 +0530
Subject: [PATCH 266/808] ASoC: nau8825: fix issue that pop noise when start
 capture

In skylake platform, we hear a loud pop noise(0 dB) at start of
audio capture power up sequence. This patch removes the pop noise
from the recording by adding a delay before enabling ADC.

Signed-off-by: Abhijeet Kumar <abhijeet.kumar@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/nau8825.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/soc/codecs/nau8825.c b/sound/soc/codecs/nau8825.c
index 714ce17da717..e853a6dfd33b 100644
--- a/sound/soc/codecs/nau8825.c
+++ b/sound/soc/codecs/nau8825.c
@@ -905,6 +905,7 @@ static int nau8825_adc_event(struct snd_soc_dapm_widget *w,
 
 	switch (event) {
 	case SND_SOC_DAPM_POST_PMU:
+		msleep(125);
 		regmap_update_bits(nau8825->regmap, NAU8825_REG_ENA_CTRL,
 			NAU8825_ENABLE_ADC, NAU8825_ENABLE_ADC);
 		break;

From 20220945b1a8e77c789dd4bb9aa1471b6e8695cc Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Fri, 15 Dec 2017 20:07:23 -0800
Subject: [PATCH 267/808] ASoC: rt5514-spi: only enable wakeup when fully
 initialized

If an rt5514-spi device is probed but the platform hasn't linked it in,
we might never fully request the SPI IRQ, nor configure the rt5514 DSP,
but we still might try to enable the SPI IRQ (enable_irq_wake()). This
is bad, and among other things, can cause the interrupt to trigger every
time we try to suspend the system (e.g., because the interrupt trigger
setting was never set properly).

Instead of setting our wakeup capabilities in the SPI driver probe
routine, let's wait until we've actually requested the IRQ.

Fixes issues seen on the "kevin" Chromebook (Samsung Chromebook Plus).

Fixes: 58f1c07d23cd ("ASoC: rt5514: Voice wakeup support.")
Signed-off-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/rt5514-spi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/soc/codecs/rt5514-spi.c b/sound/soc/codecs/rt5514-spi.c
index ca6a90d8fc39..64bf26cec20d 100644
--- a/sound/soc/codecs/rt5514-spi.c
+++ b/sound/soc/codecs/rt5514-spi.c
@@ -289,6 +289,8 @@ static int rt5514_spi_pcm_probe(struct snd_soc_platform *platform)
 			dev_err(&rt5514_spi->dev,
 				"%s Failed to reguest IRQ: %d\n", __func__,
 				ret);
+		else
+			device_init_wakeup(rt5514_dsp->dev, true);
 	}
 
 	return 0;
@@ -456,8 +458,6 @@ static int rt5514_spi_probe(struct spi_device *spi)
 		return ret;
 	}
 
-	device_init_wakeup(&spi->dev, true);
-
 	return 0;
 }
 

From e0795606ad565cc2da0b926a00c7e6b8187a6d71 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 13 Dec 2017 18:28:58 +0000
Subject: [PATCH 268/808] drm/i915/lpe: Remove double-encapsulation of info
 string

Just printk the string, or at least do not double up on the newlines!

Fixes: eef57324d926 ("drm/i915: setup bridge for HDMI LPE audio driver")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Cc: Jerome Anand <jerome.anand@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171213182858.2159-1-chris@chris-wilson.co.uk
(cherry picked from commit 99cd05c43baac8ef56c20eb1776a15b02c81ccc3)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_lpe_audio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_lpe_audio.c b/drivers/gpu/drm/i915/intel_lpe_audio.c
index 3bf65288ffff..5809b29044fc 100644
--- a/drivers/gpu/drm/i915/intel_lpe_audio.c
+++ b/drivers/gpu/drm/i915/intel_lpe_audio.c
@@ -193,7 +193,7 @@ static bool lpe_audio_detect(struct drm_i915_private *dev_priv)
 		};
 
 		if (!pci_dev_present(atom_hdaudio_ids)) {
-			DRM_INFO("%s\n", "HDaudio controller not detected, using LPE audio instead\n");
+			DRM_INFO("HDaudio controller not detected, using LPE audio instead\n");
 			lpe_present = true;
 		}
 	}

From a4ffdc2b6726958c07d535318400124e3a3bc19b Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Fri, 15 Dec 2017 14:43:10 -0800
Subject: [PATCH 269/808] drm/i915: Protect DDI port to DPLL map from
 theoretical race.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case we have multiple modesets for different connectors
happening in parallel we could have a race on the RMW on these
shared registers.

This possibility was initially raised by Paulo when reviewing
commit '555e38d27317 ("drm/i915/cnl: DDI - PLL mapping")'
but the original possibility comes from commit '5416d871136d
("drm/i915/skl: Set the eDP link rate on DPLL0")'. Or maybe
later when atomic commits entered into picture.

Apparently the discussion around this topic showed that the
right solution would be on serializing the atomic commits in
a way that we don't have the possibility of races here since
if that parallel modeset happenings apparently many other
things will be on fire.

Code is there since SKL and there was no report of issue,
but since we never looked back to that serialization possibility,
and also we don't have an igt case for that it is better to at
least protect this corner.

Suggested-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Fixes: 555e38d27317 ("drm/i915/cnl: DDI - PLL mapping")
Fixes: 5416d871136d ("drm/i915/skl: Set the eDP link rate on DPLL0")
Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Maarten Lankhorst maarten.lankhorst@linux.intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Maarten Lankhorst maarten.lankhorst@linux.intel.com
Link: https://patchwork.freedesktop.org/patch/msgid/20171215224310.19103-1-rodrigo.vivi@intel.com
(cherry picked from commit 8edcda1266f93816fde77c9754f388ae0ae343fc)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_ddi.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c
index e0843bb99169..58a3755544b2 100644
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -2128,6 +2128,8 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder,
 	if (WARN_ON(!pll))
 		return;
 
+	 mutex_lock(&dev_priv->dpll_lock);
+
 	if (IS_CANNONLAKE(dev_priv)) {
 		/* Configure DPCLKA_CFGCR0 to map the DPLL to the DDI. */
 		val = I915_READ(DPCLKA_CFGCR0);
@@ -2157,6 +2159,8 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder,
 	} else if (INTEL_INFO(dev_priv)->gen < 9) {
 		I915_WRITE(PORT_CLK_SEL(port), hsw_pll_to_ddi_pll_sel(pll));
 	}
+
+	mutex_unlock(&dev_priv->dpll_lock);
 }
 
 static void intel_ddi_clk_disable(struct intel_encoder *encoder)

From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001
From: Prateek Sood <prsood@codeaurora.org>
Date: Tue, 19 Dec 2017 12:56:57 +0530
Subject: [PATCH 270/808] cgroup: Fix deadlock in cpu hotplug path

Deadlock during cgroup migration from cpu hotplug path when a task T is
being moved from source to destination cgroup.

kworker/0:0
cpuset_hotplug_workfn()
   cpuset_hotplug_update_tasks()
      hotplug_update_tasks_legacy()
        remove_tasks_in_empty_cpuset()
          cgroup_transfer_tasks() // stuck in iterator loop
            cgroup_migrate()
              cgroup_migrate_add_task()

In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
Task T will not migrate to destination cgroup. css_task_iter_start()
will keep pointing to task T in loop waiting for task T cg_list node
to be removed.

Task T
do_exit()
  exit_signals() // sets PF_EXITING
  exit_task_namespaces()
    switch_task_namespaces()
      free_nsproxy()
        put_mnt_ns()
          drop_collected_mounts()
            namespace_unlock()
              synchronize_rcu()
                _synchronize_rcu_expedited()
                  schedule_work() // on cpu0 low priority worker pool
                  wait_event() // waiting for work item to execute

Task T inserted a work item in the worklist of cpu0 low priority
worker pool. It is waiting for expedited grace period work item
to execute. This work item will only be executed once kworker/0:0
complete execution of cpuset_hotplug_workfn().

kworker/0:0 ==> Task T ==>kworker/0:0

In case of PF_EXITING task being migrated from source to destination
cgroup, migrate next available task in source cgroup.

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup-v1.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 */
 	do {
 		css_task_iter_start(&from->self, 0, &it);
-		task = css_task_iter_next(&it);
+
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);

From f292b9b28097d8fe870336108e91bd95a14294bf Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 15 Dec 2017 19:59:47 -0800
Subject: [PATCH 271/808] staging: ion: Fix ion_cma_heap allocations

In trying to add support for drm_hwcomposer to HiKey,
I've needed to utilize the ION CMA heap, and I've noticed
problems with allocations on newer kernels failing.

It seems back with 204f672255c2 ("ion: Use CMA APIs directly"),
the ion_cma_heap code was modified to use the CMA API, but
kept the arguments as buffer lengths rather then number of pages.

This results in errors as we don't have enough pages in CMA to
satisfy the exaggerated requests.

This patch converts the ion_cma_heap CMA API usage to properly
request pages.

It also fixes a minor issue in the allocation where in the error
path, the cma_release is called with the buffer->size value which
hasn't yet been set.

Cc: Laura Abbott <labbott@redhat.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Benjamin Gaignard <benjamin.gaignard@linaro.org>
Cc: Archit Taneja <architt@codeaurora.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Amit Pundir <amit.pundir@linaro.org>
Fixes: 204f672255c2 ("staging: android: ion: Use CMA APIs directly")
Acked-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/ion/Kconfig        |  2 +-
 drivers/staging/android/ion/ion_cma_heap.c | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/staging/android/ion/Kconfig b/drivers/staging/android/ion/Kconfig
index a517b2d29f1b..8f6494158d3d 100644
--- a/drivers/staging/android/ion/Kconfig
+++ b/drivers/staging/android/ion/Kconfig
@@ -37,7 +37,7 @@ config ION_CHUNK_HEAP
 
 config ION_CMA_HEAP
 	bool "Ion CMA heap support"
-	depends on ION && CMA
+	depends on ION && DMA_CMA
 	help
 	  Choose this option to enable CMA heaps with Ion. This heap is backed
 	  by the Contiguous Memory Allocator (CMA). If your system has these
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index dd5545d9990a..86196ffd2faf 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -39,9 +39,15 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 	struct ion_cma_heap *cma_heap = to_cma_heap(heap);
 	struct sg_table *table;
 	struct page *pages;
+	unsigned long size = PAGE_ALIGN(len);
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	unsigned long align = get_order(size);
 	int ret;
 
-	pages = cma_alloc(cma_heap->cma, len, 0, GFP_KERNEL);
+	if (align > CONFIG_CMA_ALIGNMENT)
+		align = CONFIG_CMA_ALIGNMENT;
+
+	pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
@@ -53,7 +59,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 	if (ret)
 		goto free_mem;
 
-	sg_set_page(table->sgl, pages, len, 0);
+	sg_set_page(table->sgl, pages, size, 0);
 
 	buffer->priv_virt = pages;
 	buffer->sg_table = table;
@@ -62,7 +68,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
 free_mem:
 	kfree(table);
 err:
-	cma_release(cma_heap->cma, pages, buffer->size);
+	cma_release(cma_heap->cma, pages, nr_pages);
 	return -ENOMEM;
 }
 
@@ -70,9 +76,10 @@ static void ion_cma_free(struct ion_buffer *buffer)
 {
 	struct ion_cma_heap *cma_heap = to_cma_heap(buffer->heap);
 	struct page *pages = buffer->priv_virt;
+	unsigned long nr_pages = PAGE_ALIGN(buffer->size) >> PAGE_SHIFT;
 
 	/* release memory */
-	cma_release(cma_heap->cma, pages, buffer->size);
+	cma_release(cma_heap->cma, pages, nr_pages);
 	/* release sg table */
 	sg_free_table(buffer->sg_table);
 	kfree(buffer->sg_table);

From d6b246bb7a29703f53aa4c050b8b3205d749caee Mon Sep 17 00:00:00 2001
From: Sushmita Susheelendra <ssusheel@codeaurora.org>
Date: Fri, 15 Dec 2017 13:59:13 -0700
Subject: [PATCH 272/808] staging: android: ion: Fix dma direction for
 dma_sync_sg_for_cpu/device

Use the direction argument passed into begin_cpu_access
and end_cpu_access when calling the dma_sync_sg_for_cpu/device.
The actual cache primitive called depends on the direction
passed in.

Signed-off-by: Sushmita Susheelendra <ssusheel@codeaurora.org>
Cc: stable <stable@vger.kernel.org>
Acked-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/ion/ion.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index a7d9b0e98572..f480885e346b 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -346,7 +346,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
 	mutex_lock(&buffer->lock);
 	list_for_each_entry(a, &buffer->attachments, list) {
 		dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
-				    DMA_BIDIRECTIONAL);
+				    direction);
 	}
 	mutex_unlock(&buffer->lock);
 
@@ -368,7 +368,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
 	mutex_lock(&buffer->lock);
 	list_for_each_entry(a, &buffer->attachments, list) {
 		dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
-				       DMA_BIDIRECTIONAL);
+				       direction);
 	}
 	mutex_unlock(&buffer->lock);
 

From 748a240c589824e9121befb1cba5341c319885bc Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Fri, 15 Dec 2017 15:21:50 -0600
Subject: [PATCH 273/808] tg3: Fix rx hang on MTU change with 5717/5719

This fixes a hang issue seen when changing the MTU size from 1500 MTU
to 9000 MTU on both 5717 and 5719 chips. In discussion with Broadcom,
they've indicated that these chipsets have the same phy as the 57766
chipset, so the same workarounds apply. This has been tested by IBM
on both Power 8 and Power 9 systems as well as by Broadcom on x86
hardware and has been confirmed to resolve the hang issue.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index de51c2177d03..d09c5a9c53b5 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14225,7 +14225,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
 	/* Reset PHY, otherwise the read DMA engine will be in a mode that
 	 * breaks all requests to 256 bytes.
 	 */
-	if (tg3_asic_rev(tp) == ASIC_REV_57766)
+	if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
+	    tg3_asic_rev(tp) == ASIC_REV_5717 ||
+	    tg3_asic_rev(tp) == ASIC_REV_5719)
 		reset_phy = true;
 
 	err = tg3_restart_hw(tp, reset_phy);

From 8ba6b30ef700e16f3bc668e6f4f8375da9229e4d Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 17 Dec 2017 17:16:43 +0100
Subject: [PATCH 274/808] mlxsw: spectrum_router: Remove batch neighbour
 deletion causing FW bug

This reverts commit 63dd00fa3e524c27cc0509190084ab147ecc8ae2.

RAUHT DELETE_ALL seems to trigger a bug in FW. That manifests by later
calls to RAUHT ADD of an IPv6 neighbor to fail with "bad parameter"
error code.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Fixes: 63dd00fa3e52 ("mlxsw: spectrum_router: Add batch neighbour deletion")
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 72ef4f8025f0..be657b8533f0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2436,25 +2436,16 @@ static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp)
 	rhashtable_destroy(&mlxsw_sp->router->neigh_ht);
 }
 
-static int mlxsw_sp_neigh_rif_flush(struct mlxsw_sp *mlxsw_sp,
-				    const struct mlxsw_sp_rif *rif)
-{
-	char rauht_pl[MLXSW_REG_RAUHT_LEN];
-
-	mlxsw_reg_rauht_pack(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_DELETE_ALL,
-			     rif->rif_index, rif->addr);
-	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), rauht_pl);
-}
-
 static void mlxsw_sp_neigh_rif_gone_sync(struct mlxsw_sp *mlxsw_sp,
 					 struct mlxsw_sp_rif *rif)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry, *tmp;
 
-	mlxsw_sp_neigh_rif_flush(mlxsw_sp, rif);
 	list_for_each_entry_safe(neigh_entry, tmp, &rif->neigh_list,
-				 rif_list_node)
+				 rif_list_node) {
+		mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, false);
 		mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry);
+	}
 }
 
 enum mlxsw_sp_nexthop_type {

From 2cc42bac1c795f75fcc062b95c6ca7ac1b84d5d8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 18 Dec 2017 09:37:45 -0700
Subject: [PATCH 275/808] x86-64/Xen: eliminate W+X mappings

A few thousand such pages are usually left around due to the re-use of
L1 tables having been provided by the hypervisor (Dom0) or tool stack
(DomU). Set NX in the direct map variant, which needs to be done in L2
due to the dual use of the re-used L1s.

For x86_configure_nx() to actually do what it is supposed to do, call
get_cpu_cap() first. This was broken by commit 4763ed4d45 ("x86, mm:
Clean up and simplify NX enablement") when switching away from the
direct EFER read.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/enlighten_pv.c |  3 +++
 arch/x86/xen/mmu_pv.c       | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 69b9deff7e5c..86f26ea99324 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -87,6 +87,8 @@
 #include "multicalls.h"
 #include "pmu.h"
 
+#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
+
 void *xen_initial_gdt;
 
 static int xen_cpu_up_prepare_pv(unsigned int cpu);
@@ -1249,6 +1251,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
 
 	/* Work out if we support NX */
+	get_cpu_cap(&boot_cpu_data);
 	x86_configure_nx();
 
 	/* Get mfn list */
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 9d9cc3870722..7118f776cd49 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1916,6 +1916,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Graft it onto L4[511][510] */
 	copy_page(level2_kernel_pgt, l2);
 
+	/*
+	 * Zap execute permission from the ident map. Due to the sharing of
+	 * L1 entries we need to do this in the L2.
+	 */
+	if (__supported_pte_mask & _PAGE_NX) {
+		for (i = 0; i < PTRS_PER_PMD; ++i) {
+			if (pmd_none(level2_ident_pgt[i]))
+				continue;
+			level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
+		}
+	}
+
 	/* Copy the initial P->M table mappings if necessary. */
 	i = pgd_index(xen_start_info->mfn_list);
 	if (i && i < pgd_index(__START_KERNEL_map))

From 7352e252b5bf40d59342494a70354a2d436fd0cd Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Mon, 18 Dec 2017 17:00:17 +0800
Subject: [PATCH 276/808] net: mediatek: setup proper state for disabled GMAC
 on the default

The current solution would setup fixed and force link of 1Gbps to the both
GMAC on the default. However, The GMAC should always be put to link down
state when the GMAC is disabled on certain target boards. Otherwise,
the driver possibly receives unexpected data from the floating hardware
connection through the unused GMAC. Although the driver had been added
certain protection in RX path to get rid of such kind of unexpected data
sent to the upper stack.

Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 54adfd967858..fc67e35b253e 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1961,11 +1961,12 @@ static int mtk_hw_init(struct mtk_eth *eth)
 	/* set GE2 TUNE */
 	regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0);
 
-	/* GE1, Force 1000M/FD, FC ON */
-	mtk_w32(eth, MAC_MCR_FIXED_LINK, MTK_MAC_MCR(0));
-
-	/* GE2, Force 1000M/FD, FC ON */
-	mtk_w32(eth, MAC_MCR_FIXED_LINK, MTK_MAC_MCR(1));
+	/* Set linkdown as the default for each GMAC. Its own MCR would be set
+	 * up with the more appropriate value when mtk_phy_link_adjust call is
+	 * being invoked.
+	 */
+	for (i = 0; i < MTK_MAC_COUNT; i++)
+		mtk_w32(eth, 0, MTK_MAC_MCR(i));
 
 	/* Indicates CDM to parse the MTK special tag from CPU
 	 * which also is working out for untag packets.

From e688822d035b494071ecbadcccbd6f3325fb0f59 Mon Sep 17 00:00:00 2001
From: Alexander Kochetkov <al.kochet@gmail.com>
Date: Fri, 15 Dec 2017 20:20:06 +0300
Subject: [PATCH 277/808] net: arc_emac: fix arc_emac_rx() error paths

arc_emac_rx() has some issues found by code review.

In case netdev_alloc_skb_ip_align() or dma_map_single() failure
rx fifo entry will not be returned to EMAC.

In case dma_map_single() failure previously allocated skb became
lost to driver. At the same time address of newly allocated skb
will not be provided to EMAC.

Signed-off-by: Alexander Kochetkov <al.kochet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/arc/emac_main.c | 53 ++++++++++++++++------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c
index 3241af1ce718..5b422be56165 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -210,39 +210,48 @@ static int arc_emac_rx(struct net_device *ndev, int budget)
 			continue;
 		}
 
-		pktlen = info & LEN_MASK;
-		stats->rx_packets++;
-		stats->rx_bytes += pktlen;
-		skb = rx_buff->skb;
-		skb_put(skb, pktlen);
-		skb->dev = ndev;
-		skb->protocol = eth_type_trans(skb, ndev);
-
-		dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr),
-				 dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE);
-
-		/* Prepare the BD for next cycle */
-		rx_buff->skb = netdev_alloc_skb_ip_align(ndev,
-							 EMAC_BUFFER_SIZE);
-		if (unlikely(!rx_buff->skb)) {
+		/* Prepare the BD for next cycle. netif_receive_skb()
+		 * only if new skb was allocated and mapped to avoid holes
+		 * in the RX fifo.
+		 */
+		skb = netdev_alloc_skb_ip_align(ndev, EMAC_BUFFER_SIZE);
+		if (unlikely(!skb)) {
+			if (net_ratelimit())
+				netdev_err(ndev, "cannot allocate skb\n");
+			/* Return ownership to EMAC */
+			rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
 			stats->rx_errors++;
-			/* Because receive_skb is below, increment rx_dropped */
 			stats->rx_dropped++;
 			continue;
 		}
 
-		/* receive_skb only if new skb was allocated to avoid holes */
-		netif_receive_skb(skb);
-
-		addr = dma_map_single(&ndev->dev, (void *)rx_buff->skb->data,
+		addr = dma_map_single(&ndev->dev, (void *)skb->data,
 				      EMAC_BUFFER_SIZE, DMA_FROM_DEVICE);
 		if (dma_mapping_error(&ndev->dev, addr)) {
 			if (net_ratelimit())
-				netdev_err(ndev, "cannot dma map\n");
-			dev_kfree_skb(rx_buff->skb);
+				netdev_err(ndev, "cannot map dma buffer\n");
+			dev_kfree_skb(skb);
+			/* Return ownership to EMAC */
+			rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
 			stats->rx_errors++;
+			stats->rx_dropped++;
 			continue;
 		}
+
+		/* unmap previosly mapped skb */
+		dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr),
+				 dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE);
+
+		pktlen = info & LEN_MASK;
+		stats->rx_packets++;
+		stats->rx_bytes += pktlen;
+		skb_put(rx_buff->skb, pktlen);
+		rx_buff->skb->dev = ndev;
+		rx_buff->skb->protocol = eth_type_trans(rx_buff->skb, ndev);
+
+		netif_receive_skb(rx_buff->skb);
+
+		rx_buff->skb = skb;
 		dma_unmap_addr_set(rx_buff, addr, addr);
 		dma_unmap_len_set(rx_buff, len, EMAC_BUFFER_SIZE);
 

From 78aa09754d69ba19a55c59f490788ec1c85f41f0 Mon Sep 17 00:00:00 2001
From: Alexander Kochetkov <al.kochet@gmail.com>
Date: Tue, 19 Dec 2017 14:03:57 +0300
Subject: [PATCH 278/808] net: arc_emac: restart stalled EMAC

Under certain conditions EMAC stop reception of incoming packets and
continuously increment R_MISS register instead of saving data into
provided buffer. The commit implement workaround for such situation.
Then the stall detected EMAC will be restarted.

On device the stall looks like the device lost it's dynamic IP address.
ifconfig shows that interface error counter rapidly increments.
At the same time on the DHCP server we can see continues DHCP-requests
from device.

In real network stalls happen really rarely. To make them frequent the
broadcast storm[1] should be simulated. For simulation it is necessary
to make following connections:
    1. connect radxarock to 1st port of switch
    2. connect some PC to 2nd port of switch
    3. connect two other free ports together using standard ethernet cable,
       in order to make a switching loop.

After that, is necessary to make a broadcast storm. For example, running on
PC 'ping' to some IP address triggers ARP-request storm. After some
time (~10sec), EMAC on rk3188 will stall.

Observed and tested on rk3188 radxarock.

[1] https://en.wikipedia.org/wiki/Broadcast_radiation

Signed-off-by: Alexander Kochetkov <al.kochet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/arc/emac.h      |   2 +
 drivers/net/ethernet/arc/emac_main.c | 111 +++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/drivers/net/ethernet/arc/emac.h b/drivers/net/ethernet/arc/emac.h
index 3c63b16d485f..d9efbc8d783b 100644
--- a/drivers/net/ethernet/arc/emac.h
+++ b/drivers/net/ethernet/arc/emac.h
@@ -159,6 +159,8 @@ struct arc_emac_priv {
 	unsigned int link;
 	unsigned int duplex;
 	unsigned int speed;
+
+	unsigned int rx_missed_errors;
 };
 
 /**
diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c
index 5b422be56165..bd277b0dc615 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -26,6 +26,8 @@
 
 #include "emac.h"
 
+static void arc_emac_restart(struct net_device *ndev);
+
 /**
  * arc_emac_tx_avail - Return the number of available slots in the tx ring.
  * @priv: Pointer to ARC EMAC private data structure.
@@ -267,6 +269,53 @@ static int arc_emac_rx(struct net_device *ndev, int budget)
 	return work_done;
 }
 
+/**
+ * arc_emac_rx_miss_handle - handle R_MISS register
+ * @ndev:	Pointer to the net_device structure.
+ */
+static void arc_emac_rx_miss_handle(struct net_device *ndev)
+{
+	struct arc_emac_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	unsigned int miss;
+
+	miss = arc_reg_get(priv, R_MISS);
+	if (miss) {
+		stats->rx_errors += miss;
+		stats->rx_missed_errors += miss;
+		priv->rx_missed_errors += miss;
+	}
+}
+
+/**
+ * arc_emac_rx_stall_check - check RX stall
+ * @ndev:	Pointer to the net_device structure.
+ * @budget:	How many BDs requested to process on 1 call.
+ * @work_done:	How many BDs processed
+ *
+ * Under certain conditions EMAC stop reception of incoming packets and
+ * continuously increment R_MISS register instead of saving data into
+ * provided buffer. This function detect that condition and restart
+ * EMAC.
+ */
+static void arc_emac_rx_stall_check(struct net_device *ndev,
+				    int budget, unsigned int work_done)
+{
+	struct arc_emac_priv *priv = netdev_priv(ndev);
+	struct arc_emac_bd *rxbd;
+
+	if (work_done)
+		priv->rx_missed_errors = 0;
+
+	if (priv->rx_missed_errors && budget) {
+		rxbd = &priv->rxbd[priv->last_rx_bd];
+		if (le32_to_cpu(rxbd->info) & FOR_EMAC) {
+			arc_emac_restart(ndev);
+			priv->rx_missed_errors = 0;
+		}
+	}
+}
+
 /**
  * arc_emac_poll - NAPI poll handler.
  * @napi:	Pointer to napi_struct structure.
@@ -281,6 +330,7 @@ static int arc_emac_poll(struct napi_struct *napi, int budget)
 	unsigned int work_done;
 
 	arc_emac_tx_clean(ndev);
+	arc_emac_rx_miss_handle(ndev);
 
 	work_done = arc_emac_rx(ndev, budget);
 	if (work_done < budget) {
@@ -288,6 +338,8 @@ static int arc_emac_poll(struct napi_struct *napi, int budget)
 		arc_reg_or(priv, R_ENABLE, RXINT_MASK | TXINT_MASK);
 	}
 
+	arc_emac_rx_stall_check(ndev, budget, work_done);
+
 	return work_done;
 }
 
@@ -329,6 +381,8 @@ static irqreturn_t arc_emac_intr(int irq, void *dev_instance)
 		if (status & MSER_MASK) {
 			stats->rx_missed_errors += 0x100;
 			stats->rx_errors += 0x100;
+			priv->rx_missed_errors += 0x100;
+			napi_schedule(&priv->napi);
 		}
 
 		if (status & RXCR_MASK) {
@@ -741,6 +795,63 @@ static int arc_emac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 }
 
 
+/**
+ * arc_emac_restart - Restart EMAC
+ * @ndev:	Pointer to net_device structure.
+ *
+ * This function do hardware reset of EMAC in order to restore
+ * network packets reception.
+ */
+static void arc_emac_restart(struct net_device *ndev)
+{
+	struct arc_emac_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	int i;
+
+	if (net_ratelimit())
+		netdev_warn(ndev, "restarting stalled EMAC\n");
+
+	netif_stop_queue(ndev);
+
+	/* Disable interrupts */
+	arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
+
+	/* Disable EMAC */
+	arc_reg_clr(priv, R_CTRL, EN_MASK);
+
+	/* Return the sk_buff to system */
+	arc_free_tx_queue(ndev);
+
+	/* Clean Tx BD's */
+	priv->txbd_curr = 0;
+	priv->txbd_dirty = 0;
+	memset(priv->txbd, 0, TX_RING_SZ);
+
+	for (i = 0; i < RX_BD_NUM; i++) {
+		struct arc_emac_bd *rxbd = &priv->rxbd[i];
+		unsigned int info = le32_to_cpu(rxbd->info);
+
+		if (!(info & FOR_EMAC)) {
+			stats->rx_errors++;
+			stats->rx_dropped++;
+		}
+		/* Return ownership to EMAC */
+		rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
+	}
+	priv->last_rx_bd = 0;
+
+	/* Make sure info is visible to EMAC before enable */
+	wmb();
+
+	/* Enable interrupts */
+	arc_reg_set(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
+
+	/* Enable EMAC */
+	arc_reg_or(priv, R_CTRL, EN_MASK);
+
+	netif_start_queue(ndev);
+}
+
 static const struct net_device_ops arc_emac_netdev_ops = {
 	.ndo_open		= arc_emac_open,
 	.ndo_stop		= arc_emac_stop,

From a93bf0ff449064e6b7f44e58522e940f88c0d966 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:20:56 +0800
Subject: [PATCH 279/808] vxlan: update skb dst pmtu on tx path

Unlike ip tunnels, now vxlan doesn't do any pmtu update for
upper dst pmtu, even if it doesn't match the lower dst pmtu
any more.

The problem can be reproduced when reducing the vxlan lower
dev's pmtu when running netperf. In jianlin's testing, the
performance went to 1/7 of the previous.

This patch is to update the upper dst pmtu to match the lower
dst pmtu on tx path so that packets can be sent out even when
lower dev's pmtu has been changed.

It also works for metadata dst.

Note that this patch doesn't process any pmtu icmp packet.
But even in the future, the support for pmtu icmp packets
process of udp tunnels will also needs this.

The same thing will be done for geneve in another patch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1000b0e4ee01..31f4b7911ef8 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2155,6 +2155,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ndst = &rt->dst;
+		if (skb_dst(skb)) {
+			int mtu = dst_mtu(ndst) - VXLAN_HEADROOM;
+
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL,
+						       skb, mtu);
+		}
+
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
@@ -2190,6 +2197,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				goto out_unlock;
 		}
 
+		if (skb_dst(skb)) {
+			int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM;
+
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL,
+						       skb, mtu);
+		}
+
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
 		skb_scrub_packet(skb, xnet);

From cfddd4c33c254954927942599d299b3865743146 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:24:35 +0800
Subject: [PATCH 280/808] ip_gre: remove the incorrect mtu limit for ipgre tap

ipgre tap driver calls ether_setup(), after commit 61e84623ace3
("net: centralize net_device min/max MTU checking"), the range
of mtu is [min_mtu, max_mtu], which is [68, 1500] by default.

It causes the dev mtu of the ipgre tap device to not be greater
than 1500, this limit value is not correct for ipgre tap device.

Besides, it's .change_mtu already does the right check. So this
patch is just to set max_mtu as 0, and leave the check to it's
.change_mtu.

Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c1735632c8c..45ffd3d045d2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1310,6 +1310,7 @@ static const struct net_device_ops erspan_netdev_ops = {
 static void ipgre_tap_setup(struct net_device *dev)
 {
 	ether_setup(dev);
+	dev->max_mtu = 0;
 	dev->netdev_ops	= &gre_tap_netdev_ops;
 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;

From 2c52129a7d74d017320804c6928de770815c5f4a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:25:09 +0800
Subject: [PATCH 281/808] ip6_gre: remove the incorrect mtu limit for ipgre tap

The same fix as the patch "ip_gre: remove the incorrect mtu limit for
ipgre tap" is also needed for ip6_gre.

Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4cfd8e0696fe..416c8913f132 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1308,6 +1308,7 @@ static void ip6gre_tap_setup(struct net_device *dev)
 
 	ether_setup(dev);
 
+	dev->max_mtu = 0;
 	dev->netdev_ops = &ip6gre_tap_netdev_ops;
 	dev->needs_free_netdev = true;
 	dev->priv_destructor = ip6gre_dev_free;

From c9fefa08190fc879fb2e681035d7774e0a8c5170 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 18 Dec 2017 14:26:21 +0800
Subject: [PATCH 282/808] ip6_tunnel: get the min mtu properly in ip6_tnl_xmit

Now it's using IPV6_MIN_MTU as the min mtu in ip6_tnl_xmit, but
IPV6_MIN_MTU actually only works when the inner packet is ipv6.

With IPV6_MIN_MTU for ipv4 packets, the new pmtu for inner dst
couldn't be set less than 1280. It would cause tx_err and the
packet to be dropped when the outer dst pmtu is close to 1280.

Jianlin found it by running ipv4 traffic with the topo:

  (client) gre6 <---> eth1 (route) eth2 <---> gre6 (server)

After changing eth2 mtu to 1300, the performance became very
low, or the connection was even broken. The issue also affects
ip4ip6 and ip6ip6 tunnels.

So if the inner packet is ipv4, 576 should be considered as the
min mtu.

Note that for ip4ip6 and ip6ip6 tunnels, the inner packet can
only be ipv4 or ipv6, but for gre6 tunnel, it may also be ARP.
This patch using 576 as the min mtu for non-ipv6 packet works
for all those cases.

Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index db84f523656d..931c38f6ff4a 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1123,8 +1123,13 @@ route_lookup:
 		max_headroom += 8;
 		mtu -= 8;
 	}
-	if (mtu < IPV6_MIN_MTU)
-		mtu = IPV6_MIN_MTU;
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (mtu < IPV6_MIN_MTU)
+			mtu = IPV6_MIN_MTU;
+	} else if (mtu < 576) {
+		mtu = 576;
+	}
+
 	if (skb_dst(skb) && !t->parms.collect_md)
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 	if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {

From 3db096011722fd8717e57687ae94b6917a11c9cc Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 18 Dec 2017 20:03:05 +0100
Subject: [PATCH 283/808] tipc: fix list sorting bug in function
 tipc_group_update_member()

When, during a join operation, or during message transmission, a group
member needs to be added to the group's 'congested' list, we sort it
into the list in ascending order, according to its current advertised
window size. However, we miss the case when the member is already on
that list. This will have the result that the member, after the window
size has been decremented, might be at the wrong position in that list.
This again may have the effect that we during broadcast and multicast
transmissions miss the fact that a destination is not yet ready for
reception, and we end up sending anyway. From this point on, the
behavior during the remaining session is unpredictable, e.g., with
underflowing window sizes.

We now correct this bug by unconditionally removing the member from
the list before (re-)sorting it in.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index b96ec429bb9b..bbc004eaa31a 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -351,8 +351,7 @@ void tipc_group_update_member(struct tipc_member *m, int len)
 	if (m->window >= ADV_IDLE)
 		return;
 
-	if (!list_empty(&m->congested))
-		return;
+	list_del_init(&m->congested);
 
 	/* Sort member into congested members' list */
 	list_for_each_entry_safe(_m, tmp, &grp->congested, congested) {

From 200922c93f008e03ddc804c6dacdf26ca1ba86d7 Mon Sep 17 00:00:00 2001
From: Fredrik Hallenberg <megahallon@gmail.com>
Date: Mon, 18 Dec 2017 23:33:59 +0100
Subject: [PATCH 284/808] net: stmmac: Fix TX timestamp calculation

When using GMAC4 the value written in PTP_SSIR should be shifted however
the shifted value is also used in subsequent calculations which results
in a bad timestamp value.

Signed-off-by: Fredrik Hallenberg <megahallon@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index 721b61655261..08c19ebd5306 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -34,6 +34,7 @@ static u32 stmmac_config_sub_second_increment(void __iomem *ioaddr,
 {
 	u32 value = readl(ioaddr + PTP_TCR);
 	unsigned long data;
+	u32 reg_value;
 
 	/* For GMAC3.x, 4.x versions, convert the ptp_clock to nano second
 	 *	formula = (1/ptp_clock) * 1000000000
@@ -50,10 +51,11 @@ static u32 stmmac_config_sub_second_increment(void __iomem *ioaddr,
 
 	data &= PTP_SSIR_SSINC_MASK;
 
+	reg_value = data;
 	if (gmac4)
-		data = data << GMAC4_PTP_SSIR_SSINC_SHIFT;
+		reg_value <<= GMAC4_PTP_SSIR_SSINC_SHIFT;
 
-	writel(data, ioaddr + PTP_SSIR);
+	writel(reg_value, ioaddr + PTP_SSIR);
 
 	return data;
 }

From a1762456993893795030d911106a7650481db0ef Mon Sep 17 00:00:00 2001
From: Fredrik Hallenberg <megahallon@gmail.com>
Date: Mon, 18 Dec 2017 23:34:00 +0100
Subject: [PATCH 285/808] net: stmmac: Fix bad RX timestamp extraction

As noted in dwmac4_wrback_get_rx_timestamp_status the timestamp is found
in the context descriptor following the current descriptor. However the
current code looks for the context descriptor in the current
descriptor, which will always fail.

Signed-off-by: Fredrik Hallenberg <megahallon@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 5 +++--
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    | 2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index e1e5ac053760..ce2ea2d491ac 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -409,7 +409,7 @@ struct stmmac_desc_ops {
 	/* get timestamp value */
 	 u64(*get_timestamp) (void *desc, u32 ats);
 	/* get rx timestamp status */
-	int (*get_rx_timestamp_status) (void *desc, u32 ats);
+	int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats);
 	/* Display ring */
 	void (*display_ring)(void *head, unsigned int size, bool rx);
 	/* set MSS via context descriptor */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 4b286e27c4ca..7e089bf906b4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -258,7 +258,8 @@ static int dwmac4_rx_check_timestamp(void *desc)
 	return ret;
 }
 
-static int dwmac4_wrback_get_rx_timestamp_status(void *desc, u32 ats)
+static int dwmac4_wrback_get_rx_timestamp_status(void *desc, void *next_desc,
+						 u32 ats)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 	int ret = -EINVAL;
@@ -270,7 +271,7 @@ static int dwmac4_wrback_get_rx_timestamp_status(void *desc, u32 ats)
 
 			/* Check if timestamp is OK from context descriptor */
 			do {
-				ret = dwmac4_rx_check_timestamp(desc);
+				ret = dwmac4_rx_check_timestamp(next_desc);
 				if (ret < 0)
 					goto exit;
 				i++;
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 7546b3664113..2a828a312814 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -400,7 +400,8 @@ static u64 enh_desc_get_timestamp(void *desc, u32 ats)
 	return ns;
 }
 
-static int enh_desc_get_rx_timestamp_status(void *desc, u32 ats)
+static int enh_desc_get_rx_timestamp_status(void *desc, void *next_desc,
+					    u32 ats)
 {
 	if (ats) {
 		struct dma_extended_desc *p = (struct dma_extended_desc *)desc;
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index f817f8f36569..db4cee57bb24 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -265,7 +265,7 @@ static u64 ndesc_get_timestamp(void *desc, u32 ats)
 	return ns;
 }
 
-static int ndesc_get_rx_timestamp_status(void *desc, u32 ats)
+static int ndesc_get_rx_timestamp_status(void *desc, void *next_desc, u32 ats)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d7250539d0bd..337d53d12e94 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -482,7 +482,7 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 		desc = np;
 
 	/* Check if timestamp is available */
-	if (priv->hw->desc->get_rx_timestamp_status(desc, priv->adv_ts)) {
+	if (priv->hw->desc->get_rx_timestamp_status(p, np, priv->adv_ts)) {
 		ns = priv->hw->desc->get_timestamp(desc, priv->adv_ts);
 		netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
 		shhwtstamp = skb_hwtstamps(skb);

From d03a45572efa068fa64db211d6d45222660e76c5 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 19 Dec 2017 15:17:13 +0100
Subject: [PATCH 286/808] ipv4: fib: Fix metrics match when deleting a route

The recently added fib_metrics_match() causes a regression for routes
with both RTAX_FEATURES and RTAX_CC_ALGO if the latter has
TCP_CONG_NEEDS_ECN flag set:

| # ip link add d0 type dummy
| # ip link set d0 up
| # ip route add 172.29.29.0/24 dev d0 features ecn congctl dctcp
| # ip route del 172.29.29.0/24 dev d0 features ecn congctl dctcp
| RTNETLINK answers: No such process

During route insertion, fib_convert_metrics() detects that the given CC
algo requires ECN and hence sets DST_FEATURE_ECN_CA bit in
RTAX_FEATURES.

During route deletion though, fib_metrics_match() compares stored
RTAX_FEATURES value with that from userspace (which obviously has no
knowledge about DST_FEATURE_ECN_CA) and fails.

Fixes: 5f9ae3d9e7e4a ("ipv4: do metrics match when looking up and deleting a route")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f04d944f8abe..c586597da20d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -698,7 +698,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
 
 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
 		int type = nla_type(nla);
-		u32 val;
+		u32 fi_val, val;
 
 		if (!type)
 			continue;
@@ -715,7 +715,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
 			val = nla_get_u32(nla);
 		}
 
-		if (fi->fib_metrics->metrics[type - 1] != val)
+		fi_val = fi->fib_metrics->metrics[type - 1];
+		if (type == RTAX_FEATURES)
+			fi_val &= ~DST_FEATURE_ECN_CA;
+
+		if (fi_val != val)
 			return false;
 	}
 

From 61d2f2a05765a5f57149efbd93e3e81a83cbc2c1 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 18 Dec 2017 11:57:51 +0800
Subject: [PATCH 287/808] clk: sunxi: sun9i-mmc: Implement reset callback for
 reset controls

Our MMC host driver now issues a reset, instead of just deasserting
the reset control, since commit c34eda69ad4c ("mmc: sunxi: Reset the
device at probe time"). The sun9i-mmc clock driver does not support
this, and will fail, which results in MMC not probing.

This patch implements the reset callback by asserting the reset control,
then deasserting it after a small delay.

Fixes: 7a6fca879f59 ("clk: sunxi: Add driver for A80 MMC config clocks/resets")
Cc: <stable@vger.kernel.org> # 4.14.x
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Philipp Zabel <p.zabel@pengutronix.de>
Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Link: lkml.kernel.org/r/20171218035751.20661-1-wens@csie.org
---
 drivers/clk/sunxi/clk-sun9i-mmc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/clk/sunxi/clk-sun9i-mmc.c b/drivers/clk/sunxi/clk-sun9i-mmc.c
index a1a634253d6f..f00d8758ba24 100644
--- a/drivers/clk/sunxi/clk-sun9i-mmc.c
+++ b/drivers/clk/sunxi/clk-sun9i-mmc.c
@@ -16,6 +16,7 @@
 
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -83,9 +84,20 @@ static int sun9i_mmc_reset_deassert(struct reset_controller_dev *rcdev,
 	return 0;
 }
 
+static int sun9i_mmc_reset_reset(struct reset_controller_dev *rcdev,
+				 unsigned long id)
+{
+	sun9i_mmc_reset_assert(rcdev, id);
+	udelay(10);
+	sun9i_mmc_reset_deassert(rcdev, id);
+
+	return 0;
+}
+
 static const struct reset_control_ops sun9i_mmc_reset_ops = {
 	.assert		= sun9i_mmc_reset_assert,
 	.deassert	= sun9i_mmc_reset_deassert,
+	.reset		= sun9i_mmc_reset_reset,
 };
 
 static int sun9i_a80_mmc_config_clk_probe(struct platform_device *pdev)

From bae115a2bb479142605726e6aa130f43f50e801a Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalh@mellanox.com>
Date: Sun, 29 Oct 2017 04:03:37 +0200
Subject: [PATCH 288/808] net/mlx5: FPGA, return -EINVAL if size is zero

Currently, if a size of zero is passed to
mlx5_fpga_mem_{read|write}_i2c()
the "err" return value will not be initialized, which triggers gcc
warnings:

[..]/mlx5/core/fpga/sdk.c:87 mlx5_fpga_mem_read_i2c() error:
uninitialized symbol 'err'.
[..]/mlx5/core/fpga/sdk.c:115 mlx5_fpga_mem_write_i2c() error:
uninitialized symbol 'err'.

fix that.

Fixes: a9956d35d199 ('net/mlx5: FPGA, Add SBU infrastructure')
Signed-off-by: Kamal Heib <kamalh@mellanox.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
index 3c11d6e2160a..14962969c5ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
@@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
 	u8 actual_size;
 	int err;
 
+	if (!size)
+		return -EINVAL;
+
 	if (!fdev->mdev)
 		return -ENOTCONN;
 
@@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
 	u8 actual_size;
 	int err;
 
+	if (!size)
+		return -EINVAL;
+
 	if (!fdev->mdev)
 		return -ENOTCONN;
 

From 231243c82793428467524227ae02ca451e6a98e7 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 10 Nov 2017 15:59:52 +0900
Subject: [PATCH 289/808] Revert "mlx5: move affinity hints assignments to
 generic code"

Before the offending commit, mlx5 core did the IRQ affinity itself,
and it seems that the new generic code have some drawbacks and one
of them is the lack for user ability to modify irq affinity after
the initial affinity values got assigned.

The issue is still being discussed and a solution in the new generic code
is required, until then we need to revert this patch.

This fixes the following issue:
echo <new affinity> > /proc/irq/<x>/smp_affinity
fails with  -EIO

This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664.
Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since
it is used in mlx5_ib driver.

Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code")
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jes Sorensen <jsorensen@fb.com>
Reported-by: Jes Sorensen <jsorensen@fb.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  1 +
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 45 ++++++-----
 .../net/ethernet/mellanox/mlx5/core/main.c    | 75 +++++++++++++++++--
 include/linux/mlx5/driver.h                   |  1 +
 4 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c0872b3284cb..43f9054830e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -590,6 +590,7 @@ struct mlx5e_channel {
 	struct mlx5_core_dev      *mdev;
 	struct hwtstamp_config    *tstamp;
 	int                        ix;
+	int                        cpu;
 };
 
 struct mlx5e_channels {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d2b057a3e512..cbec66bc82f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -71,11 +71,6 @@ struct mlx5e_channel_param {
 	struct mlx5e_cq_param      icosq_cq;
 };
 
-static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
-{
-	return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
-}
-
 static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 {
 	return MLX5_CAP_GEN(mdev, striding_rq) &&
@@ -444,17 +439,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 	int mtt_sz = mlx5e_get_wqe_mtt_sz();
 	int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
-	int node = mlx5e_get_node(c->priv, c->ix);
 	int i;
 
 	rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
-					GFP_KERNEL, node);
+				      GFP_KERNEL, cpu_to_node(c->cpu));
 	if (!rq->mpwqe.info)
 		goto err_out;
 
 	/* We allocate more than mtt_sz as we will align the pointer */
-	rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
-					GFP_KERNEL, node);
+	rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
+					cpu_to_node(c->cpu));
 	if (unlikely(!rq->mpwqe.mtt_no_align))
 		goto err_free_wqe_info;
 
@@ -562,7 +556,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	int err;
 	int i;
 
-	rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
 
 	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
 				&rq->wq_ctrl);
@@ -629,8 +623,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
-				     GFP_KERNEL,
-				     mlx5e_get_node(c->priv, c->ix));
+				     GFP_KERNEL, cpu_to_node(c->cpu));
 		if (!rq->wqe.frag_info) {
 			err = -ENOMEM;
 			goto err_rq_wq_destroy;
@@ -1000,13 +993,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1053,13 +1046,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 	sq->channel   = c;
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1126,13 +1119,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db    = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1504,8 +1497,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = c->priv->mdev;
 	int err;
 
-	param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
-	param->wq.db_numa_node  = mlx5e_get_node(c->priv, c->ix);
+	param->wq.buf_numa_node = cpu_to_node(c->cpu);
+	param->wq.db_numa_node  = cpu_to_node(c->cpu);
 	param->eq_ix   = c->ix;
 
 	err = mlx5e_alloc_cq_common(mdev, param, cq);
@@ -1604,6 +1597,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
 	mlx5e_free_cq(cq);
 }
 
+static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
+{
+	return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
+}
+
 static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
 			     struct mlx5e_params *params,
 			     struct mlx5e_channel_param *cparam)
@@ -1752,12 +1750,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 {
 	struct mlx5e_cq_moder icocq_moder = {0, 0};
 	struct net_device *netdev = priv->netdev;
+	int cpu = mlx5e_get_cpu(priv, ix);
 	struct mlx5e_channel *c;
 	unsigned int irq;
 	int err;
 	int eqn;
 
-	c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
+	c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
 	if (!c)
 		return -ENOMEM;
 
@@ -1765,6 +1764,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->mdev     = priv->mdev;
 	c->tstamp   = &priv->tstamp;
 	c->ix       = ix;
+	c->cpu      = cpu;
 	c->pdev     = &priv->mdev->pdev->dev;
 	c->netdev   = priv->netdev;
 	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
@@ -1853,8 +1853,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
 	for (tc = 0; tc < c->num_tc; tc++)
 		mlx5e_activate_txqsq(&c->sq[tc]);
 	mlx5e_activate_rq(&c->rq);
-	netif_set_xps_queue(c->netdev,
-		mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
+	netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
 }
 
 static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 5f323442cc5a..8a89c7e8cd63 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -317,9 +317,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_eq_table *table = &priv->eq_table;
-	struct irq_affinity irqdesc = {
-		.pre_vectors = MLX5_EQ_VEC_COMP_BASE,
-	};
 	int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int nvec;
 
@@ -333,10 +330,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 	if (!priv->irq_info)
 		goto err_free_msix;
 
-	nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
+	nvec = pci_alloc_irq_vectors(dev->pdev,
 			MLX5_EQ_VEC_COMP_BASE + 1, nvec,
-			PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
-			&irqdesc);
+			PCI_IRQ_MSIX);
 	if (nvec < 0)
 		return nvec;
 
@@ -622,6 +618,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
 	return (u64)timer_l | (u64)timer_h1 << 32;
 }
 
+static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
+
+	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
+		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
+		return -ENOMEM;
+	}
+
+	cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
+			priv->irq_info[i].mask);
+
+	if (IS_ENABLED(CONFIG_SMP) &&
+	    irq_set_affinity_hint(irq, priv->irq_info[i].mask))
+		mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
+
+	return 0;
+}
+
+static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
+
+	irq_set_affinity_hint(irq, NULL);
+	free_cpumask_var(priv->irq_info[i].mask);
+}
+
+static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
+		err = mlx5_irq_set_affinity_hint(mdev, i);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	for (i--; i >= 0; i--)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+
+	return err;
+}
+
+static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+}
+
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 		    unsigned int *irqn)
 {
@@ -1097,6 +1150,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_stop_eqs;
 	}
 
+	err = mlx5_irq_set_affinity_hints(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
+		goto err_affinity_hints;
+	}
+
 	err = mlx5_init_fs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to init flow steering\n");
@@ -1154,6 +1213,9 @@ err_sriov:
 	mlx5_cleanup_fs(dev);
 
 err_fs:
+	mlx5_irq_clear_affinity_hints(dev);
+
+err_affinity_hints:
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -1222,6 +1284,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 
 	mlx5_sriov_detach(dev);
 	mlx5_cleanup_fs(dev);
+	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_put_uars_page(dev, priv->uar);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a886b51511ab..40a6f33c4cde 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -556,6 +556,7 @@ struct mlx5_core_sriov {
 };
 
 struct mlx5_irq_info {
+	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
 };
 

From 37e92a9d4fe38dc3e7308913575983a6a088c8d4 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 13 Nov 2017 10:11:27 +0200
Subject: [PATCH 290/808] net/mlx5: Fix rate limit packet pacing naming and
 struct

In mlx5_ifc, struct size was not complete, and thus driver was sending
garbage after the last defined field. Fixed it by adding reserved field
to complete the struct size.

In addition, rename all set_rate_limit to set_pp_rate_limit to be
compliant with the Firmware <-> Driver definition.

Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/rl.c  | 22 +++++++++----------
 include/linux/mlx5/mlx5_ifc.h                 |  8 ++++---
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 1fffdebbc9e8..e9a1fbcc4adf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
 	case MLX5_CMD_OP_ALLOC_Q_COUNTER:
 	case MLX5_CMD_OP_QUERY_Q_COUNTER:
-	case MLX5_CMD_OP_SET_RATE_LIMIT:
+	case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
 	case MLX5_CMD_OP_QUERY_RATE_LIMIT:
 	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
 	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
@@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
 	MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
 	MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
-	MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
+	MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
 	MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
 	MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
 	MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
index e651e4c02867..d3c33e9eea72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
 	return ret_entry;
 }
 
-static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
+static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
 				   u32 rate, u16 index)
 {
-	u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
 
-	MLX5_SET(set_rate_limit_in, in, opcode,
-		 MLX5_CMD_OP_SET_RATE_LIMIT);
-	MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
-	MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
+	MLX5_SET(set_pp_rate_limit_in, in, opcode,
+		 MLX5_CMD_OP_SET_PP_RATE_LIMIT);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
@@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index)
 		entry->refcount++;
 	} else {
 		/* new rate limit */
-		err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
+		err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
 		if (err) {
 			mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
 				      rate, err);
@@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate)
 	entry->refcount--;
 	if (!entry->refcount) {
 		/* need to remove rate */
-		mlx5_set_rate_limit_cmd(dev, 0, entry->index);
+		mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
 		entry->rate = 0;
 	}
 
@@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
 	/* Clear all configured rates */
 	for (i = 0; i < table->max_size; i++)
 		if (table->rl_entry[i].rate)
-			mlx5_set_rate_limit_cmd(dev, 0,
-						table->rl_entry[i].index);
+			mlx5_set_pp_rate_limit_cmd(dev, 0,
+						   table->rl_entry[i].index);
 
 	kfree(dev->priv.rl_table.rl_entry);
 }
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 38a7577a9ce7..d44ec5f41d4a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -147,7 +147,7 @@ enum {
 	MLX5_CMD_OP_ALLOC_Q_COUNTER               = 0x771,
 	MLX5_CMD_OP_DEALLOC_Q_COUNTER             = 0x772,
 	MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
-	MLX5_CMD_OP_SET_RATE_LIMIT                = 0x780,
+	MLX5_CMD_OP_SET_PP_RATE_LIMIT             = 0x780,
 	MLX5_CMD_OP_QUERY_RATE_LIMIT              = 0x781,
 	MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT      = 0x782,
 	MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT     = 0x783,
@@ -7239,7 +7239,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
 	u8         vxlan_udp_port[0x10];
 };
 
-struct mlx5_ifc_set_rate_limit_out_bits {
+struct mlx5_ifc_set_pp_rate_limit_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
@@ -7248,7 +7248,7 @@ struct mlx5_ifc_set_rate_limit_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_set_rate_limit_in_bits {
+struct mlx5_ifc_set_pp_rate_limit_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
@@ -7261,6 +7261,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
 	u8         reserved_at_60[0x20];
 
 	u8         rate_limit[0x20];
+
+	u8         reserved_at_a0[0x160];
 };
 
 struct mlx5_ifc_access_register_out_bits {

From ff0891915cd7b24ab27eee9b360c0452853bf9f6 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 26 Oct 2017 09:56:34 -0500
Subject: [PATCH 291/808] net/mlx5e: Fix ETS BW check

Fix bug that allows ets bw sum to be 0% when ets tc type exists.

Fixes: 08fb1dacdd76 ('net/mlx5e: Support DCBNL IEEE ETS')
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index c6d90b6dd80e..9bcf38f4123b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -274,6 +274,7 @@ int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets)
 static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 				    struct ieee_ets *ets)
 {
+	bool have_ets_tc = false;
 	int bw_sum = 0;
 	int i;
 
@@ -288,11 +289,14 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 	}
 
 	/* Validate Bandwidth Sum */
-	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
-		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS)
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
+			have_ets_tc = true;
 			bw_sum += ets->tc_tx_bw[i];
+		}
+	}
 
-	if (bw_sum != 0 && bw_sum != 100) {
+	if (have_ets_tc && bw_sum != 100) {
 		netdev_err(netdev,
 			   "Failed to validate ETS: BW sum is illegal\n");
 		return -EINVAL;

From 2989ad1ec03021ee6d2193c35414f1d970a243de Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 21 Nov 2017 17:49:36 +0200
Subject: [PATCH 292/808] net/mlx5e: Fix features check of IPv6 traffic

The assumption that the next header field contains the transport
protocol is wrong for IPv6 packets with extension headers.
Instead, we should look the inner-most next header field in the buffer.
This will fix TSO offload for tunnels over IPv6 with extension headers.

Performance testing: 19.25x improvement, cool!
Measuring bandwidth of 16 threads TCP traffic over IPv6 GRE tap.
CPU: Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
NIC: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
TSO: Enabled
Before: 4,926.24  Mbps
Now   : 94,827.91 Mbps

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cbec66bc82f1..c535a44ab8ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3678,6 +3678,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 						     struct sk_buff *skb,
 						     netdev_features_t features)
 {
+	unsigned int offset = 0;
 	struct udphdr *udph;
 	u8 proto;
 	u16 port;
@@ -3687,7 +3688,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 		proto = ip_hdr(skb)->protocol;
 		break;
 	case htons(ETH_P_IPV6):
-		proto = ipv6_hdr(skb)->nexthdr;
+		proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
 		break;
 	default:
 		goto out;

From 696a97cf9f5c551fca257e0d4aa07b5cbde6084a Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Tue, 14 Nov 2017 09:44:55 +0200
Subject: [PATCH 293/808] net/mlx5e: Fix defaulting RX ring size when not
 needed

Fixes the bug when turning on/off CQE compression mechanism
resets the RX rings size to default value when it is not
needed.

Fixes: 2fc4bfb7250d ("net/mlx5e: Dynamic RQ type infrastructure")
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  8 ++++++--
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c  | 10 ++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 +++++++--------
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c |  2 +-
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 43f9054830e5..543060c305a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -82,6 +82,9 @@
 	max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
 #define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
 #define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
+	(cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
+	MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
 
 #define MLX5_MPWRQ_LOG_WQE_SZ			18
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
@@ -936,8 +939,9 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
-void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
-			      struct mlx5e_params *params, u8 rq_type);
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params,
+			       u8 rq_type);
 
 static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 23425f028405..8f05efa5c829 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1523,8 +1523,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
 	new_channels.params = priv->channels.params;
 	MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val);
 
-	mlx5e_set_rq_type_params(priv->mdev, &new_channels.params,
-				 new_channels.params.rq_wq_type);
+	new_channels.params.mpwqe_log_stride_sz =
+		MLX5E_MPWQE_STRIDE_SZ(priv->mdev, new_val);
+	new_channels.params.mpwqe_log_num_strides =
+		MLX5_MPWRQ_LOG_WQE_SZ - new_channels.params.mpwqe_log_stride_sz;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
 		priv->channels.params = new_channels.params;
@@ -1536,6 +1538,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
 		return err;
 
 	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n",
+		  MLX5E_GET_PFLAG(&priv->channels.params,
+				  MLX5E_PFLAG_RX_CQE_COMPRESS) ? "ON" : "OFF");
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c535a44ab8ac..d9d8227f195f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -78,8 +78,8 @@ static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 		MLX5_CAP_ETH(mdev, reg_umr_sq);
 }
 
-void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
-			      struct mlx5e_params *params, u8 rq_type)
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params, u8 rq_type)
 {
 	params->rq_wq_type = rq_type;
 	params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
@@ -88,10 +88,8 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
 		params->log_rq_size = is_kdump_kernel() ?
 			MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW :
 			MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
-		params->mpwqe_log_stride_sz =
-			MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
-			MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) :
-			MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
+		params->mpwqe_log_stride_sz = MLX5E_MPWQE_STRIDE_SZ(mdev,
+			MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
 		params->mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
 			params->mpwqe_log_stride_sz;
 		break;
@@ -115,13 +113,14 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
 		       MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
 }
 
-static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
+static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev,
+				struct mlx5e_params *params)
 {
 	u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(mdev) &&
 		    !params->xdp_prog && !MLX5_IPSEC_DEV(mdev) ?
 		    MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
 		    MLX5_WQ_TYPE_LINKED_LIST;
-	mlx5e_set_rq_type_params(mdev, params, rq_type);
+	mlx5e_init_rq_type_params(mdev, params, rq_type);
 }
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index d2a66dc4adc6..8812d7208e8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -57,7 +57,7 @@ static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev,
 				   struct mlx5e_params *params)
 {
 	/* Override RQ params as IPoIB supports only LINKED LIST RQ for now */
-	mlx5e_set_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
+	mlx5e_init_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
 
 	/* RQ size in ipoib by default is 512 */
 	params->log_rq_size = is_kdump_kernel() ?

From 777ec2b2a3f2760505db395de1a9fa4115d74548 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Thu, 16 Nov 2017 14:57:48 +0200
Subject: [PATCH 294/808] net/mlx5: Fix misspelling in the error message and
 comment

Fix misspelling in word syndrome.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 60771865c99c..0308a2b4823c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -466,7 +466,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			break;
 		case MLX5_EVENT_TYPE_CQ_ERROR:
 			cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
-			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n",
+			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n",
 				       cqn, eqe->data.cq_err.syndrome);
 			mlx5_cq_event(dev, cqn, eqe->type);
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 1a0e797ad001..21d29f7936f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -241,7 +241,7 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	u32 fw;
 	int i;
 
-	/* If the syndrom is 0, the device is OK and no need to print buffer */
+	/* If the syndrome is 0, the device is OK and no need to print buffer */
 	if (!ioread8(&h->synd))
 		return;
 

From dbff26e44dc3ec4de6578733b054a0114652a764 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Mon, 4 Dec 2017 08:59:25 +0200
Subject: [PATCH 295/808] net/mlx5: Fix error flow in CREATE_QP command

In error flow, when DESTROY_QP command should be executed, the wrong
mailbox was set with data, not the one that is written to hardware,
Fix that.

Fixes: 09a7d9eca1a6 '{net,IB}/mlx5: QP/XRCD commands via mlx5 ifc'
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index db9e665ab104..889130edb715 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
 err_cmd:
 	memset(din, 0, sizeof(din));
 	memset(dout, 0, sizeof(dout));
-	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
-	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
 	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
 	return err;
 }

From 6323514116404cc651df1b7fffa1311ddf8ce647 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 23 Nov 2017 13:52:28 +0200
Subject: [PATCH 296/808] net/mlx5e: Fix possible deadlock of VXLAN lock

mlx5e_vxlan_lookup_port is called both from mlx5e_add_vxlan_port (user
context) and mlx5e_features_check (softirq), but the lock acquired does
not disable bottom half and might result in deadlock. Fix it by simply
replacing spin_lock() with spin_lock_bh().
While at it, replace all unnecessary spin_lock_irq() to spin_lock_bh().

lockdep's WARNING: inconsistent lock state
[  654.028136] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  654.028229] swapper/5/0 [HC0[0]:SC1[9]:HE1:SE0] takes:
[  654.028321]  (&(&vxlan_db->lock)->rlock){+.?.}, at: [<ffffffffa06e7f0e>] mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
[  654.028528] {SOFTIRQ-ON-W} state was registered at:
[  654.028607]   _raw_spin_lock+0x3c/0x70
[  654.028689]   mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
[  654.028794]   mlx5e_vxlan_add_port+0x2e/0x120 [mlx5_core]
[  654.028878]   process_one_work+0x1e9/0x640
[  654.028942]   worker_thread+0x4a/0x3f0
[  654.029002]   kthread+0x141/0x180
[  654.029056]   ret_from_fork+0x24/0x30
[  654.029114] irq event stamp: 579088
[  654.029174] hardirqs last  enabled at (579088): [<ffffffff818f475a>] ip6_finish_output2+0x49a/0x8c0
[  654.029309] hardirqs last disabled at (579087): [<ffffffff818f470e>] ip6_finish_output2+0x44e/0x8c0
[  654.029446] softirqs last  enabled at (579030): [<ffffffff810b3b3d>] irq_enter+0x6d/0x80
[  654.029567] softirqs last disabled at (579031): [<ffffffff810b3c05>] irq_exit+0xb5/0xc0
[  654.029684] other info that might help us debug this:
[  654.029781]  Possible unsafe locking scenario:

[  654.029868]        CPU0
[  654.029908]        ----
[  654.029947]   lock(&(&vxlan_db->lock)->rlock);
[  654.030045]   <Interrupt>
[  654.030090]     lock(&(&vxlan_db->lock)->rlock);
[  654.030162]
 *** DEADLOCK ***

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 07a9ba6cfc70..f8238275759f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
 	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
 	struct mlx5e_vxlan *vxlan;
 
-	spin_lock(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	vxlan = radix_tree_lookup(&vxlan_db->tree, port);
-	spin_unlock(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 
 	return vxlan;
 }
@@ -100,9 +100,9 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
 
 	vxlan->udp_port = port;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 	if (err)
 		goto err_free;
 
@@ -121,9 +121,9 @@ static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
 	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
 	struct mlx5e_vxlan *vxlan;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	vxlan = radix_tree_delete(&vxlan_db->tree, port);
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 
 	if (!vxlan)
 		return;
@@ -171,12 +171,12 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 	struct mlx5e_vxlan *vxlan;
 	unsigned int port = 0;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
 		port = vxlan->udp_port;
-		spin_unlock_irq(&vxlan_db->lock);
+		spin_unlock_bh(&vxlan_db->lock);
 		__mlx5e_vxlan_core_del_port(priv, (u16)port);
-		spin_lock_irq(&vxlan_db->lock);
+		spin_lock_bh(&vxlan_db->lock);
 	}
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 }

From 23f4cc2cd9ed92570647220aca60d0197d8c1fa9 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 3 Dec 2017 13:58:50 +0200
Subject: [PATCH 297/808] net/mlx5e: Add refcount to VXLAN structure

A refcount mechanism must be implemented in order to prevent unwanted
scenarios such as:
- Open an IPv4 VXLAN interface
- Open an IPv6 VXLAN interface (different socket)
- Remove one of the interfaces

With current implementation, the UDP port will be removed from our VXLAN
database and turn off the offloads for the other interface, which is
still active.
The reference count mechanism will only allow UDP port removals once all
consumers are gone.

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/vxlan.c   | 54 ++++++++++---------
 .../net/ethernet/mellanox/mlx5/core/vxlan.h   |  1 +
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index f8238275759f..25f782344667 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -88,8 +88,11 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
 	struct mlx5e_vxlan *vxlan;
 	int err;
 
-	if (mlx5e_vxlan_lookup_port(priv, port))
+	vxlan = mlx5e_vxlan_lookup_port(priv, port);
+	if (vxlan) {
+		atomic_inc(&vxlan->refcount);
 		goto free_work;
+	}
 
 	if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
 		goto free_work;
@@ -99,6 +102,7 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
 		goto err_delete_port;
 
 	vxlan->udp_port = port;
+	atomic_set(&vxlan->refcount, 1);
 
 	spin_lock_bh(&vxlan_db->lock);
 	err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
@@ -116,32 +120,33 @@ free_work:
 	kfree(vxlan_work);
 }
 
-static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
-{
-	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
-	struct mlx5e_vxlan *vxlan;
-
-	spin_lock_bh(&vxlan_db->lock);
-	vxlan = radix_tree_delete(&vxlan_db->tree, port);
-	spin_unlock_bh(&vxlan_db->lock);
-
-	if (!vxlan)
-		return;
-
-	mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port);
-
-	kfree(vxlan);
-}
-
 static void mlx5e_vxlan_del_port(struct work_struct *work)
 {
 	struct mlx5e_vxlan_work *vxlan_work =
 		container_of(work, struct mlx5e_vxlan_work, work);
-	struct mlx5e_priv *priv = vxlan_work->priv;
+	struct mlx5e_priv *priv         = vxlan_work->priv;
+	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
 	u16 port = vxlan_work->port;
+	struct mlx5e_vxlan *vxlan;
+	bool remove = false;
 
-	__mlx5e_vxlan_core_del_port(priv, port);
+	spin_lock_bh(&vxlan_db->lock);
+	vxlan = radix_tree_lookup(&vxlan_db->tree, port);
+	if (!vxlan)
+		goto out_unlock;
 
+	if (atomic_dec_and_test(&vxlan->refcount)) {
+		radix_tree_delete(&vxlan_db->tree, port);
+		remove = true;
+	}
+
+out_unlock:
+	spin_unlock_bh(&vxlan_db->lock);
+
+	if (remove) {
+		mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+		kfree(vxlan);
+	}
 	kfree(vxlan_work);
 }
 
@@ -171,12 +176,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 	struct mlx5e_vxlan *vxlan;
 	unsigned int port = 0;
 
-	spin_lock_bh(&vxlan_db->lock);
+	/* Lockless since we are the only radix-tree consumers, wq is disabled */
 	while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
 		port = vxlan->udp_port;
-		spin_unlock_bh(&vxlan_db->lock);
-		__mlx5e_vxlan_core_del_port(priv, (u16)port);
-		spin_lock_bh(&vxlan_db->lock);
+		radix_tree_delete(&vxlan_db->tree, port);
+		mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+		kfree(vxlan);
 	}
-	spin_unlock_bh(&vxlan_db->lock);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
index 5def12c048e3..5ef6ae7d568a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
@@ -36,6 +36,7 @@
 #include "en.h"
 
 struct mlx5e_vxlan {
+	atomic_t refcount;
 	u16 udp_port;
 };
 

From 0c1cc8b2215f5122ca614b5adca60346018758c3 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Mon, 4 Dec 2017 09:57:43 +0200
Subject: [PATCH 298/808] net/mlx5e: Prevent possible races in VXLAN control
 flow

When calling add/remove VXLAN port, a lock must be held in order to
prevent race scenarios when more than one add/remove happens at the
same time.
Fix by holding our state_lock (mutex) as done by all other parts of the
driver.
Note that the spinlock protecting the radix-tree is still needed in
order to synchronize radix-tree access from softirq context.

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 25f782344667..2f74953e4561 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -88,6 +88,7 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
 	struct mlx5e_vxlan *vxlan;
 	int err;
 
+	mutex_lock(&priv->state_lock);
 	vxlan = mlx5e_vxlan_lookup_port(priv, port);
 	if (vxlan) {
 		atomic_inc(&vxlan->refcount);
@@ -117,6 +118,7 @@ err_free:
 err_delete_port:
 	mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 free_work:
+	mutex_unlock(&priv->state_lock);
 	kfree(vxlan_work);
 }
 
@@ -130,6 +132,7 @@ static void mlx5e_vxlan_del_port(struct work_struct *work)
 	struct mlx5e_vxlan *vxlan;
 	bool remove = false;
 
+	mutex_lock(&priv->state_lock);
 	spin_lock_bh(&vxlan_db->lock);
 	vxlan = radix_tree_lookup(&vxlan_db->tree, port);
 	if (!vxlan)
@@ -147,6 +150,7 @@ out_unlock:
 		mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 		kfree(vxlan);
 	}
+	mutex_unlock(&priv->state_lock);
 	kfree(vxlan_work);
 }
 

From 139ed6c6c46aa3d8970a086b8e0cf1f3522f5d4a Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Tue, 5 Dec 2017 13:45:21 +0200
Subject: [PATCH 299/808] net/mlx5: Fix steering memory leak

Flow steering priority and namespace are software only objects that
didn't have the proper destructors and were not freed during steering
cleanup.

Fix it by adding destructor functions for these objects.

Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel")
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/fs_core.c    | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index c70fd663a633..dfaad9ecb2b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -174,6 +174,8 @@ static void del_hw_fte(struct fs_node *node);
 static void del_sw_flow_table(struct fs_node *node);
 static void del_sw_flow_group(struct fs_node *node);
 static void del_sw_fte(struct fs_node *node);
+static void del_sw_prio(struct fs_node *node);
+static void del_sw_ns(struct fs_node *node);
 /* Delete rule (destination) is special case that 
  * requires to lock the FTE for all the deletion process.
  */
@@ -408,6 +410,16 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
 	return NULL;
 }
 
+static void del_sw_ns(struct fs_node *node)
+{
+	kfree(node);
+}
+
+static void del_sw_prio(struct fs_node *node)
+{
+	kfree(node);
+}
+
 static void del_hw_flow_table(struct fs_node *node)
 {
 	struct mlx5_flow_table *ft;
@@ -2064,7 +2076,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 		return ERR_PTR(-ENOMEM);
 
 	fs_prio->node.type = FS_TYPE_PRIO;
-	tree_init_node(&fs_prio->node, NULL, NULL);
+	tree_init_node(&fs_prio->node, NULL, del_sw_prio);
 	tree_add_node(&fs_prio->node, &ns->node);
 	fs_prio->num_levels = num_levels;
 	fs_prio->prio = prio;
@@ -2090,7 +2102,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
 		return ERR_PTR(-ENOMEM);
 
 	fs_init_namespace(ns);
-	tree_init_node(&ns->node, NULL, NULL);
+	tree_init_node(&ns->node, NULL, del_sw_ns);
 	tree_add_node(&ns->node, &prio->node);
 	list_add_tail(&ns->node.list, &prio->node.children);
 

From d6b2785cd55ee72e9608762650b3ef299f801b1b Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Tue, 21 Nov 2017 15:15:51 +0200
Subject: [PATCH 300/808] net/mlx5: Cleanup IRQs in case of unload failure

When mlx5_stop_eqs fails to destroy any of the eqs it returns with an error.
In such failure flow the function will return without
releasing all EQs irqs and then pci_free_irq_vectors will fail.
Fix by only warn on destroy EQ failure and continue to release other
EQs and their irqs.

It fixes the following kernel trace:
kernel: kernel BUG at drivers/pci/msi.c:352!
...
...
kernel: Call Trace:
kernel: pci_disable_msix+0xd3/0x100
kernel: pci_free_irq_vectors+0xe/0x20
kernel: mlx5_load_one.isra.17+0x9f5/0xec0 [mlx5_core]

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 20 +++++++++++++-------
 include/linux/mlx5/driver.h                  |  2 +-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 0308a2b4823c..ab4d1465b7e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -775,7 +775,7 @@ err1:
 	return err;
 }
 
-int mlx5_stop_eqs(struct mlx5_core_dev *dev)
+void mlx5_stop_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	int err;
@@ -784,22 +784,28 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, pg)) {
 		err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq);
 		if (err)
-			return err;
+			mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n",
+				      err);
 	}
 #endif
 
 	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
 	if (err)
-		return err;
+		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
+			      err);
 
-	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+	err = mlx5_destroy_unmap_eq(dev, &table->async_eq);
+	if (err)
+		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
+			      err);
 	mlx5_cmd_use_polling(dev);
 
 	err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
-	if (err)
+	if (err) {
+		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
+			      err);
 		mlx5_cmd_use_events(dev);
-
-	return err;
+	}
 }
 
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 40a6f33c4cde..57b109c6e422 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1049,7 +1049,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 		       enum mlx5_eq_type type);
 int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_start_eqs(struct mlx5_core_dev *dev);
-int mlx5_stop_eqs(struct mlx5_core_dev *dev);
+void mlx5_stop_eqs(struct mlx5_core_dev *dev);
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 		    unsigned int *irqn);
 int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);

From a2fba188fd5eadd6061bef4f2f2577a43231ebf3 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Mon, 4 Dec 2017 15:23:51 +0200
Subject: [PATCH 301/808] net/mlx5: Stay in polling mode when command EQ
 destroy fails

During unload, on mlx5_stop_eqs we move command interface from events
mode to polling mode, but if command interface EQ destroy fail we move
back to events mode.
That's wrong since even if we fail to destroy command interface EQ, we
do release its irq, so no interrupts will be received.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index ab4d1465b7e4..e7e7cef2bde4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -801,11 +801,9 @@ void mlx5_stop_eqs(struct mlx5_core_dev *dev)
 	mlx5_cmd_use_polling(dev);
 
 	err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
-	if (err) {
+	if (err)
 		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
 			      err);
-		mlx5_cmd_use_events(dev);
-	}
 }
 
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,

From 4ef928929987c19fff4d3c1650f139560ba1cc13 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Wed, 20 Dec 2017 08:38:46 +1000
Subject: [PATCH 302/808] drm/nouveau: fix obvious memory leak

fdo#104340.

Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_vmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c
index 9e2628dd8e4d..f5371d96b003 100644
--- a/drivers/gpu/drm/nouveau/nouveau_vmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c
@@ -67,8 +67,8 @@ nouveau_vma_del(struct nouveau_vma **pvma)
 			nvif_vmm_put(&vma->vmm->vmm, &tmp);
 		}
 		list_del(&vma->head);
-		*pvma = NULL;
 		kfree(*pvma);
+		*pvma = NULL;
 	}
 }
 

From 19deaa217bc04e83b59b5e8c8229eb0e53ad9efc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 19 Dec 2017 15:07:10 -0800
Subject: [PATCH 303/808] libnvdimm, pfn: fix start_pad handling for aligned
 namespaces

The alignment checks at pfn driver startup fail to properly account for
the 'start_pad' in the case where the namespace is misaligned relative
to its internal alignment. This is typically triggered in 1G aligned
namespace, but could theoretically trigger with small namespace
alignments. When this triggers the kernel reports messages of the form:

    dax2.1: bad offset: 0x3c000000 dax disabled align: 0x40000000

Cc: <stable@vger.kernel.org>
Fixes: 1ee6667cd8d1 ("libnvdimm, pfn, dax: fix initialization vs autodetect...")
Reported-by: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 65cc171c721d..db2fc7c02e01 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -364,9 +364,9 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
 int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
 	u64 checksum, offset;
-	unsigned long align;
 	enum nd_pfn_mode mode;
 	struct nd_namespace_io *nsio;
+	unsigned long align, start_pad;
 	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
@@ -410,6 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 
 	align = le32_to_cpu(pfn_sb->align);
 	offset = le64_to_cpu(pfn_sb->dataoff);
+	start_pad = le32_to_cpu(pfn_sb->start_pad);
 	if (align == 0)
 		align = 1UL << ilog2(offset);
 	mode = le32_to_cpu(pfn_sb->mode);
@@ -468,7 +469,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		return -EBUSY;
 	}
 
-	if ((align && !IS_ALIGNED(offset, align))
+	if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
 			|| !IS_ALIGNED(offset, PAGE_SIZE)) {
 		dev_err(&nd_pfn->dev,
 				"bad offset: %#llx dax disabled align: %#lx\n",

From 19c832ed9b8f7b49fa5eeef06b4338af5fe5c1dc Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 19 Dec 2017 15:22:03 -0500
Subject: [PATCH 304/808] bpf: Fix tools and testing build.

I'm getting various build failures on sparc64.  The key is
usually that the userland tools get built 32-bit.

1) clock_gettime() is in librt, so that must be added to the link
   libraries.

2) "sizeof(x)" must be printed with "%Z" printf prefix.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile     | 2 +-
 tools/testing/selftests/bpf/test_progs.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 792af7c3b74f..05fc4e2e7b3a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -11,7 +11,7 @@ ifneq ($(wildcard $(GENHDR)),)
 endif
 
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
-LDLIBS += -lcap -lelf
+LDLIBS += -lcap -lelf -lrt
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 69427531408d..6761be18a91f 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -351,7 +351,7 @@ static void test_bpf_obj_id(void)
 			  info_len != sizeof(struct bpf_map_info) ||
 			  strcmp((char *)map_infos[i].name, expected_map_name),
 			  "get-map-info(fd)",
-			  "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n",
+			  "err %d errno %d type %d(%d) info_len %u(%Zu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n",
 			  err, errno,
 			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
 			  info_len, sizeof(struct bpf_map_info),
@@ -395,7 +395,7 @@ static void test_bpf_obj_id(void)
 			  *(int *)prog_infos[i].map_ids != map_infos[i].id ||
 			  strcmp((char *)prog_infos[i].name, expected_prog_name),
 			  "get-prog-info(fd)",
-			  "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n",
+			  "err %d errno %d i %d type %d(%d) info_len %u(%Zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n",
 			  err, errno, i,
 			  prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER,
 			  info_len, sizeof(struct bpf_prog_info),
@@ -463,7 +463,7 @@ static void test_bpf_obj_id(void)
 		      memcmp(&prog_info, &prog_infos[i], info_len) ||
 		      *(int *)prog_info.map_ids != saved_map_id,
 		      "get-prog-info(next_id->fd)",
-		      "err %d errno %d info_len %u(%lu) memcmp %d map_id %u(%u)\n",
+		      "err %d errno %d info_len %u(%Zu) memcmp %d map_id %u(%u)\n",
 		      err, errno, info_len, sizeof(struct bpf_prog_info),
 		      memcmp(&prog_info, &prog_infos[i], info_len),
 		      *(int *)prog_info.map_ids, saved_map_id);
@@ -509,7 +509,7 @@ static void test_bpf_obj_id(void)
 		      memcmp(&map_info, &map_infos[i], info_len) ||
 		      array_value != array_magic_value,
 		      "check get-map-info(next_id->fd)",
-		      "err %d errno %d info_len %u(%lu) memcmp %d array_value %llu(%llu)\n",
+		      "err %d errno %d info_len %u(%Zu) memcmp %d array_value %llu(%llu)\n",
 		      err, errno, info_len, sizeof(struct bpf_map_info),
 		      memcmp(&map_info, &map_infos[i], info_len),
 		      array_value, array_magic_value);

From 41fce90f26333c4fa82e8e43b9ace86c4e8a0120 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 4 Dec 2017 14:07:43 -0800
Subject: [PATCH 305/808] libnvdimm, dax: fix 1GB-aligned namespaces vs
 physical misalignment

The following namespace configuration attempt:

    # ndctl create-namespace -e namespace0.0 -m devdax -a 1G -f
    libndctl: ndctl_dax_enable: dax0.1: failed to enable
      Error: namespace0.0: failed to enable

    failed to reconfigure namespace: No such device or address

...fails when the backing memory range is not physically aligned to 1G:

    # cat /proc/iomem | grep Persistent
    210000000-30fffffff : Persistent Memory (legacy)

In the above example the 4G persistent memory range starts and ends on a
256MB boundary.

We handle this case correctly when needing to handle cases that violate
section alignment (128MB) collisions against "System RAM", and we simply
need to extend that padding/truncation for the 1GB alignment use case.

Cc: <stable@vger.kernel.org>
Fixes: 315c562536c4 ("libnvdimm, pfn: add 'align' attribute...")
Reported-and-tested-by: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pfn_devs.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index db2fc7c02e01..2adada1a5855 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -583,6 +583,12 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
 	return altmap;
 }
 
+static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
+{
+	return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
+			ALIGN_DOWN(phys, nd_pfn->align));
+}
+
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
 	u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0;
@@ -638,13 +644,16 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	start = nsio->res.start;
 	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
 	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED) {
+				IORES_DESC_NONE) == REGION_MIXED
+			|| !IS_ALIGNED(start + resource_size(&nsio->res),
+				nd_pfn->align)) {
 		size = resource_size(&nsio->res);
-		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
+		end_trunc = start + size - phys_pmem_align_down(nd_pfn,
+				start + size);
 	}
 
 	if (start_pad + end_trunc)
-		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
+		dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
 				dev_name(&ndns->dev), start_pad + end_trunc);
 
 	/*

From 10a7e9d849150a2879efc0b04d8a51068c9dd0c5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 19 Dec 2017 13:52:23 -0800
Subject: [PATCH 306/808] Do not hash userspace addresses in fault handlers

The hashing of %p was designed to restrict kernel addresses. There is
no reason to hash the userspace values seen during a segfault report,
so switch these to %px. (Some architectures already use %lx.)

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sparc/mm/fault_32.c | 2 +-
 arch/sparc/mm/fault_64.c | 2 +-
 arch/um/kernel/trap.c    | 2 +-
 arch/x86/mm/fault.c      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index be3136f142a9..a8103a84b4ac 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->pc, (void *)regs->u_regs[UREG_I7],
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 815c03d7a765..41363f46797b 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->tpc, (void *)regs->u_regs[UREG_I7],
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 4e6fcb32620f..428644175956 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
 		(void *)UPT_IP(regs), (void *)UPT_SP(regs),
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index febf6980e653..06fe3d51d385 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);

From d5aa24825da5711f8cb829f873160ddf1a29b19c Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Wed, 20 Dec 2017 06:11:59 +0000
Subject: [PATCH 307/808] ASoC: rsnd: fixup ADG register mask

BRGCKR should use 0x80770000, instead of 0x80FF0000.

R-Car Gen2 xxx_TIMSEL should use 0x0F1F,
R-Car Gen3 xxx_TIMSEL should use 0x1F1F.
Here, Gen3 doesn't support AVD, thus, both case can use 0x0F1F.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Reviewed-by: Hiroyuki Yokoyama <hiroyuki.yokoyama.vx@renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sh/rcar/adg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/soc/sh/rcar/adg.c b/sound/soc/sh/rcar/adg.c
index 8ddb08714faa..4672688cac32 100644
--- a/sound/soc/sh/rcar/adg.c
+++ b/sound/soc/sh/rcar/adg.c
@@ -222,7 +222,7 @@ int rsnd_adg_set_cmd_timsel_gen2(struct rsnd_mod *cmd_mod,
 				   NULL, &val, NULL);
 
 	val  = val	<< shift;
-	mask = 0xffff	<< shift;
+	mask = 0x0f1f	<< shift;
 
 	rsnd_mod_bset(adg_mod, CMDOUT_TIMSEL, mask, val);
 
@@ -250,7 +250,7 @@ int rsnd_adg_set_src_timesel_gen2(struct rsnd_mod *src_mod,
 
 	in   = in	<< shift;
 	out  = out	<< shift;
-	mask = 0xffff	<< shift;
+	mask = 0x0f1f	<< shift;
 
 	switch (id / 2) {
 	case 0:
@@ -380,7 +380,7 @@ int rsnd_adg_ssi_clk_try_start(struct rsnd_mod *ssi_mod, unsigned int rate)
 			ckr = 0x80000000;
 	}
 
-	rsnd_mod_bset(adg_mod, BRGCKR, 0x80FF0000, adg->ckr | ckr);
+	rsnd_mod_bset(adg_mod, BRGCKR, 0x80770000, adg->ckr | ckr);
 	rsnd_mod_write(adg_mod, BRRA,  adg->rbga);
 	rsnd_mod_write(adg_mod, BRRB,  adg->rbgb);
 

From b67336eee3fcb8ecedc6c13e2bf88aacfa3151e2 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Mon, 27 Nov 2017 09:33:03 +0000
Subject: [PATCH 308/808] MIPS: Validate PR_SET_FP_MODE prctl(2) requests
 against the ABI of the task

Fix an API loophole introduced with commit 9791554b45a2 ("MIPS,prctl:
add PR_[GS]ET_FP_MODE prctl options for MIPS"), where the caller of
prctl(2) is incorrectly allowed to make a change to CP0.Status.FR or
CP0.Config5.FRE register bits even if CONFIG_MIPS_O32_FP64_SUPPORT has
not been enabled, despite that an executable requesting the mode
requested via ELF file annotation would not be allowed to run in the
first place, or for n64 and n64 ABI tasks which do not have non-default
modes defined at all.  Add suitable checks to `mips_set_process_fp_mode'
and bail out if an invalid mode change has been requested for the ABI in
effect, even if the FPU hardware or emulation would otherwise allow it.

Always succeed however without taking any further action if the mode
requested is the same as one already in effect, regardless of whether
any mode change, should it be requested, would actually be allowed for
the task concerned.

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Fixes: 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS")
Reviewed-by: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <james.hogan@mips.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org # 4.0+
Patchwork: https://patchwork.linux-mips.org/patch/17800/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/process.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 45d0b6b037ee..57028d49c202 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value)
 	struct task_struct *t;
 	int max_users;
 
+	/* If nothing to change, return right away, successfully.  */
+	if (value == mips_get_process_fp_mode(task))
+		return 0;
+
+	/* Only accept a mode change if 64-bit FP enabled for o32.  */
+	if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
+		return -EOPNOTSUPP;
+
+	/* And only for o32 tasks.  */
+	if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS))
+		return -EOPNOTSUPP;
+
 	/* Check the value is valid */
 	if (value & ~known_bits)
 		return -EOPNOTSUPP;

From 2c08cd7c20968ddf71feeac2265b4741d2b3fdde Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Wed, 20 Dec 2017 11:52:47 +0100
Subject: [PATCH 309/808] drm/sun4i: hdmi: Move the mode_valid callback to the
 encoder

When attached to the connector, the mode_valid callback will only filter
the modes provided by the connector itself as part of its probe.

However, it will not be doing it when the mode is provided by the
userspace, which still might result in a broken configuration.

In order to enforce these constraints, move our mode_valid callback to the
encoder which doesn't have this behaviour.

Acked-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
[maxime: Wrote the commit log in order to update the patch from the merged
	 v3 to the v4 that was correct.]
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Link: https://patchwork.freedesktop.org/patch/msgid/0fa230a8-d01d-561a-f74f-6b4fd421255b@xs4all.nl
---
 drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c | 39 +++++++++++++-------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
index c12f9bd12904..500b6fb3e028 100644
--- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
+++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
@@ -175,11 +175,31 @@ static void sun4i_hdmi_mode_set(struct drm_encoder *encoder,
 	writel(val, hdmi->base + SUN4I_HDMI_VID_TIMING_POL_REG);
 }
 
+static enum drm_mode_status sun4i_hdmi_mode_valid(struct drm_encoder *encoder,
+					const struct drm_display_mode *mode)
+{
+	struct sun4i_hdmi *hdmi = drm_encoder_to_sun4i_hdmi(encoder);
+	unsigned long rate = mode->clock * 1000;
+	unsigned long diff = rate / 200; /* +-0.5% allowed by HDMI spec */
+	long rounded_rate;
+
+	/* 165 MHz is the typical max pixelclock frequency for HDMI <= 1.2 */
+	if (rate > 165000000)
+		return MODE_CLOCK_HIGH;
+	rounded_rate = clk_round_rate(hdmi->tmds_clk, rate);
+	if (rounded_rate > 0 &&
+	    max_t(unsigned long, rounded_rate, rate) -
+	    min_t(unsigned long, rounded_rate, rate) < diff)
+		return MODE_OK;
+	return MODE_NOCLOCK;
+}
+
 static const struct drm_encoder_helper_funcs sun4i_hdmi_helper_funcs = {
 	.atomic_check	= sun4i_hdmi_atomic_check,
 	.disable	= sun4i_hdmi_disable,
 	.enable		= sun4i_hdmi_enable,
 	.mode_set	= sun4i_hdmi_mode_set,
+	.mode_valid	= sun4i_hdmi_mode_valid,
 };
 
 static const struct drm_encoder_funcs sun4i_hdmi_funcs = {
@@ -208,27 +228,8 @@ static int sun4i_hdmi_get_modes(struct drm_connector *connector)
 	return ret;
 }
 
-static int sun4i_hdmi_mode_valid(struct drm_connector *connector,
-				 struct drm_display_mode *mode)
-{
-	struct sun4i_hdmi *hdmi = drm_connector_to_sun4i_hdmi(connector);
-	long rate = mode->clock * 1000;
-	long diff = rate / 200; /* +-0.5% allowed by HDMI spec */
-	long rounded_rate;
-
-	/* 165 MHz is the typical max pixelclock frequency for HDMI <= 1.2 */
-	if (rate > 165000000)
-		return MODE_CLOCK_HIGH;
-	rounded_rate = clk_round_rate(hdmi->tmds_clk, rate);
-	if (max(rounded_rate, rate) - min(rounded_rate, rate) < diff &&
-	    rounded_rate > 0)
-		return MODE_OK;
-	return MODE_NOCLOCK;
-}
-
 static const struct drm_connector_helper_funcs sun4i_hdmi_connector_helper_funcs = {
 	.get_modes	= sun4i_hdmi_get_modes,
-	.mode_valid	= sun4i_hdmi_mode_valid,
 };
 
 static enum drm_connector_status

From ce0769e0ea4b3e192466243a1a9fd39acf214f1e Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Date: Wed, 20 Dec 2017 10:35:43 +0100
Subject: [PATCH 310/808] drm/plane: Make framebuffer refcounting the
 responsibility of setplane_internal callers

lock_all_ctx in setplane_internal may return -EINTR, and
__setplane_internal could return -EDEADLK. Making more
special cases for fb would make the code even harder to
read, so the easiest solution is not taking over the fb
refcount, and making callers responsible for dropping
the ref.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102707
Fixes: 13736ba3b38b ("drm/legacy: Convert setplane ioctl locking to interruptible.")
Testcase: kms_atomic_interruptible
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171220093545.613-2-maarten.lankhorst@linux.intel.com
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_plane.c | 42 ++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 37a93cdffb4a..2c90519576a3 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -558,11 +558,10 @@ int drm_plane_check_pixel_format(const struct drm_plane *plane, u32 format)
 }
 
 /*
- * setplane_internal - setplane handler for internal callers
+ * __setplane_internal - setplane handler for internal callers
  *
- * Note that we assume an extra reference has already been taken on fb.  If the
- * update fails, this reference will be dropped before return; if it succeeds,
- * the previous framebuffer (if any) will be unreferenced instead.
+ * This function will take a reference on the new fb for the plane
+ * on success.
  *
  * src_{x,y,w,h} are provided in 16.16 fixed point format
  */
@@ -630,14 +629,12 @@ static int __setplane_internal(struct drm_plane *plane,
 	if (!ret) {
 		plane->crtc = crtc;
 		plane->fb = fb;
-		fb = NULL;
+		drm_framebuffer_get(plane->fb);
 	} else {
 		plane->old_fb = NULL;
 	}
 
 out:
-	if (fb)
-		drm_framebuffer_put(fb);
 	if (plane->old_fb)
 		drm_framebuffer_put(plane->old_fb);
 	plane->old_fb = NULL;
@@ -685,6 +682,7 @@ int drm_mode_setplane(struct drm_device *dev, void *data,
 	struct drm_plane *plane;
 	struct drm_crtc *crtc = NULL;
 	struct drm_framebuffer *fb = NULL;
+	int ret;
 
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		return -EINVAL;
@@ -717,15 +715,16 @@ int drm_mode_setplane(struct drm_device *dev, void *data,
 		}
 	}
 
-	/*
-	 * setplane_internal will take care of deref'ing either the old or new
-	 * framebuffer depending on success.
-	 */
-	return setplane_internal(plane, crtc, fb,
-				 plane_req->crtc_x, plane_req->crtc_y,
-				 plane_req->crtc_w, plane_req->crtc_h,
-				 plane_req->src_x, plane_req->src_y,
-				 plane_req->src_w, plane_req->src_h);
+	ret = setplane_internal(plane, crtc, fb,
+				plane_req->crtc_x, plane_req->crtc_y,
+				plane_req->crtc_w, plane_req->crtc_h,
+				plane_req->src_x, plane_req->src_y,
+				plane_req->src_w, plane_req->src_h);
+
+	if (fb)
+		drm_framebuffer_put(fb);
+
+	return ret;
 }
 
 static int drm_mode_cursor_universal(struct drm_crtc *crtc,
@@ -788,13 +787,12 @@ static int drm_mode_cursor_universal(struct drm_crtc *crtc,
 		src_h = fb->height << 16;
 	}
 
-	/*
-	 * setplane_internal will take care of deref'ing either the old or new
-	 * framebuffer depending on success.
-	 */
 	ret = __setplane_internal(crtc->cursor, crtc, fb,
-				crtc_x, crtc_y, crtc_w, crtc_h,
-				0, 0, src_w, src_h, ctx);
+				  crtc_x, crtc_y, crtc_w, crtc_h,
+				  0, 0, src_w, src_h, ctx);
+
+	if (fb)
+		drm_framebuffer_put(fb);
 
 	/* Update successful; save new cursor position, if necessary */
 	if (ret == 0 && req->flags & DRM_MODE_CURSOR_MOVE) {

From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 20 Dec 2017 07:09:19 -0800
Subject: [PATCH 311/808] cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC

While teaching css_task_iter to handle skipping over tasks which
aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to
css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a
silly bug.

CSS_TASK_ITER_PROCS is implemented by repeating
css_task_iter_advance() while the advanced cursor is pointing to a
non-leader thread.  However, the cursor variable, @l, wasn't updated
when the iteration has to advance to the next css_set and the
following repetition would operate on the terminal @l from the
previous iteration which isn't pointing to a valid task leading to
oopses like the following or infinite looping.

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000254
  IP: __task_pid_nr_ns+0xc7/0xf0
  PGD 0 P4D 0
  Oops: 0000 [#1] SMP
  ...
  CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1
  Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017
  task: ffff88c4baee8000 task.stack: ffff96d5c3158000
  RIP: 0010:__task_pid_nr_ns+0xc7/0xf0
  RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206
  RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250
  RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00
  RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005
  R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18
  R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000
  FS:  00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0
  Call Trace:
   cgroup_procs_show+0x19/0x30
   cgroup_seqfile_show+0x4c/0xb0
   kernfs_seq_show+0x21/0x30
   seq_read+0x2ec/0x3f0
   kernfs_fop_read+0x134/0x180
   __vfs_read+0x37/0x160
   ? security_file_permission+0x9b/0xc0
   vfs_read+0x8e/0x130
   SyS_read+0x55/0xc0
   entry_SYSCALL_64_fastpath+0x1a/0xa5
  RIP: 0033:0x7f94455f942d
  RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
  RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d
  RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b
  RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60
  R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0
  R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560
  Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b
  RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50

Fix it by moving the initialization of the cursor below the repeat
label.  While at it, rename it to @next for readability.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS")
Cc: stable@vger.kernel.org # v4.14+
Reported-by: Laura Abbott <labbott@redhat.com>
Reported-by: Bronek Kozicki <brok@incorrekt.com>
Reported-by: George Amanakis <gamanakis@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f4c2f8cb5748..2cf06c274e4c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
-	struct list_head *l = it->task_pos;
+	struct list_head *next;
 
 	lockdep_assert_held(&css_set_lock);
-	WARN_ON_ONCE(!l);
-
 repeat:
 	/*
 	 * Advance iterator to find next entry.  cset->tasks is consumed
 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 	 * next cset.
 	 */
-	l = l->next;
+	next = it->task_pos->next;
 
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
+	if (next == it->tasks_head)
+		next = it->mg_tasks_head->next;
 
-	if (l == it->mg_tasks_head)
+	if (next == it->mg_tasks_head)
 		css_task_iter_advance_css_set(it);
 	else
-		it->task_pos = l;
+		it->task_pos = next;
 
 	/* if PROCS, skip over tasks which aren't group leaders */
 	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&

From 4423c18e466afdfb02a36ee8b9f901d144b3c607 Mon Sep 17 00:00:00 2001
From: Yelena Krivosheev <yelena@marvell.com>
Date: Tue, 19 Dec 2017 17:59:45 +0100
Subject: [PATCH 312/808] net: mvneta: clear interface link status on port
 disable

When port connect to PHY in polling mode (with poll interval 1 sec),
port and phy link status must be synchronize in order don't loss link
change event.

[gregory.clement@free-electrons.com: add fixes tag]
Cc: <stable@vger.kernel.org>
Fixes: c5aff18204da ("net: mvneta: driver for Marvell Armada 370/XP network unit")
Signed-off-by: Yelena Krivosheev <yelena@marvell.com>
Tested-by: Dmitri Epshtein <dima@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index bc93b69cfd1e..16b2bfb2cf51 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1214,6 +1214,10 @@ static void mvneta_port_disable(struct mvneta_port *pp)
 	val &= ~MVNETA_GMAC0_PORT_ENABLE;
 	mvreg_write(pp, MVNETA_GMAC_CTRL_0, val);
 
+	pp->link = 0;
+	pp->duplex = -1;
+	pp->speed = 0;
+
 	udelay(200);
 }
 

From ca5902a6547f662419689ca28b3c29a772446caa Mon Sep 17 00:00:00 2001
From: Yelena Krivosheev <yelena@marvell.com>
Date: Tue, 19 Dec 2017 17:59:46 +0100
Subject: [PATCH 313/808] net: mvneta: use proper rxq_number in loop on rx
 queues

When adding the RX queue association with each CPU, a typo was made in
the mvneta_cleanup_rxqs() function. This patch fixes it.

[gregory.clement@free-electrons.com: add commit log and fixes tag]
Cc: stable@vger.kernel.org
Fixes: 2dcf75e2793c ("net: mvneta: Associate RX queues with each CPU")
Signed-off-by: Yelena Krivosheev <yelena@marvell.com>
Tested-by: Dmitri Epshtein <dima@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 16b2bfb2cf51..1e0835655c93 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -3015,7 +3015,7 @@ static void mvneta_cleanup_rxqs(struct mvneta_port *pp)
 {
 	int queue;
 
-	for (queue = 0; queue < txq_number; queue++)
+	for (queue = 0; queue < rxq_number; queue++)
 		mvneta_rxq_deinit(pp, &pp->rxqs[queue]);
 }
 

From 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8 Mon Sep 17 00:00:00 2001
From: Yelena Krivosheev <yelena@marvell.com>
Date: Tue, 19 Dec 2017 17:59:47 +0100
Subject: [PATCH 314/808] net: mvneta: eliminate wrong call to handle rx
 descriptor error

There are few reasons in mvneta_rx_swbm() function when received packet
is dropped. mvneta_rx_error() should be called only if error bit [16]
is set in rx descriptor.

[gregory.clement@free-electrons.com: add fixes tag]
Cc: stable@vger.kernel.org
Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management")
Signed-off-by: Yelena Krivosheev <yelena@marvell.com>
Tested-by: Dmitri Epshtein <dima@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 1e0835655c93..a539263cd79c 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1962,9 +1962,9 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 
 		if (!mvneta_rxq_desc_is_first_last(rx_status) ||
 		    (rx_status & MVNETA_RXD_ERR_SUMMARY)) {
+			mvneta_rx_error(pp, rx_desc);
 err_drop_frame:
 			dev->stats.rx_errors++;
-			mvneta_rx_error(pp, rx_desc);
 			/* leave the descriptor untouched */
 			continue;
 		}

From 21b5944350052d2583e82dd59b19a9ba94a007f0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 19 Dec 2017 11:27:56 -0600
Subject: [PATCH 315/808] net: Fix double free and memory corruption in
 get_net_ns_by_id()

(I can trivially verify that that idr_remove in cleanup_net happens
 after the network namespace count has dropped to zero --EWB)

Function get_net_ns_by_id() does not check for net::count
after it has found a peer in netns_ids idr.

It may dereference a peer, after its count has already been
finaly decremented. This leads to double free and memory
corruption:

put_net(peer)                                   rtnl_lock()
atomic_dec_and_test(&peer->count) [count=0]     ...
__put_net(peer)                                 get_net_ns_by_id(net, id)
  spin_lock(&cleanup_list_lock)
  list_add(&net->cleanup_list, &cleanup_list)
  spin_unlock(&cleanup_list_lock)
queue_work()                                      peer = idr_find(&net->netns_ids, id)
  |                                               get_net(peer) [count=1]
  |                                               ...
  |                                               (use after final put)
  v                                               ...
  cleanup_net()                                   ...
    spin_lock(&cleanup_list_lock)                 ...
    list_replace_init(&cleanup_list, ..)          ...
    spin_unlock(&cleanup_list_lock)               ...
    ...                                           ...
    ...                                           put_net(peer)
    ...                                             atomic_dec_and_test(&peer->count) [count=0]
    ...                                               spin_lock(&cleanup_list_lock)
    ...                                               list_add(&net->cleanup_list, &cleanup_list)
    ...                                               spin_unlock(&cleanup_list_lock)
    ...                                             queue_work()
    ...                                           rtnl_unlock()
    rtnl_lock()                                   ...
    for_each_net(tmp) {                           ...
      id = __peernet2id(tmp, peer)                ...
      spin_lock_irq(&tmp->nsid_lock)              ...
      idr_remove(&tmp->netns_ids, id)             ...
      ...                                         ...
      net_drop_ns()                               ...
	net_free(peer)                            ...
    }                                             ...
  |
  v
  cleanup_net()
    ...
    (Second free of peer)

Also, put_net() on the right cpu may reorder with left's cpu
list_replace_init(&cleanup_list, ..), and then cleanup_list
will be corrupted.

Since cleanup_net() is executed in worker thread, while
put_net(peer) can happen everywhere, there should be
enough time for concurrent get_net_ns_by_id() to pick
the peer up, and the race does not seem to be unlikely.
The patch fixes the problem in standard way.

(Also, there is possible problem in peernet2id_alloc(), which requires
check for net::count under nsid_lock and maybe_get_net(peer), but
in current stable kernel it's used under rtnl_lock() and it has to be
safe. Openswitch begun to use peernet2id_alloc(), and possibly it should
be fixed too. While this is not in stable kernel yet, so I'll send
a separate message to netdev@ later).

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Fixes: 0c7aecd4bde4 "netns: add rtnl cmd to add and get peer netns ids"
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b797832565d3..60a71be75aea 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -267,7 +267,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
 	spin_lock_bh(&net->nsid_lock);
 	peer = idr_find(&net->netns_ids, id);
 	if (peer)
-		get_net(peer);
+		peer = maybe_get_net(peer);
 	spin_unlock_bh(&net->nsid_lock);
 	rcu_read_unlock();
 

From 102740bd9436a3a6ba129af3a48271d794009fa5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 19 Dec 2017 13:32:13 -0800
Subject: [PATCH 316/808] cls_bpf: fix offload assumptions after callback
 conversion

cls_bpf used to take care of tracking what offload state a filter
is in, i.e. it would track if offload request succeeded or not.
This information would then be used to issue correct requests to
the driver, e.g. requests for statistics only on offloaded filters,
removing only filters which were offloaded, using add instead of
replace if previous filter was not added etc.

This tracking of offload state no longer functions with the new
callback infrastructure.  There could be multiple entities trying
to offload the same filter.

Throw out all the tracking and corresponding commands and simply
pass to the drivers both old and new bpf program.  Drivers will
have to deal with offload state tracking by themselves.

Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 +--
 include/net/pkt_cls.h                         |  5 +-
 net/sched/cls_bpf.c                           | 93 ++++++++-----------
 3 files changed, 43 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index e379b78e86ef..a4cf62ba4604 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -110,16 +110,10 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 		return -EOPNOTSUPP;
 	}
 
-	switch (cls_bpf->command) {
-	case TC_CLSBPF_REPLACE:
-		return nfp_net_bpf_offload(nn, cls_bpf->prog, true);
-	case TC_CLSBPF_ADD:
-		return nfp_net_bpf_offload(nn, cls_bpf->prog, false);
-	case TC_CLSBPF_DESTROY:
-		return nfp_net_bpf_offload(nn, NULL, true);
-	default:
+	if (cls_bpf->command != TC_CLSBPF_OFFLOAD)
 		return -EOPNOTSUPP;
-	}
+
+	return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog);
 }
 
 static int nfp_bpf_setup_tc_block(struct net_device *netdev,
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..8e08b6da72f3 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -694,9 +694,7 @@ struct tc_cls_matchall_offload {
 };
 
 enum tc_clsbpf_command {
-	TC_CLSBPF_ADD,
-	TC_CLSBPF_REPLACE,
-	TC_CLSBPF_DESTROY,
+	TC_CLSBPF_OFFLOAD,
 	TC_CLSBPF_STATS,
 };
 
@@ -705,6 +703,7 @@ struct tc_cls_bpf_offload {
 	enum tc_clsbpf_command command;
 	struct tcf_exts *exts;
 	struct bpf_prog *prog;
+	struct bpf_prog *oldprog;
 	const char *name;
 	bool exts_integrated;
 	u32 gen_flags;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6fe798c2df1a..8d78e7f4ecc3 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -42,7 +42,6 @@ struct cls_bpf_prog {
 	struct list_head link;
 	struct tcf_result res;
 	bool exts_integrated;
-	bool offloaded;
 	u32 gen_flags;
 	struct tcf_exts exts;
 	u32 handle;
@@ -148,33 +147,37 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
 }
 
 static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
-			       enum tc_clsbpf_command cmd)
+			       struct cls_bpf_prog *oldprog)
 {
-	bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE;
 	struct tcf_block *block = tp->chain->block;
-	bool skip_sw = tc_skip_sw(prog->gen_flags);
 	struct tc_cls_bpf_offload cls_bpf = {};
+	struct cls_bpf_prog *obj;
+	bool skip_sw;
 	int err;
 
+	skip_sw = prog && tc_skip_sw(prog->gen_flags);
+	obj = prog ?: oldprog;
+
 	tc_cls_common_offload_init(&cls_bpf.common, tp);
-	cls_bpf.command = cmd;
-	cls_bpf.exts = &prog->exts;
-	cls_bpf.prog = prog->filter;
-	cls_bpf.name = prog->bpf_name;
-	cls_bpf.exts_integrated = prog->exts_integrated;
-	cls_bpf.gen_flags = prog->gen_flags;
+	cls_bpf.command = TC_CLSBPF_OFFLOAD;
+	cls_bpf.exts = &obj->exts;
+	cls_bpf.prog = prog ? prog->filter : NULL;
+	cls_bpf.oldprog = oldprog ? oldprog->filter : NULL;
+	cls_bpf.name = obj->bpf_name;
+	cls_bpf.exts_integrated = obj->exts_integrated;
+	cls_bpf.gen_flags = obj->gen_flags;
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
-	if (addorrep) {
+	if (prog) {
 		if (err < 0) {
-			cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
+			cls_bpf_offload_cmd(tp, oldprog, prog);
 			return err;
 		} else if (err > 0) {
 			prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
 		}
 	}
 
-	if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
+	if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
 		return -EINVAL;
 
 	return 0;
@@ -183,38 +186,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 			   struct cls_bpf_prog *oldprog)
 {
-	struct cls_bpf_prog *obj = prog;
-	enum tc_clsbpf_command cmd;
-	bool skip_sw;
-	int ret;
+	if (prog && oldprog && prog->gen_flags != oldprog->gen_flags)
+		return -EINVAL;
 
-	skip_sw = tc_skip_sw(prog->gen_flags) ||
-		(oldprog && tc_skip_sw(oldprog->gen_flags));
+	if (prog && tc_skip_hw(prog->gen_flags))
+		prog = NULL;
+	if (oldprog && tc_skip_hw(oldprog->gen_flags))
+		oldprog = NULL;
+	if (!prog && !oldprog)
+		return 0;
 
-	if (oldprog && oldprog->offloaded) {
-		if (!tc_skip_hw(prog->gen_flags)) {
-			cmd = TC_CLSBPF_REPLACE;
-		} else if (!tc_skip_sw(prog->gen_flags)) {
-			obj = oldprog;
-			cmd = TC_CLSBPF_DESTROY;
-		} else {
-			return -EINVAL;
-		}
-	} else {
-		if (tc_skip_hw(prog->gen_flags))
-			return skip_sw ? -EINVAL : 0;
-		cmd = TC_CLSBPF_ADD;
-	}
-
-	ret = cls_bpf_offload_cmd(tp, obj, cmd);
-	if (ret)
-		return ret;
-
-	obj->offloaded = true;
-	if (oldprog)
-		oldprog->offloaded = false;
-
-	return 0;
+	return cls_bpf_offload_cmd(tp, prog, oldprog);
 }
 
 static void cls_bpf_stop_offload(struct tcf_proto *tp,
@@ -222,25 +204,26 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp,
 {
 	int err;
 
-	if (!prog->offloaded)
-		return;
-
-	err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
-	if (err) {
+	err = cls_bpf_offload_cmd(tp, NULL, prog);
+	if (err)
 		pr_err("Stopping hardware offload failed: %d\n", err);
-		return;
-	}
-
-	prog->offloaded = false;
 }
 
 static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 					 struct cls_bpf_prog *prog)
 {
-	if (!prog->offloaded)
-		return;
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_bpf_offload cls_bpf = {};
 
-	cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
+	tc_cls_common_offload_init(&cls_bpf.common, tp);
+	cls_bpf.command = TC_CLSBPF_STATS;
+	cls_bpf.exts = &prog->exts;
+	cls_bpf.prog = prog->filter;
+	cls_bpf.name = prog->bpf_name;
+	cls_bpf.exts_integrated = prog->exts_integrated;
+	cls_bpf.gen_flags = prog->gen_flags;
+
+	tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
 }
 
 static int cls_bpf_init(struct tcf_proto *tp)

From d3f89b98e391475419ae2d8834813d3ecbb48f67 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 19 Dec 2017 13:32:14 -0800
Subject: [PATCH 317/808] nfp: bpf: keep track of the offloaded program

After TC offloads were converted to callbacks we have no choice
but keep track of the offloaded filter in the driver.

The check for nn->dp.bpf_offload_xdp was a stop gap solution
to make sure failed TC offload won't disable XDP, it's no longer
necessary.  nfp_net_bpf_offload() will return -EBUSY on
TC vs XDP conflicts.

Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 47 +++++++++++++++++--
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  8 ++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index a4cf62ba4604..13190aa09faf 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -82,10 +82,33 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn)
 	return nfp_net_ebpf_capable(nn) ? "BPF" : "";
 }
 
+static int
+nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
+{
+	int err;
+
+	nn->app_priv = kzalloc(sizeof(struct nfp_bpf_vnic), GFP_KERNEL);
+	if (!nn->app_priv)
+		return -ENOMEM;
+
+	err = nfp_app_nic_vnic_alloc(app, nn, id);
+	if (err)
+		goto err_free_priv;
+
+	return 0;
+err_free_priv:
+	kfree(nn->app_priv);
+	return err;
+}
+
 static void nfp_bpf_vnic_free(struct nfp_app *app, struct nfp_net *nn)
 {
+	struct nfp_bpf_vnic *bv = nn->app_priv;
+
 	if (nn->dp.bpf_offload_xdp)
 		nfp_bpf_xdp_offload(app, nn, NULL);
+	WARN_ON(bv->tc_prog);
+	kfree(bv);
 }
 
 static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
@@ -93,6 +116,9 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 {
 	struct tc_cls_bpf_offload *cls_bpf = type_data;
 	struct nfp_net *nn = cb_priv;
+	struct bpf_prog *oldprog;
+	struct nfp_bpf_vnic *bv;
+	int err;
 
 	if (type != TC_SETUP_CLSBPF ||
 	    !tc_can_offload(nn->dp.netdev) ||
@@ -100,8 +126,6 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	    cls_bpf->common.protocol != htons(ETH_P_ALL) ||
 	    cls_bpf->common.chain_index)
 		return -EOPNOTSUPP;
-	if (nn->dp.bpf_offload_xdp)
-		return -EBUSY;
 
 	/* Only support TC direct action */
 	if (!cls_bpf->exts_integrated ||
@@ -113,7 +137,22 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	if (cls_bpf->command != TC_CLSBPF_OFFLOAD)
 		return -EOPNOTSUPP;
 
-	return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog);
+	bv = nn->app_priv;
+	oldprog = cls_bpf->oldprog;
+
+	/* Don't remove if oldprog doesn't match driver's state */
+	if (bv->tc_prog != oldprog) {
+		oldprog = NULL;
+		if (!cls_bpf->prog)
+			return 0;
+	}
+
+	err = nfp_net_bpf_offload(nn, cls_bpf->prog, oldprog);
+	if (err)
+		return err;
+
+	bv->tc_prog = cls_bpf->prog;
+	return 0;
 }
 
 static int nfp_bpf_setup_tc_block(struct net_device *netdev,
@@ -161,7 +200,7 @@ const struct nfp_app_type app_bpf = {
 
 	.extra_cap	= nfp_bpf_extra_cap,
 
-	.vnic_alloc	= nfp_app_nic_vnic_alloc,
+	.vnic_alloc	= nfp_bpf_vnic_alloc,
 	.vnic_free	= nfp_bpf_vnic_free,
 
 	.setup_tc	= nfp_bpf_setup_tc,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 082a15f6dfb5..57b6043177a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -172,6 +172,14 @@ struct nfp_prog {
 	struct list_head insns;
 };
 
+/**
+ * struct nfp_bpf_vnic - per-vNIC BPF priv structure
+ * @tc_prog:	currently loaded cls_bpf program
+ */
+struct nfp_bpf_vnic {
+	struct bpf_prog *tc_prog;
+};
+
 int nfp_bpf_jit(struct nfp_prog *prog);
 
 extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops;

From 111be883981748acc9a56e855c8336404a8e787c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 20 Dec 2017 11:10:17 -0700
Subject: [PATCH 318/808] block-throttle: avoid double charge

If a bio is throttled and split after throttling, the bio could be
resubmited and enters the throttling again. This will cause part of the
bio to be charged multiple times. If the cgroup has an IO limit, the
double charge will significantly harm the performance. The bio split
becomes quite common after arbitrary bio size change.

To fix this, we always set the BIO_THROTTLED flag if a bio is throttled.
If the bio is cloned/split, we copy the flag to new bio too to avoid a
double charge. However, cloned bio could be directed to a new disk,
keeping the flag be a problem. The observation is we always set new disk
for the bio in this case, so we can clear the flag in bio_set_dev().

This issue exists for a long time, arbitrary bio size change just makes
it worse, so this should go into stable at least since v4.2.

V1-> V2: Not add extra field in bio based on discussion with Tejun

Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: stable@vger.kernel.org
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 2 ++
 block/blk-throttle.c      | 8 +-------
 include/linux/bio.h       | 2 ++
 include/linux/blk_types.h | 9 ++++-----
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 8bfdea58159b..9ef6cf3addb3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_disk = bio_src->bi_disk;
 	bio->bi_partno = bio_src->bi_partno;
 	bio_set_flag(bio, BIO_CLONED);
+	if (bio_flagged(bio_src, BIO_THROTTLED))
+		bio_set_flag(bio, BIO_THROTTLED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 825bc29767e6..d19f416d6101 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2226,13 +2226,7 @@ again:
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out:
-	/*
-	 * As multiple blk-throtls may stack in the same issue path, we
-	 * don't want bios to leave with the flag set.  Clear the flag if
-	 * being issued.
-	 */
-	if (!throttled)
-		bio_clear_flag(bio, BIO_THROTTLED);
+	bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 82f0c8fd7be8..23d29b39f71e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #define bio_set_dev(bio, bdev) 			\
 do {						\
+	if ((bio)->bi_disk != (bdev)->bd_disk)	\
+		bio_clear_flag(bio, BIO_THROTTLED);\
 	(bio)->bi_disk = (bdev)->bd_disk;	\
 	(bio)->bi_partno = (bdev)->bd_partno;	\
 } while (0)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1e628e032da..9e7d8bd776d2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -50,8 +50,6 @@ struct blk_issue_stat {
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct gendisk		*bi_disk;
-	u8			bi_partno;
-	blk_status_t		bi_status;
 	unsigned int		bi_opf;		/* bottom bits req flags,
 						 * top bits REQ_OP. Use
 						 * accessors.
@@ -59,8 +57,8 @@ struct bio {
 	unsigned short		bi_flags;	/* status, etc and bvec pool number */
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
-
-	struct bvec_iter	bi_iter;
+	blk_status_t		bi_status;
+	u8			bi_partno;
 
 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
@@ -74,8 +72,9 @@ struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;
 
-	atomic_t		__bi_remaining;
+	struct bvec_iter	bi_iter;
 
+	atomic_t		__bi_remaining;
 	bio_end_io_t		*bi_end_io;
 
 	void			*bi_private;

From b3cf8528bb21febb650a7ecbf080d0647be40b9f Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Tue, 12 Dec 2017 15:08:21 -0500
Subject: [PATCH 319/808] xen/balloon: Mark unallocated host memory as UNUSABLE

Commit f5775e0b6116 ("x86/xen: discard RAM regions above the maximum
reservation") left host memory not assigned to dom0 as available for
memory hotplug.

Unfortunately this also meant that those regions could be used by
others. Specifically, commit fa564ad96366 ("x86/PCI: Enable a 64bit BAR
on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") may try to map those
addresses as MMIO.

To prevent this mark unallocated host memory as E820_TYPE_UNUSABLE (thus
effectively reverting f5775e0b6116) and keep track of that region as
a hostmem resource that can be used for the hotplug.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/enlighten.c | 81 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/setup.c     |  6 +--
 drivers/xen/balloon.c    | 65 +++++++++++++++++++++++++++-----
 include/xen/balloon.h    |  5 +++
 4 files changed, 144 insertions(+), 13 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d669e9d89001..c9081c6671f0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,8 +1,12 @@
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+#include <linux/bootmem.h>
+#endif
 #include <linux/cpu.h>
 #include <linux/kexec.h>
 
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/interface/memory.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(xen_arch_unregister_cpu);
 #endif
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+void __init arch_xen_balloon_init(struct resource *hostmem_resource)
+{
+	struct xen_memory_map memmap;
+	int rc;
+	unsigned int i, last_guest_ram;
+	phys_addr_t max_addr = PFN_PHYS(max_pfn);
+	struct e820_table *xen_e820_table;
+	const struct e820_entry *entry;
+	struct resource *res;
+
+	if (!xen_initial_domain())
+		return;
+
+	xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL);
+	if (!xen_e820_table)
+		return;
+
+	memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries);
+	set_xen_guest_handle(memmap.buffer, xen_e820_table->entries);
+	rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap);
+	if (rc) {
+		pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc);
+		goto out;
+	}
+
+	last_guest_ram = 0;
+	for (i = 0; i < memmap.nr_entries; i++) {
+		if (xen_e820_table->entries[i].addr >= max_addr)
+			break;
+		if (xen_e820_table->entries[i].type == E820_TYPE_RAM)
+			last_guest_ram = i;
+	}
+
+	entry = &xen_e820_table->entries[last_guest_ram];
+	if (max_addr >= entry->addr + entry->size)
+		goto out; /* No unallocated host RAM. */
+
+	hostmem_resource->start = max_addr;
+	hostmem_resource->end = entry->addr + entry->size;
+
+	/*
+	 * Mark non-RAM regions between the end of dom0 RAM and end of host RAM
+	 * as unavailable. The rest of that region can be used for hotplug-based
+	 * ballooning.
+	 */
+	for (; i < memmap.nr_entries; i++) {
+		entry = &xen_e820_table->entries[i];
+
+		if (entry->type == E820_TYPE_RAM)
+			continue;
+
+		if (entry->addr >= hostmem_resource->end)
+			break;
+
+		res = kzalloc(sizeof(*res), GFP_KERNEL);
+		if (!res)
+			goto out;
+
+		res->name = "Unavailable host RAM";
+		res->start = entry->addr;
+		res->end = (entry->addr + entry->size < hostmem_resource->end) ?
+			    entry->addr + entry->size : hostmem_resource->end;
+		rc = insert_resource(hostmem_resource, res);
+		if (rc) {
+			pr_warn("%s: Can't insert [%llx - %llx) (%d)\n",
+				__func__, res->start, res->end, rc);
+			kfree(res);
+			goto  out;
+		}
+	}
+
+ out:
+	kfree(xen_e820_table);
+}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ac55c02f98e9..e9011e1ee3de 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -807,7 +807,6 @@ char * __init xen_memory_setup(void)
 	addr = xen_e820_table.entries[0].addr;
 	size = xen_e820_table.entries[0].size;
 	while (i < xen_e820_table.nr_entries) {
-		bool discard = false;
 
 		chunk_size = size;
 		type = xen_e820_table.entries[i].type;
@@ -823,11 +822,10 @@ char * __init xen_memory_setup(void)
 				xen_add_extra_mem(pfn_s, n_pfns);
 				xen_max_p2m_pfn = pfn_s + n_pfns;
 			} else
-				discard = true;
+				type = E820_TYPE_UNUSABLE;
 		}
 
-		if (!discard)
-			xen_align_and_add_e820_region(addr, chunk_size, type);
+		xen_align_and_add_e820_region(addr, chunk_size, type);
 
 		addr += chunk_size;
 		size -= chunk_size;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index f77e499afddd..065f0b607373 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -257,10 +257,25 @@ static void release_memory_resource(struct resource *resource)
 	kfree(resource);
 }
 
+/*
+ * Host memory not allocated to dom0. We can use this range for hotplug-based
+ * ballooning.
+ *
+ * It's a type-less resource. Setting IORESOURCE_MEM will make resource
+ * management algorithms (arch_remove_reservations()) look into guest e820,
+ * which we don't want.
+ */
+static struct resource hostmem_resource = {
+	.name   = "Host RAM",
+};
+
+void __attribute__((weak)) __init arch_xen_balloon_init(struct resource *res)
+{}
+
 static struct resource *additional_memory_resource(phys_addr_t size)
 {
-	struct resource *res;
-	int ret;
+	struct resource *res, *res_hostmem;
+	int ret = -ENOMEM;
 
 	res = kzalloc(sizeof(*res), GFP_KERNEL);
 	if (!res)
@@ -269,13 +284,42 @@ static struct resource *additional_memory_resource(phys_addr_t size)
 	res->name = "System RAM";
 	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-	ret = allocate_resource(&iomem_resource, res,
-				size, 0, -1,
-				PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
-	if (ret < 0) {
-		pr_err("Cannot allocate new System RAM resource\n");
-		kfree(res);
-		return NULL;
+	res_hostmem = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (res_hostmem) {
+		/* Try to grab a range from hostmem */
+		res_hostmem->name = "Host memory";
+		ret = allocate_resource(&hostmem_resource, res_hostmem,
+					size, 0, -1,
+					PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
+	}
+
+	if (!ret) {
+		/*
+		 * Insert this resource into iomem. Because hostmem_resource
+		 * tracks portion of guest e820 marked as UNUSABLE noone else
+		 * should try to use it.
+		 */
+		res->start = res_hostmem->start;
+		res->end = res_hostmem->end;
+		ret = insert_resource(&iomem_resource, res);
+		if (ret < 0) {
+			pr_err("Can't insert iomem_resource [%llx - %llx]\n",
+				res->start, res->end);
+			release_memory_resource(res_hostmem);
+			res_hostmem = NULL;
+			res->start = res->end = 0;
+		}
+	}
+
+	if (ret) {
+		ret = allocate_resource(&iomem_resource, res,
+					size, 0, -1,
+					PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
+		if (ret < 0) {
+			pr_err("Cannot allocate new System RAM resource\n");
+			kfree(res);
+			return NULL;
+		}
 	}
 
 #ifdef CONFIG_SPARSEMEM
@@ -287,6 +331,7 @@ static struct resource *additional_memory_resource(phys_addr_t size)
 			pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n",
 			       pfn, limit);
 			release_memory_resource(res);
+			release_memory_resource(res_hostmem);
 			return NULL;
 		}
 	}
@@ -765,6 +810,8 @@ static int __init balloon_init(void)
 	set_online_page_callback(&xen_online_page);
 	register_memory_notifier(&xen_memory_nb);
 	register_sysctl_table(xen_root);
+
+	arch_xen_balloon_init(&hostmem_resource);
 #endif
 
 #ifdef CONFIG_XEN_PV
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
index 8906361bb50c..d0adfc78dcbd 100644
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -43,3 +43,8 @@ static inline void xen_balloon_init(void)
 {
 }
 #endif
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+struct resource;
+void arch_xen_balloon_init(struct resource *hostmem_resource);
+#endif

From 1c8e77fb361a4a116a41ac1d9819eb79d068735d Mon Sep 17 00:00:00 2001
From: Naresh Kamboju <naresh.kamboju@linaro.org>
Date: Wed, 20 Dec 2017 12:50:22 +0530
Subject: [PATCH 320/808] selftests: net: Adding config fragment CONFIG_NUMA=y

kernel config fragement CONFIG_NUMA=y is need for reuseport_bpf_numa.

Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index e57b4ac40e72..7177bea1fdfa 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -1,3 +1,4 @@
 CONFIG_USER_NS=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_TEST_BPF=m
+CONFIG_NUMA=y

From bb25c3855a12cc58e33cd7ee9b69943790fe35f7 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 20 Dec 2017 11:03:15 +0100
Subject: [PATCH 321/808] tipc: remove joining group member from congested list

When we receive a JOIN message from a peer member, the message may
contain an advertised window value ADV_IDLE that permits removing the
member in question from the tipc_group::congested list. However, since
the removal has been made conditional on that the advertised window is
*not* ADV_IDLE, we miss this case. This has the effect that a sender
sometimes may enter a state of permanent, false, broadcast congestion.

We fix this by unconditinally removing the member from the congested
list before calling tipc_member_update(), which might potentially sort
it into the list again.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index bbc004eaa31a..7ebbdeb2a90e 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -689,10 +689,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			msg_set_grp_bc_seqno(ehdr, m->bc_syncpt);
 			__skb_queue_tail(inputq, m->event_msg);
 		}
-		if (m->window < ADV_IDLE)
-			tipc_group_update_member(m, 0);
-		else
-			list_del_init(&m->congested);
+		list_del_init(&m->congested);
+		tipc_group_update_member(m, 0);
 		return;
 	case GRP_LEAVE_MSG:
 		if (!m)

From ad3cbf61332914711e5f506972b1dc9af8d62146 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Wed, 20 Dec 2017 18:07:18 +0100
Subject: [PATCH 322/808] s390/qeth: fix error handling in checksum cmd
 callback

Make sure to check both return code fields before processing the
response. Otherwise we risk operating on invalid data.

Fixes: c9475369bd2b ("s390/qeth: rework RX/TX checksum offload")
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core_main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 6c815207f4f5..3614df68830f 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -5386,6 +5386,13 @@ out:
 }
 EXPORT_SYMBOL_GPL(qeth_poll);
 
+static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd)
+{
+	if (!cmd->hdr.return_code)
+		cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
+	return cmd->hdr.return_code;
+}
+
 int qeth_setassparms_cb(struct qeth_card *card,
 			struct qeth_reply *reply, unsigned long data)
 {
@@ -6242,7 +6249,7 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card,
 				(struct qeth_checksum_cmd *)reply->param;
 
 	QETH_CARD_TEXT(card, 4, "chkdoccb");
-	if (cmd->hdr.return_code)
+	if (qeth_setassparms_inspect_rc(cmd))
 		return 0;
 
 	memset(chksum_cb, 0, sizeof(*chksum_cb));

From b4681c2829e24943aadd1a7bb3a30d41d0a20050 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 20 Dec 2017 19:34:19 +0200
Subject: [PATCH 323/808] ipv4: Fix use-after-free when flushing FIB tables

Since commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") the
local table uses the same trie allocated for the main table when custom
rules are not in use.

When a net namespace is dismantled, the main table is flushed and freed
(via an RCU callback) before the local table. In case the callback is
invoked before the local table is iterated, a use-after-free can occur.

Fix this by iterating over the FIB tables in reverse order, so that the
main table is always freed after the local table.

v3: Reworded comment according to Alex's suggestion.
v2: Add a comment to make the fix more explicit per Dave's and Alex's
feedback.

Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f52d27a422c3..08259d078b1c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1298,14 +1298,19 @@ err_table_hash_alloc:
 
 static void ip_fib_net_exit(struct net *net)
 {
-	unsigned int i;
+	int i;
 
 	rtnl_lock();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
 	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
 #endif
-	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+	/* Destroy the tables in reverse order to guarantee that the
+	 * local table, ID 255, is destroyed before the main table, ID
+	 * 254. This is necessary as the local table may contain
+	 * references to data contained in the main table.
+	 */
+	for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
 		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
 		struct hlist_node *tmp;
 		struct fib_table *tb;

From 4ccafe032005e9b96acbef2e389a4de5b1254add Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Dec 2017 13:13:58 -0700
Subject: [PATCH 324/808] block: unalign call_single_data in struct request

A previous change blindly added massive alignment to the
call_single_data structure in struct request. This ballooned it in size
from 296 to 320 bytes on my setup, for no valid reason at all.

Use the unaligned struct __call_single_data variant instead.

Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data")
Cc: stable@vger.kernel.org # v4.14
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 100d0df38026..0ce8a372d506 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t;
 struct request {
 	struct list_head queuelist;
 	union {
-		call_single_data_t csd;
+		struct __call_single_data csd;
 		u64 fifo_time;
 	};
 

From 0864fe09ab90ab32b7d21fe3cd72df5b5af8492e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Dec 2017 13:14:42 -0700
Subject: [PATCH 325/808] null_blk: unalign call_single_data

Commit 966a967116e6 randomly added alignment to this structure, but
it's actually detrimental to performance of null_blk. Test case:

Running on both the home and remote node shows a ~5% degradation
in performance.

While in there, move blk_status_t to the hole after the integer tag
in the nullb_cmd structure. After this patch, we shrink the size
from 192 to 152 bytes.

Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ccb9975a97fa..ad0477ae820f 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -35,13 +35,13 @@ static inline u64 mb_per_tick(int mbps)
 struct nullb_cmd {
 	struct list_head list;
 	struct llist_node ll_list;
-	call_single_data_t csd;
+	struct __call_single_data csd;
 	struct request *rq;
 	struct bio *bio;
 	unsigned int tag;
+	blk_status_t error;
 	struct nullb_queue *nq;
 	struct hrtimer timer;
-	blk_status_t error;
 };
 
 struct nullb_queue {

From d0729bc6bee797fb4bcca87583af5adbfe79ecfb Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Mon, 11 Dec 2017 21:50:25 +0900
Subject: [PATCH 326/808] arc: do not use __print_symbol()

__print_symbol() uses extra stack space to sprintf() symbol
information and then to feed that buffer to printk()

  char buffer[KSYM_SYMBOL_LEN];

  sprint_symbol(buffer, address);
  printk(fmt, buffer);

Replace __print_symbol() with a direct printk("%pS") call.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c
index 74315f302971..bf40e06f3fb8 100644
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@@ -163,7 +163,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
  */
 static int __print_sym(unsigned int address, void *unused)
 {
-	__print_symbol("  %s\n", address);
+	printk("  %pS\n", (void *)address);
 	return 0;
 }
 

From c18fc9071762769acb4040cabae45c817aefc537 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Tue, 5 Dec 2017 13:19:38 +0300
Subject: [PATCH 327/808] ARC: [plat-hsdk] Switch DisplayLink driver from fbdev
 to DRM

Currently there're 2 different implementations of the driver for
DisplayLink USB2.0-to-HDMI/DVI adapters: older FBDEV and modern true
DRM.

We initially decided to use FBDEV version just because with it
/dev/fbX is usable from user-space while in DRM version
with DRM_FBDEV_EMULATION user-space cannot draw anything on a real
screen, for more info read [1].

But today /dev/fbX is not that important as more and more software
projects switch to use of DRI (/dev/dri/cardX).

But what's even more important DRM driver allows building of complicated
graphics processing chains. The most important for us is rendering of
3D on a dedicated GPU while outputting video through a simpler
bitstreamer like DisplayLink. So let's use much more future-proof
driver from now on.

[1] https://lists.freedesktop.org/archives/dri-devel/2017-December/159519.html

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/configs/hsdk_defconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index 7b8f8faf8a24..ac6b0ed8341e 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -49,10 +49,11 @@ CONFIG_SERIAL_8250_DW=y
 CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
+CONFIG_DRM=y
+# CONFIG_DRM_FBDEV_EMULATION is not set
+CONFIG_DRM_UDL=y
 CONFIG_FB=y
-CONFIG_FB_UDL=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_USB=y
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_HCD_PLATFORM=y
 CONFIG_USB_OHCI_HCD=y

From a08c832f277d7a6f9d3b341a5d5df2f5576220d8 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:15 +0300
Subject: [PATCH 328/808] ARC: [plat-hsdk]: Set initial core pll output
 frequency

Set initial core pll output frequency specified in device tree to
1GHz. It will be applied at the core pll driver probing.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/boot/dts/hsdk.dts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts
index 8f627c200d60..006aa3de5348 100644
--- a/arch/arc/boot/dts/hsdk.dts
+++ b/arch/arc/boot/dts/hsdk.dts
@@ -114,6 +114,14 @@
 			reg = <0x00 0x10>, <0x14B8 0x4>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 1GHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <1000000000>;
 		};
 
 		serial: serial@5000 {

From 7bde846d0957fb81ac0bf8c4e2cab284a1da34e0 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:16 +0300
Subject: [PATCH 329/808] ARC: [plat-hsdk]: Get rid of core pll frequency set
 in platform code

Get rid of core pll frequency set in platform code as we set it via
device tree using 'assigned-clock-rates' property.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/plat-hsdk/platform.c | 42 -----------------------------------
 1 file changed, 42 deletions(-)

diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c
index fd0ae5e38639..2958aedb649a 100644
--- a/arch/arc/plat-hsdk/platform.c
+++ b/arch/arc/plat-hsdk/platform.c
@@ -38,42 +38,6 @@ static void __init hsdk_init_per_cpu(unsigned int cpu)
 #define CREG_PAE		(CREG_BASE + 0x180)
 #define CREG_PAE_UPDATE		(CREG_BASE + 0x194)
 
-#define CREG_CORE_IF_CLK_DIV	(CREG_BASE + 0x4B8)
-#define CREG_CORE_IF_CLK_DIV_2	0x1
-#define CGU_BASE		ARC_PERIPHERAL_BASE
-#define CGU_PLL_STATUS		(ARC_PERIPHERAL_BASE + 0x4)
-#define CGU_PLL_CTRL		(ARC_PERIPHERAL_BASE + 0x0)
-#define CGU_PLL_STATUS_LOCK	BIT(0)
-#define CGU_PLL_STATUS_ERR	BIT(1)
-#define CGU_PLL_CTRL_1GHZ	0x3A10
-#define HSDK_PLL_LOCK_TIMEOUT	500
-
-#define HSDK_PLL_LOCKED() \
-	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK)
-
-#define HSDK_PLL_ERR() \
-	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR)
-
-static void __init hsdk_set_cpu_freq_1ghz(void)
-{
-	u32 timeout = HSDK_PLL_LOCK_TIMEOUT;
-
-	/*
-	 * As we set cpu clock which exceeds 500MHz, the divider for the interface
-	 * clock must be programmed to div-by-2.
-	 */
-	iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV);
-
-	/* Set cpu clock to 1GHz */
-	iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL);
-
-	while (!HSDK_PLL_LOCKED() && timeout--)
-		cpu_relax();
-
-	if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR())
-		pr_err("Failed to setup CPU frequency to 1GHz!");
-}
-
 #define SDIO_BASE		(ARC_PERIPHERAL_BASE + 0xA000)
 #define SDIO_UHS_REG_EXT	(SDIO_BASE + 0x108)
 #define SDIO_UHS_REG_EXT_DIV_2	(2 << 30)
@@ -98,12 +62,6 @@ static void __init hsdk_init_early(void)
 	 * minimum possible div-by-2.
 	 */
 	iowrite32(SDIO_UHS_REG_EXT_DIV_2, (void __iomem *) SDIO_UHS_REG_EXT);
-
-	/*
-	 * Setup CPU frequency to 1GHz.
-	 * TODO: remove it after smart hsdk pll driver will be introduced.
-	 */
-	hsdk_set_cpu_freq_1ghz();
 }
 
 static const char *hsdk_compat[] __initconst = {

From fbd1cec57064aa1380726ec899c49fcd84e702b9 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:17 +0300
Subject: [PATCH 330/808] ARC: [plat-axs103]: Set initial core pll output
 frequency

Set initial core pll output frequency specified in device tree to
100MHz for SMP configuration and 90MHz for UP configuration.
It will be applied at the core pll driver probing.

Update platform quirk for decreasing core frequency for quad core
configuration.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/boot/dts/axc003.dtsi     | 8 ++++++++
 arch/arc/boot/dts/axc003_idu.dtsi | 8 ++++++++
 arch/arc/plat-axs10x/axs10x.c     | 8 ++------
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 4e6e9f57e790..dc91c663bcc0 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -35,6 +35,14 @@
 			reg = <0x80 0x10>, <0x100 0x10>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 90MHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <90000000>;
 		};
 
 		core_intc: archs-intc@cpu {
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 63954a8b0100..69ff4895f2ba 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -35,6 +35,14 @@
 			reg = <0x80 0x10>, <0x100 0x10>;
 			#clock-cells = <0>;
 			clocks = <&input_clk>;
+
+			/*
+			 * Set initial core pll output frequency to 100MHz.
+			 * It will be applied at the core pll driver probing
+			 * on early boot.
+			 */
+			assigned-clocks = <&core_clk>;
+			assigned-clock-rates = <100000000>;
 		};
 
 		core_intc: archs-intc@cpu {
diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c
index f1ac6790da5f..ac1a712f6f1f 100644
--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -320,22 +320,18 @@ static void __init axs103_early_init(void)
 	unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
 	if (num_cores > 2) {
 		u32 freq = 50, orig;
-		/*
-		 * TODO: use cpu node "cpu-freq" param instead of platform-specific
-		 * "/cpu_card/core_clk" as it works only if we use fixed-clock for cpu.
-		 */
 		int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk");
 		const struct fdt_property *prop;
 
 		prop = fdt_get_property(initial_boot_params, off,
-					"clock-frequency", NULL);
+					"assigned-clock-rates", NULL);
 		orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000;
 
 		/* Patching .dtb in-place with new core clock value */
 		if (freq != orig ) {
 			freq = cpu_to_be32(freq * 1000000);
 			fdt_setprop_inplace(initial_boot_params, off,
-					    "clock-frequency", &freq, sizeof(freq));
+					    "assigned-clock-rates", &freq, sizeof(freq));
 		}
 	}
 #endif

From d7de73b586b2db540187ff8a077330fa1a8efd64 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Date: Sat, 9 Dec 2017 16:59:18 +0300
Subject: [PATCH 331/808] ARC: [plat-axs103] refactor the quad core DT quirk
 code

Refactor the quad core DT quirk code:
get rid of waste division and multiplication by 1000000 constant.

Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/plat-axs10x/axs10x.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c
index ac1a712f6f1f..46544e88492d 100644
--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -317,19 +317,21 @@ static void __init axs103_early_init(void)
 	 * Instead of duplicating defconfig/DT for SMP/QUAD, add a small hack
 	 * of fudging the freq in DT
 	 */
+#define AXS103_QUAD_CORE_CPU_FREQ_HZ	50000000
+
 	unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
 	if (num_cores > 2) {
-		u32 freq = 50, orig;
+		u32 freq;
 		int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk");
 		const struct fdt_property *prop;
 
 		prop = fdt_get_property(initial_boot_params, off,
 					"assigned-clock-rates", NULL);
-		orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000;
+		freq = be32_to_cpu(*(u32 *)(prop->data));
 
 		/* Patching .dtb in-place with new core clock value */
-		if (freq != orig ) {
-			freq = cpu_to_be32(freq * 1000000);
+		if (freq != AXS103_QUAD_CORE_CPU_FREQ_HZ) {
+			freq = cpu_to_be32(AXS103_QUAD_CORE_CPU_FREQ_HZ);
 			fdt_setprop_inplace(initial_boot_params, off,
 					    "assigned-clock-rates", &freq, sizeof(freq));
 		}

From 79435ac78d160e4c245544d457850a56f805ac0d Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 8 Dec 2017 08:26:58 -0800
Subject: [PATCH 332/808] ARC: uaccess: dont use "l" gcc inline asm constraint
 modifier

This used to setup the LP_COUNT register automatically, but now has been
removed.

There was an earlier fix 3c7c7a2fc8811 which fixed instance in delay.h but
somehow missed this one as gcc change had not made its way into
production toolchains and was not pedantic as it is now !

Cc: stable@vger.kernel.org
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/uaccess.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index f35974ee7264..c9173c02081c 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
 		return 0;
 
 	__asm__ __volatile__(
+	"	mov	lp_count, %5		\n"
 	"	lp	3f			\n"
 	"1:	ldb.ab  %3, [%2, 1]		\n"
 	"	breq.d	%3, 0, 3f               \n"
@@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
 	"	.word   1b, 4b			\n"
 	"	.previous			\n"
 	: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
-	: "g"(-EFAULT), "l"(count)
-	: "memory");
+	: "g"(-EFAULT), "r"(count)
+	: "lp_count", "lp_start", "lp_end", "memory");
 
 	return res;
 }

From 24c0df82ef7919e4d10cf2e4e65d368eb2e8ea21 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 12:01:21 +0100
Subject: [PATCH 333/808] netfilter: nf_tables: fix chain filter in
 nf_tables_dump_rules()

ctx->chain may be null now that we have very large object names,
so we cannot check for ctx->chain[0] here.

Fixes: b7263e071aba7 ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Phil Sutter <phil@nwl.cc>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 10798b357481..8d4526651661 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2072,7 +2072,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 				continue;
 
 			list_for_each_entry_rcu(chain, &table->chains, list) {
-				if (ctx && ctx->chain[0] &&
+				if (ctx && ctx->chain &&
 				    strcmp(ctx->chain, chain->name) != 0)
 					continue;
 

From f5a16b93e6291ba1f65f55647cb4cd8d75ed1b35 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Wed, 20 Dec 2017 12:37:54 -0800
Subject: [PATCH 334/808] ARC: handle gcc generated __builtin_trap()

gcc toggle -fisolate-erroneous-paths-dereference (default at -O2
onwards) isolates faulty code paths such as null pointer access, divide
by zero etc by emitting __builtin_trap()

Newer ARC gcc generates TRAP_S 5 instruction which needs to be handled
and treated like any other unexpected exception
  - user mode  : task terminated with a SEGV
  - kernel mode: die() called after register and stack dump

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/traps.c        | 6 ++++++
 arch/arc/kernel/troubleshoot.c | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index bcd7c9fc5d0f..004f4e4a4c10 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -83,6 +83,7 @@ DO_ERROR_INFO(SIGILL, "Illegal Insn (or Seq)", insterror_is_error, ILL_ILLOPC)
 DO_ERROR_INFO(SIGBUS, "Invalid Mem Access", __weak do_memory_error, BUS_ADRERR)
 DO_ERROR_INFO(SIGTRAP, "Breakpoint Set", trap_is_brkpt, TRAP_BRKPT)
 DO_ERROR_INFO(SIGBUS, "Misaligned Access", do_misaligned_error, BUS_ADRALN)
+DO_ERROR_INFO(SIGSEGV, "gcc generated __builtin_trap", do_trap5_error, 0)
 
 /*
  * Entry Point for Misaligned Data access Exception, for emulating in software
@@ -115,6 +116,8 @@ void do_machine_check_fault(unsigned long address, struct pt_regs *regs)
  * Thus TRAP_S <n> can be used for specific purpose
  *  -1 used for software breakpointing (gdb)
  *  -2 used by kprobes
+ *  -5 __builtin_trap() generated by gcc (2018.03 onwards) for toggle such as
+ *     -fno-isolate-erroneous-paths-dereference
  */
 void do_non_swi_trap(unsigned long address, struct pt_regs *regs)
 {
@@ -134,6 +137,9 @@ void do_non_swi_trap(unsigned long address, struct pt_regs *regs)
 		kgdb_trap(regs);
 		break;
 
+	case 5:
+		do_trap5_error(address, regs);
+		break;
 	default:
 		break;
 	}
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 7d8c1d6c2f60..6e9a0a9a6a04 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -163,6 +163,9 @@ static void show_ecr_verbose(struct pt_regs *regs)
 		else
 			pr_cont("Bus Error, check PRM\n");
 #endif
+	} else if (vec == ECR_V_TRAP) {
+		if (regs->ecr_param == 5)
+			pr_cont("gcc generated __builtin_trap\n");
 	} else {
 		pr_cont("Check Programmer's Manual\n");
 	}

From 91aae6be4139b9e3902656d819e6af66e051bd7a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 14 Dec 2017 15:42:22 -0800
Subject: [PATCH 335/808] xfs: track cowblocks separately in i_flags

The EOFBLOCKS/COWBLOCKS tags are totally separate things, so track them
with separate i_flags.  Right now we're abusing IEOFBLOCKS for both,
which is totally bogus because we won't tag the inode with COWBLOCKS if
IEOFBLOCKS was set by a previous tagging of the inode with EOFBLOCKS.
Found by wiring up clonerange to fsstress in xfs/017.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_icache.c | 33 ++++++++++++++++++++++++---------
 fs/xfs/xfs_inode.h  |  1 +
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 43005fbe8b1e..58d2d4253c8e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1536,8 +1536,23 @@ xfs_inode_free_quota_eofblocks(
 	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
 }
 
+static inline unsigned long
+xfs_iflag_for_tag(
+	int		tag)
+{
+	switch (tag) {
+	case XFS_ICI_EOFBLOCKS_TAG:
+		return XFS_IEOFBLOCKS;
+	case XFS_ICI_COWBLOCKS_TAG:
+		return XFS_ICOWBLOCKS;
+	default:
+		ASSERT(0);
+		return 0;
+	}
+}
+
 static void
-__xfs_inode_set_eofblocks_tag(
+__xfs_inode_set_blocks_tag(
 	xfs_inode_t	*ip,
 	void		(*execute)(struct xfs_mount *mp),
 	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -1552,10 +1567,10 @@ __xfs_inode_set_eofblocks_tag(
 	 * Don't bother locking the AG and looking up in the radix trees
 	 * if we already know that we have the tag set.
 	 */
-	if (ip->i_flags & XFS_IEOFBLOCKS)
+	if (ip->i_flags & xfs_iflag_for_tag(tag))
 		return;
 	spin_lock(&ip->i_flags_lock);
-	ip->i_flags |= XFS_IEOFBLOCKS;
+	ip->i_flags |= xfs_iflag_for_tag(tag);
 	spin_unlock(&ip->i_flags_lock);
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@ -1587,13 +1602,13 @@ xfs_inode_set_eofblocks_tag(
 	xfs_inode_t	*ip)
 {
 	trace_xfs_inode_set_eofblocks_tag(ip);
-	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
 			trace_xfs_perag_set_eofblocks,
 			XFS_ICI_EOFBLOCKS_TAG);
 }
 
 static void
-__xfs_inode_clear_eofblocks_tag(
+__xfs_inode_clear_blocks_tag(
 	xfs_inode_t	*ip,
 	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
 				    int error, unsigned long caller_ip),
@@ -1603,7 +1618,7 @@ __xfs_inode_clear_eofblocks_tag(
 	struct xfs_perag *pag;
 
 	spin_lock(&ip->i_flags_lock);
-	ip->i_flags &= ~XFS_IEOFBLOCKS;
+	ip->i_flags &= ~xfs_iflag_for_tag(tag);
 	spin_unlock(&ip->i_flags_lock);
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@ -1630,7 +1645,7 @@ xfs_inode_clear_eofblocks_tag(
 	xfs_inode_t	*ip)
 {
 	trace_xfs_inode_clear_eofblocks_tag(ip);
-	return __xfs_inode_clear_eofblocks_tag(ip,
+	return __xfs_inode_clear_blocks_tag(ip,
 			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
 }
 
@@ -1724,7 +1739,7 @@ xfs_inode_set_cowblocks_tag(
 	xfs_inode_t	*ip)
 {
 	trace_xfs_inode_set_cowblocks_tag(ip);
-	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
 			trace_xfs_perag_set_cowblocks,
 			XFS_ICI_COWBLOCKS_TAG);
 }
@@ -1734,6 +1749,6 @@ xfs_inode_clear_cowblocks_tag(
 	xfs_inode_t	*ip)
 {
 	trace_xfs_inode_clear_cowblocks_tag(ip);
-	return __xfs_inode_clear_eofblocks_tag(ip,
+	return __xfs_inode_clear_blocks_tag(ip,
 			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b2136af9289f..d383e392ec9d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -232,6 +232,7 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
  * log recovery to replay a bmap operation on the inode.
  */
 #define XFS_IRECOVERY		(1 << 11)
+#define XFS_ICOWBLOCKS		(1 << 12)/* has the cowblocks tag set */
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during

From 4374f256ce8182019353c0c639bb8d0695b4c941 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 18 Dec 2017 20:11:53 -0800
Subject: [PATCH 336/808] bpf/verifier: fix bounds calculation on BPF_RSH

Incorrect signed bounds were being computed.
If the old upper signed bound was positive and the old lower signed bound was
negative, this could cause the new upper signed bound to be too low,
leading to security issues.

Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
[jannh@google.com: changed description to reflect bug impact]
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e39b01317b6f..625e358ca765 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2190,20 +2190,22 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
-		/* BPF_RSH is an unsigned shift, so make the appropriate casts */
-		if (dst_reg->smin_value < 0) {
-			if (umin_val) {
-				/* Sign bit will be cleared */
-				dst_reg->smin_value = 0;
-			} else {
-				/* Lost sign bit information */
-				dst_reg->smin_value = S64_MIN;
-				dst_reg->smax_value = S64_MAX;
-			}
-		} else {
-			dst_reg->smin_value =
-				(u64)(dst_reg->smin_value) >> umax_val;
-		}
+		/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
+		 * be negative, then either:
+		 * 1) src_reg might be zero, so the sign bit of the result is
+		 *    unknown, so we lose our signed bounds
+		 * 2) it's known negative, thus the unsigned bounds capture the
+		 *    signed bounds
+		 * 3) the signed bounds cross zero, so they tell us nothing
+		 *    about the result
+		 * If the value in dst_reg is known nonnegative, then again the
+		 * unsigned bounts capture the signed bounds.
+		 * Thus, in all cases it suffices to blow away our signed bounds
+		 * and rely on inferring new ones from the unsigned bounds and
+		 * var_off of the result.
+		 */
+		dst_reg->smin_value = S64_MIN;
+		dst_reg->smax_value = S64_MAX;
 		if (src_known)
 			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
 						       umin_val);

From 95a762e2c8c942780948091f8f2a4f32fce1ac6f Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:54 -0800
Subject: [PATCH 337/808] bpf: fix incorrect sign extension in check_alu_op()

Distinguish between
BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit)
and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit);
only perform sign extension in the first case.

Starting with v4.14, this is exploitable by unprivileged users as long as
the unprivileged_bpf_disabled sysctl isn't set.

Debian assigned CVE-2017-16995 for this issue.

v3:
 - add CVE number (Ben Hutchings)

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 625e358ca765..c086010ae51e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2408,7 +2408,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 * remember the value we stored into this reg
 			 */
 			regs[insn->dst_reg].type = SCALAR_VALUE;
-			__mark_reg_known(regs + insn->dst_reg, insn->imm);
+			if (BPF_CLASS(insn->code) == BPF_ALU64) {
+				__mark_reg_known(regs + insn->dst_reg,
+						 insn->imm);
+			} else {
+				__mark_reg_known(regs + insn->dst_reg,
+						 (u32)insn->imm);
+			}
 		}
 
 	} else if (opcode > BPF_END) {

From 0c17d1d2c61936401f4702e1846e2c19b200f958 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:55 -0800
Subject: [PATCH 338/808] bpf: fix incorrect tracking of register size
 truncation

Properly handle register truncation to a smaller size.

The old code first mirrors the clearing of the high 32 bits in the bitwise
tristate representation, which is correct. But then, it computes the new
arithmetic bounds as the intersection between the old arithmetic bounds and
the bounds resulting from the bitwise tristate representation. Therefore,
when coerce_reg_to_32() is called on a number with bounds
[0xffff'fff8, 0x1'0000'0007], the verifier computes
[0xffff'fff8, 0xffff'ffff] as bounds of the truncated number.
This is incorrect: The truncated number could also be in the range [0, 7],
and no meaningful arithmetic bounds can be computed in that case apart from
the obvious [0, 0xffff'ffff].

Starting with v4.14, this is exploitable by unprivileged users as long as
the unprivileged_bpf_disabled sysctl isn't set.

Debian assigned CVE-2017-16996 for this issue.

v2:
 - flip the mask during arithmetic bounds calculation (Ben Hutchings)
v3:
 - add CVE number (Ben Hutchings)

Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c086010ae51e..f716bdf29dd0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1067,6 +1067,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+	u64 mask;
+
+	/* clear high bits in bit representation */
+	reg->var_off = tnum_cast(reg->var_off, size);
+
+	/* fix arithmetic bounds */
+	mask = ((u64)1 << (size * 8)) - 1;
+	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+		reg->umin_value &= mask;
+		reg->umax_value &= mask;
+	} else {
+		reg->umin_value = 0;
+		reg->umax_value = mask;
+	}
+	reg->smin_value = reg->umin_value;
+	reg->smax_value = reg->umax_value;
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1223,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
 	    regs[value_regno].type == SCALAR_VALUE) {
 		/* b/h/w load zero-extends, mark upper bits as known 0 */
-		regs[value_regno].var_off =
-			tnum_cast(regs[value_regno].var_off, size);
-		__update_reg_bounds(&regs[value_regno]);
+		coerce_reg_to_size(&regs[value_regno], size);
 	}
 	return err;
 }
@@ -1772,14 +1793,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	return 0;
 }
 
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
-	/* clear high 32 bits */
-	reg->var_off = tnum_cast(reg->var_off, 4);
-	/* Update bounds */
-	__update_reg_bounds(reg);
-}
-
 static bool signed_add_overflows(s64 a, s64 b)
 {
 	/* Do the add in u64, where overflow is well-defined */
@@ -2017,8 +2030,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops are (32,32)->64 */
-		coerce_reg_to_32(dst_reg);
-		coerce_reg_to_32(&src_reg);
+		coerce_reg_to_size(dst_reg, 4);
+		coerce_reg_to_size(&src_reg, 4);
 	}
 	smin_val = src_reg.smin_value;
 	smax_val = src_reg.smax_value;
@@ -2398,10 +2411,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 					return -EACCES;
 				}
 				mark_reg_unknown(env, regs, insn->dst_reg);
-				/* high 32 bits are known zero. */
-				regs[insn->dst_reg].var_off = tnum_cast(
-						regs[insn->dst_reg].var_off, 4);
-				__update_reg_bounds(&regs[insn->dst_reg]);
+				coerce_reg_to_size(&regs[insn->dst_reg], 4);
 			}
 		} else {
 			/* case: R = imm

From 468f6eafa6c44cb2c5d8aad35e12f06c240a812a Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:56 -0800
Subject: [PATCH 339/808] bpf: fix 32-bit ALU op verification

32-bit ALU ops operate on 32-bit values and have 32-bit outputs.
Adjust the verifier accordingly.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f716bdf29dd0..ecdc265244ca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2017,6 +2017,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
 static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_insn *insn,
 				      struct bpf_reg_state *dst_reg,
@@ -2027,12 +2031,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	bool src_known, dst_known;
 	s64 smin_val, smax_val;
 	u64 umin_val, umax_val;
+	u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
 
-	if (BPF_CLASS(insn->code) != BPF_ALU64) {
-		/* 32-bit ALU ops are (32,32)->64 */
-		coerce_reg_to_size(dst_reg, 4);
-		coerce_reg_to_size(&src_reg, 4);
-	}
 	smin_val = src_reg.smin_value;
 	smax_val = src_reg.smax_value;
 	umin_val = src_reg.umin_value;
@@ -2168,9 +2168,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	case BPF_LSH:
-		if (umax_val > 63) {
-			/* Shifts greater than 63 are undefined.  This includes
-			 * shifts by a negative number.
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
 			 */
 			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
@@ -2196,9 +2196,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	case BPF_RSH:
-		if (umax_val > 63) {
-			/* Shifts greater than 63 are undefined.  This includes
-			 * shifts by a negative number.
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
 			 */
 			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
@@ -2234,6 +2234,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		break;
 	}
 
+	if (BPF_CLASS(insn->code) != BPF_ALU64) {
+		/* 32-bit ALU ops are (32,32)->32 */
+		coerce_reg_to_size(dst_reg, 4);
+		coerce_reg_to_size(&src_reg, 4);
+	}
+
 	__reg_deduce_bounds(dst_reg);
 	__reg_bound_offset(dst_reg);
 	return 0;

From ea25f914dc164c8d56b36147ecc86bc65f83c469 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:57 -0800
Subject: [PATCH 340/808] bpf: fix missing error return in
 check_stack_boundary()

Prevent indirect stack accesses at non-constant addresses, which would
permit reading and corrupting spilled pointers.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ecdc265244ca..77e4b5223867 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1303,6 +1303,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
 		verbose(env, "invalid variable stack read R%d var_off=%s\n",
 			regno, tn_buf);
+		return -EACCES;
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||

From a5ec6ae161d72f01411169a938fa5f8baea16e8f Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:58 -0800
Subject: [PATCH 341/808] bpf: force strict alignment checks for stack pointers

Force strict alignment checks for stack pointers because the tracking of
stack spills relies on it; unaligned stack accesses can lead to corruption
of spilled registers, which is exploitable.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 77e4b5223867..102c519836f6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1059,6 +1059,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		break;
 	case PTR_TO_STACK:
 		pointer_desc = "stack ";
+		/* The stack spill tracking logic in check_stack_write()
+		 * and check_stack_read() relies on stack accesses being
+		 * aligned.
+		 */
+		strict = true;
 		break;
 	default:
 		break;

From 179d1c5602997fef5a940c6ddcf31212cbfebd14 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:11:59 -0800
Subject: [PATCH 342/808] bpf: don't prune branches when a scalar is replaced
 with a pointer

This could be made safe by passing through a reference to env and checking
for env->allow_ptr_leaks, but it would only work one way and is probably
not worth the hassle - not doing it will not directly lead to program
rejection.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 102c519836f6..982bd9ec721a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3467,15 +3467,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return range_within(rold, rcur) &&
 			       tnum_in(rold->var_off, rcur->var_off);
 		} else {
-			/* if we knew anything about the old value, we're not
-			 * equal, because we can't know anything about the
-			 * scalar value of the pointer in the new value.
+			/* We're trying to use a pointer in place of a scalar.
+			 * Even if the scalar was unbounded, this could lead to
+			 * pointer leaks because scalars are allowed to leak
+			 * while pointers are not. We could make this safe in
+			 * special cases if root is calling us, but it's
+			 * probably not worth the hassle.
 			 */
-			return rold->umin_value == 0 &&
-			       rold->umax_value == U64_MAX &&
-			       rold->smin_value == S64_MIN &&
-			       rold->smax_value == S64_MAX &&
-			       tnum_is_unknown(rold->var_off);
+			return false;
 		}
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and

From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 18 Dec 2017 20:12:00 -0800
Subject: [PATCH 343/808] bpf: fix integer overflows

There were various issues related to the limited size of integers used in
the verifier:
 - `off + size` overflow in __check_map_access()
 - `off + reg->off` overflow in check_mem_access()
 - `off + reg->var_off.value` overflow or 32-bit truncation of
   `reg->var_off.value` in check_mem_access()
 - 32-bit truncation in check_stack_boundary()

Make sure that any integer math cannot overflow by not allowing
pointer math with large values.

Also reduce the scope of "scalar op scalar" tracking.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  4 +--
 kernel/bpf/verifier.c        | 48 ++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c561b986bab0..1632bb13ad8a 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -15,11 +15,11 @@
  * In practice this is far bigger than any realistic pointer offset; this limit
  * ensures that umax_value + (int)off + (int)size cannot overflow a u64.
  */
-#define BPF_MAX_VAR_OFF	(1ULL << 31)
+#define BPF_MAX_VAR_OFF	(1 << 29)
 /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO].  This ensures
  * that converting umax_value to int cannot overflow.
  */
-#define BPF_MAX_VAR_SIZ	INT_MAX
+#define BPF_MAX_VAR_SIZ	(1 << 29)
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 982bd9ec721a..86dfe6b5c243 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1819,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
 	return res > a;
 }
 
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+				  const struct bpf_reg_state *reg,
+				  enum bpf_reg_type type)
+{
+	bool known = tnum_is_const(reg->var_off);
+	s64 val = reg->var_off.value;
+	s64 smin = reg->smin_value;
+
+	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+		verbose(env, "math between %s pointer and %lld is not allowed\n",
+			reg_type_str[type], val);
+		return false;
+	}
+
+	if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+		verbose(env, "%s pointer offset %d is not allowed\n",
+			reg_type_str[type], reg->off);
+		return false;
+	}
+
+	if (smin == S64_MIN) {
+		verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+			reg_type_str[type]);
+		return false;
+	}
+
+	if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+		verbose(env, "value %lld makes %s pointer be out of bounds\n",
+			smin, reg_type_str[type]);
+		return false;
+	}
+
+	return true;
+}
+
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
  * Caller should also handle BPF_MOV case separately.
  * If we return -EACCES, caller may want to try again treating pointer as a
@@ -1887,6 +1922,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg->type = ptr_reg->type;
 	dst_reg->id = ptr_reg->id;
 
+	if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+		return -EINVAL;
+
 	switch (opcode) {
 	case BPF_ADD:
 		/* We can take a fixed offset as long as it doesn't overflow
@@ -2017,6 +2056,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+		return -EINVAL;
+
 	__update_reg_bounds(dst_reg);
 	__reg_deduce_bounds(dst_reg);
 	__reg_bound_offset(dst_reg);
@@ -2046,6 +2088,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	src_known = tnum_is_const(src_reg.var_off);
 	dst_known = tnum_is_const(dst_reg->var_off);
 
+	if (!src_known &&
+	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+		__mark_reg_unknown(dst_reg);
+		return 0;
+	}
+
 	switch (opcode) {
 	case BPF_ADD:
 		if (signed_add_overflows(dst_reg->smin_value, smin_val) ||

From 2255f8d520b0a318fc6d387d0940854b2f522a7f Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Dec 2017 20:12:01 -0800
Subject: [PATCH 344/808] selftests/bpf: add tests for recent bugfixes

These tests should cover the following cases:

 - MOV with both zero-extended and sign-extended immediates
 - implicit truncation of register contents via ALU32/MOV32
 - implicit 32-bit truncation of ALU32 output
 - oversized register source operand for ALU32 shift
 - right-shift of a number that could be positive or negative
 - map access where adding the operation size to the offset causes signed
   32-bit overflow
 - direct stack access at a ~4GiB offset

Also remove the F_LOAD_WITH_STRICT_ALIGNMENT flag from a bunch of tests
that should fail independent of what flags userspace passes.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 549 +++++++++++++++++++-
 1 file changed, 533 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index b03ecfd7185b..961c1426fbf2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -606,7 +606,6 @@ static struct bpf_test tests[] = {
 		},
 		.errstr = "misaligned stack access",
 		.result = REJECT,
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"invalid map_fd for function call",
@@ -1797,7 +1796,6 @@ static struct bpf_test tests[] = {
 		},
 		.result = REJECT,
 		.errstr = "misaligned stack access off (0x0; 0x0)+-8+2 size 8",
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"PTR_TO_STACK store/load - bad alignment on reg",
@@ -1810,7 +1808,6 @@ static struct bpf_test tests[] = {
 		},
 		.result = REJECT,
 		.errstr = "misaligned stack access off (0x0; 0x0)+-10+8 size 8",
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"PTR_TO_STACK store/load - out of bounds low",
@@ -6324,7 +6321,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6348,7 +6345,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6374,7 +6371,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R8 invalid mem access 'inv'",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6399,7 +6396,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R8 invalid mem access 'inv'",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6447,7 +6444,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6518,7 +6515,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6569,7 +6566,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6596,7 +6593,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6622,7 +6619,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6651,7 +6648,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6681,7 +6678,7 @@ static struct bpf_test tests[] = {
 			BPF_JMP_IMM(BPF_JA, 0, 0, -7),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6709,8 +6706,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr_unpriv = "R0 pointer comparison prohibited",
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 		.result_unpriv = REJECT,
 	},
@@ -6765,6 +6761,462 @@ static struct bpf_test tests[] = {
 		.errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
 		.result = REJECT,
 	},
+	{
+		"bounds check based on zero-extended MOV",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0x0000'0000'ffff'ffff */
+			BPF_MOV32_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check based on sign-extended MOV. test1",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0xffff'ffff'ffff'ffff */
+			BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0xffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+			/* r0 = <oob pointer> */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access to OOB pointer */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 4294967295",
+		.result = REJECT
+	},
+	{
+		"bounds check based on sign-extended MOV. test2",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0xffff'ffff'ffff'ffff */
+			BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0xfff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36),
+			/* r0 = <oob pointer> */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access to OOB pointer */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 min value is outside of the array range",
+		.result = REJECT
+	},
+	{
+		"bounds check based on reg_off + var_off + insn_off. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 29) - 1),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.errstr = "value_size=8 off=1073741825",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"bounds check based on reg_off + var_off + insn_off. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 30) - 1),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.errstr = "value 1073741823",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"bounds check after truncation of non-boundary-crossing range",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			/* r2 = 0x10'0000'0000 */
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 36),
+			/* r1 = [0x10'0000'0000, 0x10'0000'00ff] */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+			/* r1 = [0x10'7fff'ffff, 0x10'8000'00fe] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			/* r1 = [0x00, 0xff] */
+			BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 0x7fffffff),
+			/* r1 = 0 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check after truncation of boundary-crossing range (1)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0x1'0000'007f] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0xffff'ffff] or
+			 *      [0x0000'0000, 0x0000'007f]
+			 */
+			BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0x00, 0xff] or
+			 *      [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = 0 or
+			 *      [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op or OOB pointer computation */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		/* not actually fully unbounded, but the bound is very high */
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check after truncation of boundary-crossing range (2)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0x1'0000'007f] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0xffff'ffff] or
+			 *      [0x0000'0000, 0x0000'007f]
+			 * difference to previous test: truncation via MOV32
+			 * instead of ALU32.
+			 */
+			BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0x00, 0xff] or
+			 *      [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = 0 or
+			 *      [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op or OOB pointer computation */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		/* not actually fully unbounded, but the bound is very high */
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check after wrapping 32-bit addition",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			/* r1 = 0x7fff'ffff */
+			BPF_MOV64_IMM(BPF_REG_1, 0x7fffffff),
+			/* r1 = 0xffff'fffe */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			/* r1 = 0 */
+			BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 2),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check after shift with oversized count operand",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_IMM(BPF_REG_2, 32),
+			BPF_MOV64_IMM(BPF_REG_1, 1),
+			/* r1 = (u32)1 << (u32)32 = ? */
+			BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_2),
+			/* r1 = [0x0000, 0xffff] */
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xffff),
+			/* computes unknown pointer, potentially OOB */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 max value is outside of the array range",
+		.result = REJECT
+	},
+	{
+		"bounds check after right shift of maybe-negative number",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			/* r1 = [-0x01, 0xfe] */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),
+			/* r1 = 0 or 0xff'ffff'ffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* r1 = 0 or 0xffff'ffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* computes unknown pointer, potentially OOB */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test1",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7ffffffe),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 2147483646",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test2",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "pointer offset 1073741822",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test3",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "pointer offset -1073741822",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test4",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_1, 1000000),
+			BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 1000000),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 1000000000000",
+		.result = REJECT
+	},
+	{
+		"pointer/scalar confusion in state equality check (way 1)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(1),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "R0 leaks addr as return value"
+	},
+	{
+		"pointer/scalar confusion in state equality check (way 2)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+			BPF_JMP_A(1),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "R0 leaks addr as return value"
+	},
 	{
 		"variable-offset ctx access",
 		.insns = {
@@ -6806,6 +7258,71 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_LWT_IN,
 	},
+	{
+		"indirect variable-offset stack access",
+		.insns = {
+			/* Fill the top 8 bytes of the stack */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			/* Get an unknown value */
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+			/* Make it small and 4-byte aligned */
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8),
+			/* add it to fp.  We now have either fp-4 or fp-8, but
+			 * we don't know which
+			 */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+			/* dereference it indirectly */
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 5 },
+		.errstr = "variable stack read R2",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_LWT_IN,
+	},
+	{
+		"direct stack access with 32-bit wraparound. test1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer and 2147483647",
+		.result = REJECT
+	},
+	{
+		"direct stack access with 32-bit wraparound. test2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer and 1073741823",
+		.result = REJECT
+	},
+	{
+		"direct stack access with 32-bit wraparound. test3",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer offset 1073741822",
+		.result = REJECT
+	},
 	{
 		"liveness pruning and write screening",
 		.insns = {

From 82abbf8d2fc46d79611ab58daa7c608df14bb3ee Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 18 Dec 2017 20:15:20 -0800
Subject: [PATCH 345/808] bpf: do not allow root to mangle valid pointers

Do not allow root to convert valid pointers into unknown scalars.
In particular disallow:
 ptr &= reg
 ptr <<= reg
 ptr += ptr
and explicitly allow:
 ptr -= ptr
since pkt_end - pkt == length

1.
This minimizes amount of address leaks root can do.
In the future may need to further tighten the leaks with kptr_restrict.

2.
If program has such pointer math it's likely a user mistake and
when verifier complains about it right away instead of many instructions
later on invalid memory access it's easier for users to fix their progs.

3.
when register holding a pointer cannot change to scalar it allows JITs to
optimize better. Like 32-bit archs could use single register for pointers
instead of a pair required to hold 64-bit scalars.

4.
reduces architecture dependent behavior. Since code:
r1 = r10;
r1 &= 0xff;
if (r1 ...)
will behave differently arm64 vs x64 and offloaded vs native.

A significant chunk of ptr mangling was allowed by
commit f1174f77b50c ("bpf/verifier: rework value tracking")
yet some of it was allowed even earlier.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c                       | 102 +++++++-------------
 tools/testing/selftests/bpf/test_verifier.c |  56 +++++------
 2 files changed, 63 insertions(+), 95 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86dfe6b5c243..04b24876cd23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1890,29 +1890,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
-		if (!env->allow_ptr_leaks)
-			verbose(env,
-				"R%d 32-bit pointer arithmetic prohibited\n",
-				dst);
+		verbose(env,
+			"R%d 32-bit pointer arithmetic prohibited\n",
+			dst);
 		return -EACCES;
 	}
 
 	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+			dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == CONST_PTR_TO_MAP) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+			dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == PTR_TO_PACKET_END) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+			dst);
 		return -EACCES;
 	}
 
@@ -1979,9 +1975,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case BPF_SUB:
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
-			if (!env->allow_ptr_leaks)
-				verbose(env, "R%d tried to subtract pointer from scalar\n",
-					dst);
+			verbose(env, "R%d tried to subtract pointer from scalar\n",
+				dst);
 			return -EACCES;
 		}
 		/* We don't allow subtraction from FP, because (according to
@@ -1989,9 +1984,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * be able to deal with it.
 		 */
 		if (ptr_reg->type == PTR_TO_STACK) {
-			if (!env->allow_ptr_leaks)
-				verbose(env, "R%d subtraction from stack pointer prohibited\n",
-					dst);
+			verbose(env, "R%d subtraction from stack pointer prohibited\n",
+				dst);
 			return -EACCES;
 		}
 		if (known && (ptr_reg->off - smin_val ==
@@ -2040,19 +2034,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case BPF_AND:
 	case BPF_OR:
 	case BPF_XOR:
-		/* bitwise ops on pointers are troublesome, prohibit for now.
-		 * (However, in principle we could allow some cases, e.g.
-		 * ptr &= ~3 which would reduce min_value by 3.)
-		 */
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
-				dst, bpf_alu_string[opcode >> 4]);
+		/* bitwise ops on pointers are troublesome, prohibit. */
+		verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+			dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	default:
 		/* other operators (e.g. MUL,LSH) produce non-pointer results */
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
-				dst, bpf_alu_string[opcode >> 4]);
+		verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+			dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	}
 
@@ -2308,7 +2297,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
-	int rc;
 
 	dst_reg = &regs[insn->dst_reg];
 	src_reg = NULL;
@@ -2319,43 +2307,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		if (src_reg->type != SCALAR_VALUE) {
 			if (dst_reg->type != SCALAR_VALUE) {
 				/* Combining two pointers by any ALU op yields
-				 * an arbitrary scalar.
+				 * an arbitrary scalar. Disallow all math except
+				 * pointer subtraction
 				 */
-				if (!env->allow_ptr_leaks) {
-					verbose(env, "R%d pointer %s pointer prohibited\n",
-						insn->dst_reg,
-						bpf_alu_string[opcode >> 4]);
-					return -EACCES;
+				if (opcode == BPF_SUB){
+					mark_reg_unknown(env, regs, insn->dst_reg);
+					return 0;
 				}
-				mark_reg_unknown(env, regs, insn->dst_reg);
-				return 0;
+				verbose(env, "R%d pointer %s pointer prohibited\n",
+					insn->dst_reg,
+					bpf_alu_string[opcode >> 4]);
+				return -EACCES;
 			} else {
 				/* scalar += pointer
 				 * This is legal, but we have to reverse our
 				 * src/dest handling in computing the range
 				 */
-				rc = adjust_ptr_min_max_vals(env, insn,
-							     src_reg, dst_reg);
-				if (rc == -EACCES && env->allow_ptr_leaks) {
-					/* scalar += unknown scalar */
-					__mark_reg_unknown(&off_reg);
-					return adjust_scalar_min_max_vals(
-							env, insn,
-							dst_reg, off_reg);
-				}
-				return rc;
+				return adjust_ptr_min_max_vals(env, insn,
+							       src_reg, dst_reg);
 			}
 		} else if (ptr_reg) {
 			/* pointer += scalar */
-			rc = adjust_ptr_min_max_vals(env, insn,
-						     dst_reg, src_reg);
-			if (rc == -EACCES && env->allow_ptr_leaks) {
-				/* unknown scalar += scalar */
-				__mark_reg_unknown(dst_reg);
-				return adjust_scalar_min_max_vals(
-						env, insn, dst_reg, *src_reg);
-			}
-			return rc;
+			return adjust_ptr_min_max_vals(env, insn,
+						       dst_reg, src_reg);
 		}
 	} else {
 		/* Pretend the src is a reg with a known value, since we only
@@ -2364,17 +2338,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		off_reg.type = SCALAR_VALUE;
 		__mark_reg_known(&off_reg, insn->imm);
 		src_reg = &off_reg;
-		if (ptr_reg) { /* pointer += K */
-			rc = adjust_ptr_min_max_vals(env, insn,
-						     ptr_reg, src_reg);
-			if (rc == -EACCES && env->allow_ptr_leaks) {
-				/* unknown scalar += K */
-				__mark_reg_unknown(dst_reg);
-				return adjust_scalar_min_max_vals(
-						env, insn, dst_reg, off_reg);
-			}
-			return rc;
-		}
+		if (ptr_reg) /* pointer += K */
+			return adjust_ptr_min_max_vals(env, insn,
+						       ptr_reg, src_reg);
 	}
 
 	/* Got here implies adding two SCALAR_VALUEs */
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 961c1426fbf2..b51017404c62 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -422,9 +422,7 @@ static struct bpf_test tests[] = {
 			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr_unpriv = "R1 subtraction from stack pointer",
-		.result_unpriv = REJECT,
-		.errstr = "R1 invalid mem access",
+		.errstr = "R1 subtraction from stack pointer",
 		.result = REJECT,
 	},
 	{
@@ -1859,9 +1857,8 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.result = ACCEPT,
-		.result_unpriv = REJECT,
-		.errstr_unpriv = "R1 pointer += pointer",
+		.result = REJECT,
+		.errstr = "R1 pointer += pointer",
 	},
 	{
 		"unpriv: neg pointer",
@@ -2589,7 +2586,8 @@ static struct bpf_test tests[] = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 				    offsetof(struct __sk_buff, data)),
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_4),
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
 			BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 49),
 			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 49),
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
@@ -2896,7 +2894,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "invalid access to packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -3882,9 +3880,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3, 11 },
-		.errstr_unpriv = "R0 pointer += pointer",
-		.errstr = "R0 invalid mem access 'inv'",
-		.result_unpriv = REJECT,
+		.errstr = "R0 pointer += pointer",
 		.result = REJECT,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
@@ -3925,7 +3921,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -3946,7 +3942,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -3967,7 +3963,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -5192,10 +5188,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 bitwise operator &= on pointer",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 bitwise operator &= on pointer",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 2",
@@ -5211,10 +5205,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 32-bit pointer arithmetic prohibited",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 32-bit pointer arithmetic prohibited",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 3",
@@ -5230,10 +5222,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 pointer arithmetic with /= operator",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 pointer arithmetic with /= operator",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 4",
@@ -6016,8 +6006,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map_in_map = { 3 },
-		.errstr = "R1 type=inv expected=map_ptr",
-		.errstr_unpriv = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
+		.errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
 		.result = REJECT,
 	},
 	{
@@ -7644,6 +7633,19 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
+	{
+		"pkt_end - pkt_start is allowed",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
 	{
 		"XDP pkt read, pkt_end mangling, bad access 1",
 		.insns = {
@@ -7659,7 +7661,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R1 offset is outside of the packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
@@ -7678,7 +7680,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R1 offset is outside of the packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},

From d1b8b2391c24751e44f618fcf86fb55d9a9247fd Mon Sep 17 00:00:00 2001
From: Cathy Avery <cavery@redhat.com>
Date: Tue, 19 Dec 2017 13:32:48 -0500
Subject: [PATCH 346/808] scsi: storvsc: Fix scsi_cmd error assignments in
 storvsc_handle_error

When an I/O is returned with an srb_status of SRB_STATUS_INVALID_LUN
which has zero good_bytes it must be assigned an error. Otherwise the
I/O will be continuously requeued and will cause a deadlock in the case
where disks are being hot added and removed. sd_probe_async will wait
forever for its I/O to complete while holding scsi_sd_probe_domain.

Also returning the default error of DID_TARGET_FAILURE causes multipath
to not retry the I/O resulting in applications receiving I/O errors
before a failover can occur.

Signed-off-by: Cathy Avery <cavery@redhat.com>
Signed-off-by: Long Li <longli@microsoft.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/storvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 1b06cf0375dc..3b3d1d050cac 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -953,10 +953,11 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
 		case TEST_UNIT_READY:
 			break;
 		default:
-			set_host_byte(scmnd, DID_TARGET_FAILURE);
+			set_host_byte(scmnd, DID_ERROR);
 		}
 		break;
 	case SRB_STATUS_INVALID_LUN:
+		set_host_byte(scmnd, DID_NO_CONNECT);
 		do_work = true;
 		process_err_fn = storvsc_remove_lun;
 		break;

From 4c82fd0abb87e20d0d68ef5237e74732352806c8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Dec 2017 12:08:33 +0100
Subject: [PATCH 347/808] netfilter: uapi: correct UNTRACKED conntrack state
 bit number

nft_ct exposes this bit to userspace.  This used to be

  #define NF_CT_STATE_UNTRACKED_BIT              (1 << (IP_CT_NUMBER + 1))
  (IP_CT_NUMBER is 5, so this was 0x40)

.. but this got changed to 8 (0x100) when the untracked object got removed.
Replace this with a literal 6 to prevent further incompatible changes
in case IP_CT_NUMBER ever increases.

Fixes: cc41c84b7e7f2 ("netfilter: kill the fake untracked conntrack objects")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 3fea7709a441..57ccfb32e87f 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -36,7 +36,7 @@ enum ip_conntrack_info {
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
 #define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
-#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_UNTRACKED + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << 6)
 
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {

From d2a48e52541cdf474ef35d51e8d73ded5be33122 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Wed, 20 Dec 2017 22:54:24 -0800
Subject: [PATCH 348/808] drm: move lease init after validation in
 drm_lease_create

Patch bd36d3bab2e3d08f80766c86487090dbceed4651 fixed a deadlock in the
failure path of drm_lease_create. This made the partially initialized
lease object visible for a short window of time.

To avoid having the lessee state appear transiently, I've rearranged
the code so that the lessor fields are not filled in until the
parameters are all validated and the function will succeed.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20171221065424.1304-1-keithp@keithp.com
---
 drivers/gpu/drm/drm_lease.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c
index 59849f02e2ad..1402c0e71b03 100644
--- a/drivers/gpu/drm/drm_lease.c
+++ b/drivers/gpu/drm/drm_lease.c
@@ -220,17 +220,6 @@ static struct drm_master *drm_lease_create(struct drm_master *lessor, struct idr
 
 	mutex_lock(&dev->mode_config.idr_mutex);
 
-	/* Insert the new lessee into the tree */
-	id = idr_alloc(&(drm_lease_owner(lessor)->lessee_idr), lessee, 1, 0, GFP_KERNEL);
-	if (id < 0) {
-		error = id;
-		goto out_lessee;
-	}
-
-	lessee->lessee_id = id;
-	lessee->lessor = drm_master_get(lessor);
-	list_add_tail(&lessee->lessee_list, &lessor->lessees);
-
 	idr_for_each_entry(leases, entry, object) {
 		error = 0;
 		if (!idr_find(&dev->mode_config.crtc_idr, object))
@@ -246,6 +235,17 @@ static struct drm_master *drm_lease_create(struct drm_master *lessor, struct idr
 		}
 	}
 
+	/* Insert the new lessee into the tree */
+	id = idr_alloc(&(drm_lease_owner(lessor)->lessee_idr), lessee, 1, 0, GFP_KERNEL);
+	if (id < 0) {
+		error = id;
+		goto out_lessee;
+	}
+
+	lessee->lessee_id = id;
+	lessee->lessor = drm_master_get(lessor);
+	list_add_tail(&lessee->lessee_list, &lessor->lessees);
+
 	/* Move the leases over */
 	lessee->leases = *leases;
 	DRM_DEBUG_LEASE("new lessee %d %p, lessor %d %p\n", lessee->lessee_id, lessee, lessor->lessee_id, lessor);

From 9b3fa47d4a76b1d606a396455f9bbeee083ef008 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 13 Dec 2017 15:21:22 -0800
Subject: [PATCH 349/808] kobject: fix suppressing modalias in uevents
 delivered over netlink

The commit 4a336a23d619 ("kobject: copy env blob in one go") optimized
constructing uevent data for delivery over netlink by using the raw
environment buffer, instead of reconstructing it from individual
environment pointers. Unfortunately in doing so it broke suppressing
MODALIAS attribute for KOBJ_UNBIND events, as the code that suppressed this
attribute only adjusted the environment pointers, but left the buffer
itself alone. Let's fix it by making sure the offending attribute is
obliterated form the buffer as well.

Reported-by: Tariq Toukan <tariqt@mellanox.com>
Reported-by: Casey Leedom <leedom@chelsio.com>
Fixes: 4a336a23d619 ("kobject: copy env blob in one go")
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 lib/kobject_uevent.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index c3e84edc47c9..2615074d3de5 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -346,7 +346,8 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 static void zap_modalias_env(struct kobj_uevent_env *env)
 {
 	static const char modalias_prefix[] = "MODALIAS=";
-	int i;
+	size_t len;
+	int i, j;
 
 	for (i = 0; i < env->envp_idx;) {
 		if (strncmp(env->envp[i], modalias_prefix,
@@ -355,11 +356,18 @@ static void zap_modalias_env(struct kobj_uevent_env *env)
 			continue;
 		}
 
-		if (i != env->envp_idx - 1)
-			memmove(&env->envp[i], &env->envp[i + 1],
-				sizeof(env->envp[i]) * env->envp_idx - 1);
+		len = strlen(env->envp[i]) + 1;
+
+		if (i != env->envp_idx - 1) {
+			memmove(env->envp[i], env->envp[i + 1],
+				env->buflen - len);
+
+			for (j = i; j < env->envp_idx - 1; j++)
+				env->envp[j] = env->envp[j + 1] - len;
+		}
 
 		env->envp_idx--;
+		env->buflen -= len;
 	}
 }
 

From 966031f340185eddd05affcf72b740549f056348 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 20 Dec 2017 17:57:06 -0800
Subject: [PATCH 350/808] n_tty: fix EXTPROC vs ICANON interaction with TIOCINQ
 (aka FIONREAD)

We added support for EXTPROC back in 2010 in commit 26df6d13406d ("tty:
Add EXTPROC support for LINEMODE") and the intent was to allow it to
override some (all?) ICANON behavior.  Quoting from that original commit
message:

         There is a new bit in the termios local flag word, EXTPROC.
         When this bit is set, several aspects of the terminal driver
         are disabled.  Input line editing, character echo, and mapping
         of signals are all disabled.  This allows the telnetd to turn
         off these functions when in linemode, but still keep track of
         what state the user wants the terminal to be in.

but the problem turns out that "several aspects of the terminal driver
are disabled" is a bit ambiguous, and you can really confuse the n_tty
layer by setting EXTPROC and then causing some of the ICANON invariants
to no longer be maintained.

This fixes at least one such case (TIOCINQ) becoming unhappy because of
the confusion over whether ICANON really means ICANON when EXTPROC is set.

This basically makes TIOCINQ match the case of read: if EXTPROC is set,
we ignore ICANON.  Also, make sure to reset the ICANON state ie EXTPROC
changes, not just if ICANON changes.

Fixes: 26df6d13406d ("tty: Add EXTPROC support for LINEMODE")
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Reported-by: syzkaller <syzkaller@googlegroups.com>
Cc: Jiri Slaby <jslaby@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 427e0d5d8f13..539b49adb6af 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1762,7 +1762,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 
-	if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
+	if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
 		bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
 		ldata->line_start = ldata->read_tail;
 		if (!L_ICANON(tty) || !read_cnt(ldata)) {
@@ -2425,7 +2425,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
 		return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
 	case TIOCINQ:
 		down_write(&tty->termios_rwsem);
-		if (L_ICANON(tty))
+		if (L_ICANON(tty) && !L_EXTPROC(tty))
 			retval = inq_canon(ldata);
 		else
 			retval = read_cnt(ldata);

From fae1a3e775cca8c3a9e0eb34443b310871a15a92 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Dec 2017 00:49:14 +0100
Subject: [PATCH 351/808] kvm: x86: fix RSM when PCID is non-zero

rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then
CR4 & ~PCIDE, then CR0, then CR4.

However, setting CR4.PCIDE fails if CR3[11:0] != 0.  It's probably easier
in the long run to replace rsm_enter_protected_mode() with an emulator
callback that sets all the special registers (like KVM_SET_SREGS would
do).  For now, set the PCID field of CR3 only after CR4.PCIDE is 1.

Reported-by: Laszlo Ersek <lersek@redhat.com>
Tested-by: Laszlo Ersek <lersek@redhat.com>
Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abe74f779f9d..b514b2b2845a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
 }
 
 static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
-				     u64 cr0, u64 cr4)
+				    u64 cr0, u64 cr3, u64 cr4)
 {
 	int bad;
+	u64 pcid;
+
+	/* In order to later set CR4.PCIDE, CR3[11:0] must be zero.  */
+	pcid = 0;
+	if (cr4 & X86_CR4_PCIDE) {
+		pcid = cr3 & 0xfff;
+		cr3 &= ~0xfff;
+	}
+
+	bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+	if (bad)
+		return X86EMUL_UNHANDLEABLE;
 
 	/*
 	 * First enable PAE, long mode needs it before CR0.PG = 1 is set.
@@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
 		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
 		if (bad)
 			return X86EMUL_UNHANDLEABLE;
+		if (pcid) {
+			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+			if (bad)
+				return X86EMUL_UNHANDLEABLE;
+		}
+
 	}
 
 	return X86EMUL_CONTINUE;
@@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
 	struct desc_struct desc;
 	struct desc_ptr dt;
 	u16 selector;
-	u32 val, cr0, cr4;
+	u32 val, cr0, cr3, cr4;
 	int i;
 
 	cr0 =                      GET_SMSTATE(u32, smbase, 0x7ffc);
-	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+	cr3 =                      GET_SMSTATE(u32, smbase, 0x7ff8);
 	ctxt->eflags =             GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
 	ctxt->_eip =               GET_SMSTATE(u32, smbase, 0x7ff0);
 
@@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
 
 	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
 
-	return rsm_enter_protected_mode(ctxt, cr0, cr4);
+	return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
 }
 
 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
 {
 	struct desc_struct desc;
 	struct desc_ptr dt;
-	u64 val, cr0, cr4;
+	u64 val, cr0, cr3, cr4;
 	u32 base3;
 	u16 selector;
 	int i, r;
@@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
 	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
 
 	cr0 =                       GET_SMSTATE(u64, smbase, 0x7f58);
-	ctxt->ops->set_cr(ctxt, 3,  GET_SMSTATE(u64, smbase, 0x7f50));
+	cr3 =                       GET_SMSTATE(u64, smbase, 0x7f50);
 	cr4 =                       GET_SMSTATE(u64, smbase, 0x7f48);
 	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
 	val =                       GET_SMSTATE(u64, smbase, 0x7ed0);
@@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
 	dt.address =                GET_SMSTATE(u64, smbase, 0x7e68);
 	ctxt->ops->set_gdt(ctxt, &dt);
 
-	r = rsm_enter_protected_mode(ctxt, cr0, cr4);
+	r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
 	if (r != X86EMUL_CONTINUE)
 		return r;
 

From aa12f594f97efe50223611dbd13ecca4e8dafee6 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Thu, 21 Dec 2017 13:03:27 +0100
Subject: [PATCH 352/808] tools/kvm_stat: sort '-f help' output

Sort the fields returned by specifying '-f help' on the command line.
While at it, simplify the code a bit, indent the output and eliminate an
extra blank line at the beginning.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/kvm/kvm_stat/kvm_stat | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 566a70ddd005..a5684d0968b4 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1579,17 +1579,13 @@ def main():
 
     stats = Stats(options)
 
-    if options.fields == "help":
+    if options.fields == 'help':
         stats.fields_filter = None
-        event_list = "\n"
-        s = stats.get()
-        for key in s.keys():
-            if key.find('(') != -1:
-                key = key[0:key.find('(')]
-            if event_list.find('\n' + key + '\n') == -1:
-                event_list += key + '\n'
-        sys.stdout.write(event_list)
-        return ""
+        event_list = []
+        for key in stats.get().keys():
+            event_list.append(key.split('(', 1)[0])
+        sys.stdout.write('  ' + '\n  '.join(sorted(set(event_list))) + '\n')
+        sys.exit(0)
 
     if options.log:
         log(stats)

From 976a9b35d77a9d297cb03154aa61a6214a213b5e Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Wed, 20 Dec 2017 18:17:29 +0100
Subject: [PATCH 353/808] ARM: dts: exynos: Enable Mixer node for Exynos5800
 Peach Pi machine

Commit 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x
Mixer nodes") disabled the Mixer node by default in the DTSI and enabled
for each Exynos 542x DTS. But unfortunately it missed to enable it for the
Exynos5800 Peach Pi machine, since the 5800 is also an 542x SoC variant.

Fixes: 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x Mixer nodes")
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Guillaume Tucker <guillaume.tucker@collabora.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/exynos5800-peach-pi.dts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/exynos5800-peach-pi.dts b/arch/arm/boot/dts/exynos5800-peach-pi.dts
index b2b95ff205e8..0029ec27819c 100644
--- a/arch/arm/boot/dts/exynos5800-peach-pi.dts
+++ b/arch/arm/boot/dts/exynos5800-peach-pi.dts
@@ -664,6 +664,10 @@
 	status = "okay";
 };
 
+&mixer {
+	status = "okay";
+};
+
 /* eMMC flash */
 &mmc_0 {
 	status = "okay";

From d2271826e58b83f9a75634a3f4334082ecf0a02e Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Fri, 15 Dec 2017 16:03:32 +1030
Subject: [PATCH 354/808] ARM: dts: aspeed-g4: Correct VUART IRQ number
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should have always been 8.

Fixes: db4d6d9d80fa ("ARM: dts: aspeed: Correctly order UART nodes")
Cc: stable@vger.kernel.org
Signed-off-by: Joel Stanley <joel@jms.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/aspeed-g4.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/aspeed-g4.dtsi b/arch/arm/boot/dts/aspeed-g4.dtsi
index 45d815a86d42..de08d9045cb8 100644
--- a/arch/arm/boot/dts/aspeed-g4.dtsi
+++ b/arch/arm/boot/dts/aspeed-g4.dtsi
@@ -219,7 +219,7 @@
 				compatible = "aspeed,ast2400-vuart";
 				reg = <0x1e787000 0x40>;
 				reg-shift = <2>;
-				interrupts = <10>;
+				interrupts = <8>;
 				clocks = <&clk_uart>;
 				no-loopback-test;
 				status = "disabled";

From 363e59baa4f76d3f97c0133ff7014cba3d90a7c3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 14 Dec 2017 15:42:59 -0800
Subject: [PATCH 355/808] xfs: don't be so eager to clear the cowblocks tag on
 truncate

Currently, xfs_itruncate_extents clears the cowblocks tag if i_cnextents
is zero.  This is wrong, since i_cnextents only tracks real extents in
the CoW fork, which means that we could have some delayed CoW
reservations still in there that will now never get cleaned.

Fix a further bug where we /don't/ clear the reflink iflag if there are
any attribute blocks -- really, it's only safe to clear the reflink flag
if there are no data fork extents and no cow fork extents.

Found by adding clonerange to fsstress in xfs/017.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_inode.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b41952a4ddd8..6f95bdb408ce 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1487,6 +1487,24 @@ xfs_link(
 	return error;
 }
 
+/* Clear the reflink flag and the cowblocks tag if possible. */
+static void
+xfs_itruncate_clear_reflink_flags(
+	struct xfs_inode	*ip)
+{
+	struct xfs_ifork	*dfork;
+	struct xfs_ifork	*cfork;
+
+	if (!xfs_is_reflink_inode(ip))
+		return;
+	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+	if (cfork->if_bytes == 0)
+		xfs_inode_clear_cowblocks_tag(ip);
+}
+
 /*
  * Free up the underlying blocks past new_size.  The new size must be smaller
  * than the current size.  This routine can be used both for the attribute and
@@ -1583,15 +1601,7 @@ xfs_itruncate_extents(
 	if (error)
 		goto out;
 
-	/*
-	 * Clear the reflink flag if there are no data fork blocks and
-	 * there are no extents staged in the cow fork.
-	 */
-	if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
-		if (ip->i_d.di_nblocks == 0)
-			ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
-		xfs_inode_clear_cowblocks_tag(ip);
-	}
+	xfs_itruncate_clear_reflink_flags(ip);
 
 	/*
 	 * Always re-log the inode so that our permanent transaction can keep

From 10ddf64e420f7f6c1a871bfb4ff2de08faef8235 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 14 Dec 2017 15:46:05 -0800
Subject: [PATCH 356/808] xfs: remove leftover CoW reservations when remounting
 ro

When we're remounting the filesystem readonly, remove all CoW
preallocations prior to going ro.  If the fs goes down after the ro
remount, we never clean up the staging extents, which means xfs_check
will trip over them on a subsequent run.  Practically speaking, the next
mount will clean them up too, so this is unlikely to be seen.  Since we
shut down the cowblocks cleaner on remount-ro, we also have to make sure
we start it back up if/when we remount-rw.

Found by adding clonerange to fsstress and running xfs/017.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_icache.c | 2 +-
 fs/xfs/xfs_icache.h | 1 +
 fs/xfs/xfs_super.c  | 9 +++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 58d2d4253c8e..3861d61fb265 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -870,7 +870,7 @@ xfs_eofblocks_worker(
  * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
  * (We'll just piggyback on the post-EOF prealloc space workqueue.)
  */
-STATIC void
+void
 xfs_queue_cowblocks(
 	struct xfs_mount *mp)
 {
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index bff4d85e5498..d4a77588eca1 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -81,6 +81,7 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
 int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
 void xfs_cowblocks_worker(struct work_struct *);
+void xfs_queue_cowblocks(struct xfs_mount *);
 
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f663022353c0..2db6a40a96bd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1360,6 +1360,7 @@ xfs_fs_remount(
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 			return error;
 		}
+		xfs_queue_cowblocks(mp);
 
 		/* Create the per-AG metadata reservation pool .*/
 		error = xfs_fs_reserve_ag_blocks(mp);
@@ -1369,6 +1370,14 @@ xfs_fs_remount(
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/* Get rid of any leftover CoW reservations... */
+		cancel_delayed_work_sync(&mp->m_cowblocks_work);
+		error = xfs_icache_free_cowblocks(mp, NULL);
+		if (error) {
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
 		/* Free the per-AG metadata reservation pool. */
 		error = xfs_fs_unreserve_ag_blocks(mp);
 		if (error) {

From 86d692bfad1b0097fa866f5fcfa5f5adf4cd82e8 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 14 Dec 2017 15:46:06 -0800
Subject: [PATCH 357/808] xfs: set cowblocks tag for direct cow writes too

If a user performs a direct CoW write, we end up loading the CoW fork
with preallocated extents.  Therefore, we must set the cowblocks tag so
that they can be cleared out if we run low on space.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_reflink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e49e6db415f7..47aea2e82c26 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -454,6 +454,8 @@ retry:
 	if (error)
 		goto out_bmap_cancel;
 
+	xfs_inode_set_cowblocks_tag(ip);
+
 	/* Finish up. */
 	error = xfs_defer_finish(&tp, &dfops);
 	if (error)

From 0525e952dcceb9fc947c6d395de7f72220c7d081 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 7 Dec 2017 19:07:03 -0800
Subject: [PATCH 358/808] xfs: queue deferred rmap ops for cow staging extent
 alloc/free in the right order

Under the deferred rmap operation scheme, there's a certain order in
which the rmap deferred ops have to be queued to maintain integrity
during log replay.  For alloc/map operations that order is cui -> rui;
for free/unmap operations that order is cui -> rui -> efi.  However, the
initial refcount code got the ordering wrong in the free side of things
because it queued refcount free op and an EFI and the refcount free op
queued a rmap free op, resulting in the order cui -> efi -> rui.

If we fail before the efd finishes, the efi recovery will try to do a
wildcard rmap removal and the subsequent rui will fail to find the rmap
and blow up.  This didn't ever happen due to other screws up in handling
unknown owner rmap removals, but those other screw ups broke recovery in
other ways, so fix the ordering to follow the intended rules.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_refcount.c | 52 +++++++++++++-----------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 585b35d34142..c40d26763075 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1488,27 +1488,12 @@ __xfs_refcount_cow_alloc(
 	xfs_extlen_t		aglen,
 	struct xfs_defer_ops	*dfops)
 {
-	int			error;
-
 	trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
 			agbno, aglen);
 
 	/* Add refcount btree reservation */
-	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+	return xfs_refcount_adjust_cow(rcur, agbno, aglen,
 			XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
-	if (error)
-		return error;
-
-	/* Add rmap entry */
-	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
-		error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops,
-				rcur->bc_private.a.agno,
-				agbno, aglen, XFS_RMAP_OWN_COW);
-		if (error)
-			return error;
-	}
-
-	return error;
 }
 
 /*
@@ -1521,27 +1506,12 @@ __xfs_refcount_cow_free(
 	xfs_extlen_t		aglen,
 	struct xfs_defer_ops	*dfops)
 {
-	int			error;
-
 	trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
 			agbno, aglen);
 
 	/* Remove refcount btree reservation */
-	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+	return xfs_refcount_adjust_cow(rcur, agbno, aglen,
 			XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
-	if (error)
-		return error;
-
-	/* Remove rmap entry */
-	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
-		error = xfs_rmap_free_extent(rcur->bc_mp, dfops,
-				rcur->bc_private.a.agno,
-				agbno, aglen, XFS_RMAP_OWN_COW);
-		if (error)
-			return error;
-	}
-
-	return error;
 }
 
 /* Record a CoW staging extent in the refcount btree. */
@@ -1552,11 +1522,19 @@ xfs_refcount_alloc_cow_extent(
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
+	int				error;
+
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
 		return 0;
 
-	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+	error = __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
 			fsb, len);
+	if (error)
+		return error;
+
+	/* Add rmap entry */
+	return xfs_rmap_alloc_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
 }
 
 /* Forget a CoW staging event in the refcount btree. */
@@ -1567,9 +1545,17 @@ xfs_refcount_free_cow_extent(
 	xfs_fsblock_t			fsb,
 	xfs_extlen_t			len)
 {
+	int				error;
+
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
 		return 0;
 
+	/* Remove rmap entry */
+	error = xfs_rmap_free_extent(mp, dfops, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
+	if (error)
+		return error;
+
 	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
 			fsb, len);
 }

From 33df3a9cf925183a6a169bc3eff2bd0febd1298a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 7 Dec 2017 19:07:27 -0800
Subject: [PATCH 359/808] xfs: always honor OWN_UNKNOWN rmap removal requests

Calling xfs_rmap_free with an unknown owner is supposed to remove any
rmaps covering that range regardless of owner.  This is used by the EFI
recovery code to say "we're freeing this, it mustn't be owned by
anything anymore", but for whatever reason xfs_free_ag_extent filters
them out.

Therefore, remove the filter and make xfs_rmap_unmap actually treat it
as a wildcard owner -- free anything that's already there, and if
there's no owner at all then that's fine too.

There are two existing callers of bmap_add_free that take care the rmap
deferred ops themselves and use OWN_UNKNOWN to skip the EFI-based rmap
cleanup; convert these to use OWN_NULL (via helpers), and now we really
require that an RUI (if any) gets added to the defer ops before any EFI.

Lastly, now that xfs_free_extent filters out OWN_NULL rmap free requests,
growfs will have to consult directly with the rmap to ensure that there
aren't any rmaps in the grown region.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_alloc.c |  4 ++--
 fs/xfs/libxfs/xfs_rmap.c  | 25 +++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rmap.h  | 16 +++++++++++++++-
 fs/xfs/xfs_extfree_item.c |  2 +-
 fs/xfs/xfs_fsops.c        |  5 +++++
 5 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 0da80019a917..83ed7715f856 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -702,7 +702,7 @@ xfs_alloc_ag_vextent(
 	ASSERT(args->agbno % args->alignment == 0);
 
 	/* if not file data, insert new block into the reverse map btree */
-	if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
 		error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
 				       args->agbno, args->len, &args->oinfo);
 		if (error)
@@ -1682,7 +1682,7 @@ xfs_free_ag_extent(
 	bno_cur = cnt_cur = NULL;
 	mp = tp->t_mountp;
 
-	if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
 		error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
 		if (error)
 			goto error0;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index dd019cee1b3b..7465cfb39276 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -444,6 +444,30 @@ xfs_rmap_unmap(
 		goto out_done;
 	}
 
+	/*
+	 * If we're doing an unknown-owner removal for EFI recovery, we expect
+	 * to find the full range in the rmapbt or nothing at all.  If we
+	 * don't find any rmaps overlapping either end of the range, we're
+	 * done.  Hopefully this means that the EFI creator already queued
+	 * (and finished) a RUI to remove the rmap.
+	 */
+	if (owner == XFS_RMAP_OWN_UNKNOWN &&
+	    ltrec.rm_startblock + ltrec.rm_blockcount <= bno) {
+		struct xfs_rmap_irec    rtrec;
+
+		error = xfs_btree_increment(cur, 0, &i);
+		if (error)
+			goto out_error;
+		if (i == 0)
+			goto out_done;
+		error = xfs_rmap_get_rec(cur, &rtrec, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+		if (rtrec.rm_startblock >= bno + len)
+			goto out_done;
+	}
+
 	/* Make sure the unwritten flag matches. */
 	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
 			(ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
@@ -664,6 +688,7 @@ xfs_rmap_map(
 		flags |= XFS_RMAP_UNWRITTEN;
 	trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
 			unwritten, oinfo);
+	ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
 
 	/*
 	 * For the initial lookup, look for an exact match or the left-adjacent
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 466ede637080..0fcd5b1ba729 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -61,7 +61,21 @@ static inline void
 xfs_rmap_skip_owner_update(
 	struct xfs_owner_info	*oi)
 {
-	oi->oi_owner = XFS_RMAP_OWN_UNKNOWN;
+	xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL);
+}
+
+static inline bool
+xfs_rmap_should_skip_owner_update(
+	struct xfs_owner_info	*oi)
+{
+	return oi->oi_owner == XFS_RMAP_OWN_NULL;
+}
+
+static inline void
+xfs_rmap_any_owner_update(
+	struct xfs_owner_info	*oi)
+{
+	xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN);
 }
 
 /* Reverse mapping functions. */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 44f8c5451210..64da90655e95 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -538,7 +538,7 @@ xfs_efi_recover(
 		return error;
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
-	xfs_rmap_skip_owner_update(&oinfo);
+	xfs_rmap_any_owner_update(&oinfo);
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
 		extp = &efip->efi_format.efi_extents[i];
 		error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 8f22fc579dbb..60a2e128cb6a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -571,6 +571,11 @@ xfs_growfs_data_private(
 		 * this doesn't actually exist in the rmap btree.
 		 */
 		xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+		error = xfs_rmap_free(tp, bp, agno,
+				be32_to_cpu(agf->agf_length) - new,
+				new, &oinfo);
+		if (error)
+			goto error0;
 		error = xfs_free_extent(tp,
 				XFS_AGB_TO_FSB(mp, agno,
 					be32_to_cpu(agf->agf_length) - new),

From 68c58e9b9a88c1a9d0c2eaf6c7acefb00f5fbbfb Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 7 Dec 2017 19:07:55 -0800
Subject: [PATCH 360/808] xfs: only skip rmap owner checks for unknown-owner
 rmap removal

For rmap removal, refactor the rmap owner checks into a separate
function, then skip the checks if we are performing an unknown-owner
removal.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_rmap.c | 76 +++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 7465cfb39276..50db920ceeeb 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -367,6 +367,51 @@ xfs_rmap_lookup_le_range(
 	return error;
 }
 
+/*
+ * Perform all the relevant owner checks for a removal op.  If we're doing an
+ * unknown-owner removal then we have no owner information to check.
+ */
+static int
+xfs_rmap_free_check_owner(
+	struct xfs_mount	*mp,
+	uint64_t		ltoff,
+	struct xfs_rmap_irec	*rec,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	int			error = 0;
+
+	if (owner == XFS_RMAP_OWN_UNKNOWN)
+		return 0;
+
+	/* Make sure the unwritten flag matches. */
+	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+			(rec->rm_flags & XFS_RMAP_UNWRITTEN), out);
+
+	/* Make sure the owner matches what we expect to find in the tree. */
+	XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out);
+
+	/* Check the offset, if necessary. */
+	if (XFS_RMAP_NON_INODE_OWNER(owner))
+		goto out;
+
+	if (flags & XFS_RMAP_BMBT_BLOCK) {
+		XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK,
+				out);
+	} else {
+		XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+				ltoff + rec->rm_blockcount >= offset + len,
+				out);
+	}
+
+out:
+	return error;
+}
+
 /*
  * Find the extent in the rmap btree and remove it.
  *
@@ -468,33 +513,16 @@ xfs_rmap_unmap(
 			goto out_done;
 	}
 
-	/* Make sure the unwritten flag matches. */
-	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
-			(ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
-
 	/* Make sure the extent we found covers the entire freeing range. */
 	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
-		ltrec.rm_startblock + ltrec.rm_blockcount >=
-		bno + len, out_error);
+			ltrec.rm_startblock + ltrec.rm_blockcount >=
+			bno + len, out_error);
 
-	/* Make sure the owner matches what we expect to find in the tree. */
-	XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner ||
-				    XFS_RMAP_NON_INODE_OWNER(owner), out_error);
-
-	/* Check the offset, if necessary. */
-	if (!XFS_RMAP_NON_INODE_OWNER(owner)) {
-		if (flags & XFS_RMAP_BMBT_BLOCK) {
-			XFS_WANT_CORRUPTED_GOTO(mp,
-					ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK,
-					out_error);
-		} else {
-			XFS_WANT_CORRUPTED_GOTO(mp,
-					ltrec.rm_offset <= offset, out_error);
-			XFS_WANT_CORRUPTED_GOTO(mp,
-					ltoff + ltrec.rm_blockcount >= offset + len,
-					out_error);
-		}
-	}
+	/* Check owner information. */
+	error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner,
+			offset, flags);
+	if (error)
+		goto out_error;
 
 	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
 		/* exact match, simply remove the record from rmap tree */

From 58acfd714e6b02e8617448b431c2b64a2f1f0792 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 20 Dec 2017 12:28:25 +0200
Subject: [PATCH 361/808] ipv6: Honor specified parameters in fibmatch lookup

Currently, parameters such as oif and source address are not taken into
account during fibmatch lookup. Example (IPv4 for reference) before
patch:

$ ip -4 route show
192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1
198.51.100.0/24 dev dummy1 proto kernel scope link src 198.51.100.1

$ ip -6 route show
2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
2001:db8:2::/64 dev dummy1 proto kernel metric 256 pref medium
fe80::/64 dev dummy0 proto kernel metric 256 pref medium
fe80::/64 dev dummy1 proto kernel metric 256 pref medium

$ ip -4 route get fibmatch 192.0.2.2 oif dummy0
192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1
$ ip -4 route get fibmatch 192.0.2.2 oif dummy1
RTNETLINK answers: No route to host

$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0
2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1
2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium

After:

$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0
2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1
RTNETLINK answers: Network is unreachable

The problem stems from the fact that the necessary route lookup flags
are not set based on these parameters.

Instead of duplicating the same logic for fibmatch, we can simply
resolve the original route from its copy and dump it instead.

Fixes: 18c3a61c4264 ("net: ipv6: RTM_GETROUTE: return matched fib result when requested")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2bc91c349273..0458b761f3c5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4298,19 +4298,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		if (!ipv6_addr_any(&fl6.saddr))
 			flags |= RT6_LOOKUP_F_HAS_SADDR;
 
-		if (!fibmatch)
-			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
-		else
-			dst = ip6_route_lookup(net, &fl6, 0);
+		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
 
 		rcu_read_unlock();
 	} else {
 		fl6.flowi6_oif = oif;
 
-		if (!fibmatch)
-			dst = ip6_route_output(net, NULL, &fl6);
-		else
-			dst = ip6_route_lookup(net, &fl6, 0);
+		dst = ip6_route_output(net, NULL, &fl6);
 	}
 
 
@@ -4327,6 +4321,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
+	if (fibmatch && rt->dst.from) {
+		struct rt6_info *ort = container_of(rt->dst.from,
+						    struct rt6_info, dst);
+
+		dst_hold(&ort->dst);
+		ip6_rt_put(rt);
+		rt = ort;
+	}
+
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb) {
 		ip6_rt_put(rt);

From 6d0e4827b72afc71349784336d5eb6df4df106e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 21 Dec 2017 10:01:30 -0700
Subject: [PATCH 362/808] Revert "bdi: add error handle for bdi_debug_register"

This reverts commit a0747a859ef6d3cc5b6cd50eb694499b78dd0025.

It breaks some booting for some users, and more than a week
into this, there's still no good fix. Revert this commit
for now until a solution has been found.

Reported-by: Laura Abbott <labbott@redhat.com>
Reported-by: Bruno Wolff III <bruno@wolff.to>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/backing-dev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 84b2dc76f140..b5f940ce0143 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -882,13 +882,10 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
-	if (bdi_debug_register(bdi, dev_name(dev))) {
-		device_destroy(bdi_class, dev->devt);
-		return -ENOMEM;
-	}
 	cgwb_bdi_register(bdi);
 	bdi->dev = dev;
 
+	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(WB_registered, &bdi->wb.state);
 
 	spin_lock_bh(&bdi_lock);

From 8bc0d7ac934b6f2d0dc8f38a3104d281c9db1e98 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Date: Tue, 19 Dec 2017 22:24:10 -0200
Subject: [PATCH 363/808] i915: Reject CCS modifiers for pipe C on Geminilake

Current code advertises (on the modifiers blob property) support for CCS
modifier for pipe C on GLK, only to reject it later when validating the
request before the atomic commit.

This fixes the tests igt@kms_ccs@pipe-c-*, which should skip on GLK for
pipe C (see bug 104096).

A relevant discussion is archived at:

https://lists.freedesktop.org/archives/intel-gfx/2017-December/150646.html

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104096
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171220002410.5604-1-krisman@collabora.co.uk
(cherry picked from commit f0cbd8bd877f3d8c5b80a6b1add9ca9010d7f9d8)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_display.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index ff9397030092..30cf273d57aa 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -13194,7 +13194,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe)
 	primary->frontbuffer_bit = INTEL_FRONTBUFFER_PRIMARY(pipe);
 	primary->check_plane = intel_check_primary_plane;
 
-	if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) {
+	if (INTEL_GEN(dev_priv) >= 10) {
 		intel_primary_formats = skl_primary_formats;
 		num_formats = ARRAY_SIZE(skl_primary_formats);
 		modifiers = skl_format_modifiers_ccs;

From c48e74736fccf25fb32bb015426359e1c2016e3b Mon Sep 17 00:00:00 2001
From: Eric Garver <e@erig.me>
Date: Wed, 20 Dec 2017 15:09:22 -0500
Subject: [PATCH 364/808] openvswitch: Fix pop_vlan action for double tagged
 frames

skb_vlan_pop() expects skb->protocol to be a valid TPID for double
tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
shift the true ethertype into position for us.

Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
Signed-off-by: Eric Garver <e@erig.me>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index dbe2379329c5..f039064ce922 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -579,6 +579,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 			return -EINVAL;
 
 		skb_reset_network_header(skb);
+		key->eth.type = skb->protocol;
 	} else {
 		eth = eth_hdr(skb);
 		ether_addr_copy(key->eth.src, eth->h_source);
@@ -592,15 +593,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 		if (unlikely(parse_vlan(skb, key)))
 			return -ENOMEM;
 
-		skb->protocol = parse_ethertype(skb);
-		if (unlikely(skb->protocol == htons(0)))
+		key->eth.type = parse_ethertype(skb);
+		if (unlikely(key->eth.type == htons(0)))
 			return -ENOMEM;
 
+		/* Multiple tagged packets need to retain TPID to satisfy
+		 * skb_vlan_pop(), which will later shift the ethertype into
+		 * skb->protocol.
+		 */
+		if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
+			skb->protocol = key->eth.cvlan.tpid;
+		else
+			skb->protocol = key->eth.type;
+
 		skb_reset_network_header(skb);
 		__skb_push(skb, skb->data - skb_mac_header(skb));
 	}
 	skb_reset_mac_len(skb);
-	key->eth.type = skb->protocol;
 
 	/* Network layer. */
 	if (key->eth.type == htons(ETH_P_IP)) {

From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 20 Dec 2017 12:10:21 -0800
Subject: [PATCH 365/808] net: reevalulate autoflowlabel setting after sysctl
 setting

sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2.
If sockopt doesn't set autoflowlabel, outcome packets from the hosts are
supposed to not include flowlabel. This is true for normal packet, but
not for reset packet.

The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if
we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't
changed, so the sock will keep the old behavior in terms of auto
flowlabel. Reset packet is suffering from this problem, because reset
packet is sent from a special control socket, which is created at boot
time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control
socket will always have its ipv6_pinfo.autoflowlabel set, even after
user set sysctl.ipv6.auto_flowlabels to 1, so reset packset will always
have flowlabel. Normal sock created before sysctl setting suffers from
the same issue. We can't even turn off autoflowlabel unless we kill all
socks in the hosts.

To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the
autoflowlabel setting from user, otherwise we always call
ip6_default_np_autolabel() which has the new settings of sysctl.

Note, this changes behavior a little bit. Before commit 42240901f7c4
(ipv6: Implement different admin modes for automatic flow labels), the
autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes,
existing connection will change autoflowlabel behavior. After that
commit, autoflowlabel behavior is sticky in the whole life of the sock.
With this patch, the behavior isn't sticky again.

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Tom Herbert <tom@quantonium.net>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 ++-
 net/ipv6/af_inet6.c      |  1 -
 net/ipv6/ip6_output.c    | 12 ++++++++++--
 net/ipv6/ipv6_sockglue.c |  1 +
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index cb18c6290ca8..8415bf1a9776 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -273,7 +273,8 @@ struct ipv6_pinfo {
 						 * 100: prefer care-of address
 						 */
 				dontfrag:1,
-				autoflowlabel:1;
+				autoflowlabel:1,
+				autoflowlabel_set:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c26f71234b9c..c9441ca45399 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -210,7 +210,6 @@ lookup_protocol:
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->autoflowlabel = ip6_default_np_autolabel(net);
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5110a418cc4d..f7dd51c42314 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
 
+static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+{
+	if (!np->autoflowlabel_set)
+		return ip6_default_np_autolabel(net);
+	else
+		return np->autoflowlabel;
+}
+
 /*
  * xmit an sk_buff (used by TCP, SCTP and DCCP)
  * Note : socket lock is not held for SYNACK packets, but might be modified
@@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 		hlimit = ip6_dst_hoplimit(dst);
 
 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
-						     np->autoflowlabel, fl6));
+				ip6_autoflowlabel(net, np), fl6));
 
 	hdr->payload_len = htons(seg_len);
 	hdr->nexthdr = proto;
@@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
 	ip6_flow_hdr(hdr, v6_cork->tclass,
 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
-					np->autoflowlabel, fl6));
+					ip6_autoflowlabel(net, np), fl6));
 	hdr->hop_limit = v6_cork->hop_limit;
 	hdr->nexthdr = proto;
 	hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b9404feabd78..2d4680e0376f 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -886,6 +886,7 @@ pref_skip_coa:
 		break;
 	case IPV6_AUTOFLOWLABEL:
 		np->autoflowlabel = valbool;
+		np->autoflowlabel_set = 1;
 		retv = 0;
 		break;
 	case IPV6_RECVFRAGSIZE:

From 268b790679422a89e9ab0685d9f291edae780c98 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 20 Dec 2017 17:37:49 -0500
Subject: [PATCH 366/808] skbuff: orphan frags before zerocopy clone

Call skb_zerocopy_clone after skb_orphan_frags, to avoid duplicate
calls to skb_uarg(skb)->callback for the same data.

skb_zerocopy_clone associates skb_shinfo(skb)->uarg from frag_skb
with each segment. This is only safe for uargs that do refcounting,
which is those that pass skb_orphan_frags without dropping their
shared frags. For others, skb_orphan_frags drops the user frags and
sets the uarg to NULL, after which sock_zerocopy_clone has no effect.

Qemu hangs were reported due to duplicate vhost_net_zerocopy_callback
calls for the same data causing the vhost_net_ubuf_ref_>refcount to
drop below zero.

Link: http://lkml.kernel.org/r/<CAF=yD-LWyCD4Y0aJ9O0e_CHLR+3JOeKicRRTEVCPxgw4XOcqGQ@mail.gmail.com>
Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY")
Reported-by: Andreas Hartmann <andihartmann@01019freenet.de>
Reported-by: David Hill <dhill@redhat.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a592ca025fc4..edf40ac0cd07 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3654,8 +3654,6 @@ normal:
 
 		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
 					      SKBTX_SHARED_FRAG;
-		if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
-			goto err;
 
 		while (pos < offset + len) {
 			if (i >= nfrags) {
@@ -3681,6 +3679,8 @@ normal:
 
 			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
 				goto err;
+			if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+				goto err;
 
 			*nskb_frag = *frag;
 			__skb_frag_ref(nskb_frag);

From b90ddd568792bcb0054eaf0f61785c8f80c3bd1c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 20 Dec 2017 17:37:50 -0500
Subject: [PATCH 367/808] skbuff: skb_copy_ubufs must release uarg even without
 user frags

skb_copy_ubufs creates a private copy of frags[] to release its hold
on user frags, then calls uarg->callback to notify the owner.

Call uarg->callback even when no frags exist. This edge case can
happen when zerocopy_sg_from_iter finds enough room in skb_headlen
to copy all the data.

Fixes: 3ece782693c4 ("sock: skb_copy_ubufs support for compound pages")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edf40ac0cd07..a3cb0be4c6f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1178,7 +1178,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	u32 d_off;
 
 	if (!num_frags)
-		return 0;
+		goto release;
 
 	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
 		return -EINVAL;
@@ -1238,6 +1238,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
 	skb_shinfo(skb)->nr_frags = new_frags;
 
+release:
 	skb_zcopy_clear(skb, false);
 	return 0;
 }

From 13b7954c0b8dd2d6382b4ddb5053f09e389d5c6e Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Thu, 14 Dec 2017 17:26:13 -0700
Subject: [PATCH 368/808] libnvdimm, btt: add a couple of missing kernel-doc
 lines

Recent updates to btt.h neglected to add corresponding kernel-doc lines
for new structure members. Add them.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 578c2057524d..884fbbbdd18a 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -125,6 +125,7 @@ struct aligned_lock {
  * @list:		List head for list of arenas
  * @debugfs_dir:	Debugfs dentry
  * @flags:		Arena flags - may signify error states.
+ * @err_lock:		Mutex for synchronizing error clearing.
  *
  * arena_info is a per-arena handle. Once an arena is narrowed down for an
  * IO, this struct is passed around for the duration of the IO.
@@ -176,6 +177,7 @@ struct arena_info {
  * @init_lock:		Mutex used for the BTT initialization
  * @init_state:		Flag describing the initialization state for the BTT
  * @num_arenas:		Number of arenas in the BTT instance
+ * @phys_bb:		Pointer to the namespace's badblocks structure
  */
 struct btt {
 	struct gendisk *btt_disk;

From 24e3a7fb60a9187e5df90e5fa655ffc94b9c4f77 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Mon, 18 Dec 2017 09:28:39 -0700
Subject: [PATCH 369/808] libnvdimm, btt: Fix an incompatibility in the log
 layout

Due to a spec misinterpretation, the Linux implementation of the BTT log
area had different padding scheme from other implementations, such as
UEFI and NVML.

This fixes the padding scheme, and defaults to it for new BTT layouts.
We attempt to detect the padding scheme in use when probing for an
existing BTT. If we detect the older/incompatible scheme, we continue
using it.

Reported-by: Juston Li <juston.li@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Fixes: 5212e11fde4d ("nd_btt: atomic sector updates")
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c | 201 +++++++++++++++++++++++++++++++++++--------
 drivers/nvdimm/btt.h |  45 +++++++++-
 2 files changed, 211 insertions(+), 35 deletions(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e949e3302af4..c586bcdb5190 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -211,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 	return ret;
 }
 
-static int btt_log_read_pair(struct arena_info *arena, u32 lane,
-			struct log_entry *ent)
+static int btt_log_group_read(struct arena_info *arena, u32 lane,
+			struct log_group *log)
 {
 	return arena_read_bytes(arena,
-			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
-			2 * LOG_ENT_SIZE, 0);
+			arena->logoff + (lane * LOG_GRP_SIZE), log,
+			LOG_GRP_SIZE, 0);
 }
 
 static struct dentry *debugfs_root;
@@ -256,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
 	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
 	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
 	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+	debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
+	debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
 }
 
 static void btt_debugfs_init(struct btt *btt)
@@ -274,6 +276,11 @@ static void btt_debugfs_init(struct btt *btt)
 	}
 }
 
+static u32 log_seq(struct log_group *log, int log_idx)
+{
+	return le32_to_cpu(log->ent[log_idx].seq);
+}
+
 /*
  * This function accepts two log entries, and uses the
  * sequence number to find the 'older' entry.
@@ -283,8 +290,10 @@ static void btt_debugfs_init(struct btt *btt)
  *
  * TODO The logic feels a bit kludge-y. make it better..
  */
-static int btt_log_get_old(struct log_entry *ent)
+static int btt_log_get_old(struct arena_info *a, struct log_group *log)
 {
+	int idx0 = a->log_index[0];
+	int idx1 = a->log_index[1];
 	int old;
 
 	/*
@@ -292,23 +301,23 @@ static int btt_log_get_old(struct log_entry *ent)
 	 * the next time, the following logic works out to put this
 	 * (next) entry into [1]
 	 */
-	if (ent[0].seq == 0) {
-		ent[0].seq = cpu_to_le32(1);
+	if (log_seq(log, idx0) == 0) {
+		log->ent[idx0].seq = cpu_to_le32(1);
 		return 0;
 	}
 
-	if (ent[0].seq == ent[1].seq)
+	if (log_seq(log, idx0) == log_seq(log, idx1))
 		return -EINVAL;
-	if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
+	if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
 		return -EINVAL;
 
-	if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
-		if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
+	if (log_seq(log, idx0) < log_seq(log, idx1)) {
+		if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
 			old = 0;
 		else
 			old = 1;
 	} else {
-		if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
+		if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
 			old = 1;
 		else
 			old = 0;
@@ -328,17 +337,18 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 {
 	int ret;
 	int old_ent, ret_ent;
-	struct log_entry log[2];
+	struct log_group log;
 
-	ret = btt_log_read_pair(arena, lane, log);
+	ret = btt_log_group_read(arena, lane, &log);
 	if (ret)
 		return -EIO;
 
-	old_ent = btt_log_get_old(log);
+	old_ent = btt_log_get_old(arena, &log);
 	if (old_ent < 0 || old_ent > 1) {
 		dev_err(to_dev(arena),
 				"log corruption (%d): lane %d seq [%d, %d]\n",
-			old_ent, lane, log[0].seq, log[1].seq);
+				old_ent, lane, log.ent[arena->log_index[0]].seq,
+				log.ent[arena->log_index[1]].seq);
 		/* TODO set error state? */
 		return -EIO;
 	}
@@ -346,7 +356,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane,
 	ret_ent = (old_flag ? old_ent : (1 - old_ent));
 
 	if (ent != NULL)
-		memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
+		memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
 
 	return ret_ent;
 }
@@ -360,17 +370,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane,
 			u32 sub, struct log_entry *ent, unsigned long flags)
 {
 	int ret;
-	/*
-	 * Ignore the padding in log_entry for calculating log_half.
-	 * The entry is 'committed' when we write the sequence number,
-	 * and we want to ensure that that is the last thing written.
-	 * We don't bother writing the padding as that would be extra
-	 * media wear and write amplification
-	 */
-	unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
-	u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
+	u32 group_slot = arena->log_index[sub];
+	unsigned int log_half = LOG_ENT_SIZE / 2;
 	void *src = ent;
+	u64 ns_off;
 
+	ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
+		(group_slot * LOG_ENT_SIZE);
 	/* split the 16B write into atomic, durable halves */
 	ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
 	if (ret)
@@ -453,7 +459,7 @@ static int btt_log_init(struct arena_info *arena)
 {
 	size_t logsize = arena->info2off - arena->logoff;
 	size_t chunk_size = SZ_4K, offset = 0;
-	struct log_entry log;
+	struct log_entry ent;
 	void *zerobuf;
 	int ret;
 	u32 i;
@@ -485,11 +491,11 @@ static int btt_log_init(struct arena_info *arena)
 	}
 
 	for (i = 0; i < arena->nfree; i++) {
-		log.lba = cpu_to_le32(i);
-		log.old_map = cpu_to_le32(arena->external_nlba + i);
-		log.new_map = cpu_to_le32(arena->external_nlba + i);
-		log.seq = cpu_to_le32(LOG_SEQ_INIT);
-		ret = __btt_log_write(arena, i, 0, &log, 0);
+		ent.lba = cpu_to_le32(i);
+		ent.old_map = cpu_to_le32(arena->external_nlba + i);
+		ent.new_map = cpu_to_le32(arena->external_nlba + i);
+		ent.seq = cpu_to_le32(LOG_SEQ_INIT);
+		ret = __btt_log_write(arena, i, 0, &ent, 0);
 		if (ret)
 			goto free;
 	}
@@ -594,6 +600,123 @@ static int btt_freelist_init(struct arena_info *arena)
 	return 0;
 }
 
+static bool ent_is_padding(struct log_entry *ent)
+{
+	return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
+		&& (ent->seq == 0);
+}
+
+/*
+ * Detecting valid log indices: We read a log group (see the comments in btt.h
+ * for a description of a 'log_group' and its 'slots'), and iterate over its
+ * four slots. We expect that a padding slot will be all-zeroes, and use this
+ * to detect a padding slot vs. an actual entry.
+ *
+ * If a log_group is in the initial state, i.e. hasn't been used since the
+ * creation of this BTT layout, it will have three of the four slots with
+ * zeroes. We skip over these log_groups for the detection of log_index. If
+ * all log_groups are in the initial state (i.e. the BTT has never been
+ * written to), it is safe to assume the 'new format' of log entries in slots
+ * (0, 1).
+ */
+static int log_set_indices(struct arena_info *arena)
+{
+	bool idx_set = false, initial_state = true;
+	int ret, log_index[2] = {-1, -1};
+	u32 i, j, next_idx = 0;
+	struct log_group log;
+	u32 pad_count = 0;
+
+	for (i = 0; i < arena->nfree; i++) {
+		ret = btt_log_group_read(arena, i, &log);
+		if (ret < 0)
+			return ret;
+
+		for (j = 0; j < 4; j++) {
+			if (!idx_set) {
+				if (ent_is_padding(&log.ent[j])) {
+					pad_count++;
+					continue;
+				} else {
+					/* Skip if index has been recorded */
+					if ((next_idx == 1) &&
+						(j == log_index[0]))
+						continue;
+					/* valid entry, record index */
+					log_index[next_idx] = j;
+					next_idx++;
+				}
+				if (next_idx == 2) {
+					/* two valid entries found */
+					idx_set = true;
+				} else if (next_idx > 2) {
+					/* too many valid indices */
+					return -ENXIO;
+				}
+			} else {
+				/*
+				 * once the indices have been set, just verify
+				 * that all subsequent log groups are either in
+				 * their initial state or follow the same
+				 * indices.
+				 */
+				if (j == log_index[0]) {
+					/* entry must be 'valid' */
+					if (ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				} else if (j == log_index[1]) {
+					;
+					/*
+					 * log_index[1] can be padding if the
+					 * lane never got used and it is still
+					 * in the initial state (three 'padding'
+					 * entries)
+					 */
+				} else {
+					/* entry must be invalid (padding) */
+					if (!ent_is_padding(&log.ent[j]))
+						return -ENXIO;
+				}
+			}
+		}
+		/*
+		 * If any of the log_groups have more than one valid,
+		 * non-padding entry, then the we are no longer in the
+		 * initial_state
+		 */
+		if (pad_count < 3)
+			initial_state = false;
+		pad_count = 0;
+	}
+
+	if (!initial_state && !idx_set)
+		return -ENXIO;
+
+	/*
+	 * If all the entries in the log were in the initial state,
+	 * assume new padding scheme
+	 */
+	if (initial_state)
+		log_index[1] = 1;
+
+	/*
+	 * Only allow the known permutations of log/padding indices,
+	 * i.e. (0, 1), and (0, 2)
+	 */
+	if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
+		; /* known index possibilities */
+	else {
+		dev_err(to_dev(arena), "Found an unknown padding scheme\n");
+		return -ENXIO;
+	}
+
+	arena->log_index[0] = log_index[0];
+	arena->log_index[1] = log_index[1];
+	dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
+	dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
+	return 0;
+}
+
 static int btt_rtt_init(struct arena_info *arena)
 {
 	arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
@@ -650,8 +773,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	available -= 2 * BTT_PG_SIZE;
 
 	/* The log takes a fixed amount of space based on nfree */
-	logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
-				BTT_PG_SIZE);
+	logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
 	available -= logsize;
 
 	/* Calculate optimal split between map and data area */
@@ -668,6 +790,10 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 	arena->mapoff = arena->dataoff + datasize;
 	arena->logoff = arena->mapoff + mapsize;
 	arena->info2off = arena->logoff + logsize;
+
+	/* Default log indices are (0,1) */
+	arena->log_index[0] = 0;
+	arena->log_index[1] = 1;
 	return arena;
 }
 
@@ -758,6 +884,13 @@ static int discover_arenas(struct btt *btt)
 		arena->external_lba_start = cur_nlba;
 		parse_arena_meta(arena, super, cur_off);
 
+		ret = log_set_indices(arena);
+		if (ret) {
+			dev_err(to_dev(arena),
+				"Unable to deduce log/padding indices\n");
+			goto out;
+		}
+
 		mutex_init(&arena->err_lock);
 		ret = btt_freelist_init(arena);
 		if (ret)
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 884fbbbdd18a..db3cb6d4d0d4 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -27,6 +27,7 @@
 #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
 #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
 #define MAP_ENT_NORMAL 0xC0000000
+#define LOG_GRP_SIZE sizeof(struct log_group)
 #define LOG_ENT_SIZE sizeof(struct log_entry)
 #define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
 #define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
@@ -50,12 +51,52 @@ enum btt_init_state {
 	INIT_READY
 };
 
+/*
+ * A log group represents one log 'lane', and consists of four log entries.
+ * Two of the four entries are valid entries, and the remaining two are
+ * padding. Due to an old bug in the padding location, we need to perform a
+ * test to determine the padding scheme being used, and use that scheme
+ * thereafter.
+ *
+ * In kernels prior to 4.15, 'log group' would have actual log entries at
+ * indices (0, 2) and padding at indices (1, 3), where as the correct/updated
+ * format has log entries at indices (0, 1) and padding at indices (2, 3).
+ *
+ * Old (pre 4.15) format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq |       pad       |
+ * +-----------------+-----------------+
+ *
+ * New format:
+ * +-----------------+-----------------+
+ * |      ent[0]     |      ent[1]     |
+ * |       16B       |       16B       |
+ * | lba/old/new/seq | lba/old/new/seq |
+ * +-----------------------------------+
+ * |      ent[2]     |      ent[3]     |
+ * |       16B       |       16B       |
+ * |       pad       |       pad       |
+ * +-----------------+-----------------+
+ *
+ * We detect during start-up which format is in use, and set
+ * arena->log_index[(0, 1)] with the detected format.
+ */
+
 struct log_entry {
 	__le32 lba;
 	__le32 old_map;
 	__le32 new_map;
 	__le32 seq;
-	__le64 padding[2];
+};
+
+struct log_group {
+	struct log_entry ent[4];
 };
 
 struct btt_sb {
@@ -126,6 +167,7 @@ struct aligned_lock {
  * @debugfs_dir:	Debugfs dentry
  * @flags:		Arena flags - may signify error states.
  * @err_lock:		Mutex for synchronizing error clearing.
+ * @log_index:		Indices of the valid log entries in a log_group
  *
  * arena_info is a per-arena handle. Once an arena is narrowed down for an
  * IO, this struct is passed around for the duration of the IO.
@@ -158,6 +200,7 @@ struct arena_info {
 	/* Arena flags */
 	u32 flags;
 	struct mutex err_lock;
+	int log_index[2];
 };
 
 /**

From f55688c45442bc863f40ad678c638785b26cdce6 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 18 Dec 2017 13:10:00 -0800
Subject: [PATCH 370/808] iw_cxgb4: Only validate the MSN for successful
 completions

If the RECV CQE is in error, ignore the MSN check.  This was causing
recvs that were flushed into the sw cq to be completed with the wrong
status (BAD_MSN instead of FLUSHED).

Cc: stable@vger.kernel.org
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/cxgb4/cq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index b7bfc536e00f..7ed87622e461 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -571,10 +571,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
 			ret = -EAGAIN;
 			goto skip_cqe;
 		}
-		if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+		if (unlikely(!CQE_STATUS(hw_cqe) &&
+			     CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
 			t4_set_wq_in_error(wq);
-			hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
-			goto proc_cqe;
+			hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
 		}
 		goto proc_cqe;
 	}

From 96a236ed286776554fbd227c6d2876fd3b5dc65d Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 19 Dec 2017 10:29:25 -0800
Subject: [PATCH 371/808] iw_cxgb4: reflect the original WR opcode in drain
 cqes

The flush/drain logic was not retaining the original wr opcode in
its completion.  This can cause problems if the application uses
the completion opcode to make decisions.

Use bit 10 of the CQE header word to indicate the CQE is a special
drain completion, and save the original WR opcode in the cqe header
opcode field.

Fixes: 4fe7c2962e11 ("iw_cxgb4: refactor sq/rq drain logic")

Cc: stable@vger.kernel.org
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/cxgb4/cq.c       |  7 ++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |  2 --
 drivers/infiniband/hw/cxgb4/qp.c       | 46 +++++++++++++++++++++++---
 drivers/infiniband/hw/cxgb4/t4.h       |  6 ++++
 4 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index 7ed87622e461..6f2b26126c64 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -395,7 +395,7 @@ next_cqe:
 
 static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
 {
-	if (CQE_OPCODE(cqe) == C4IW_DRAIN_OPCODE) {
+	if (DRAIN_CQE(cqe)) {
 		WARN_ONCE(1, "Unexpected DRAIN CQE qp id %u!\n", wq->sq.qid);
 		return 0;
 	}
@@ -494,7 +494,7 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
 	/*
 	 * Special cqe for drain WR completions...
 	 */
-	if (CQE_OPCODE(hw_cqe) == C4IW_DRAIN_OPCODE) {
+	if (DRAIN_CQE(hw_cqe)) {
 		*cookie = CQE_DRAIN_COOKIE(hw_cqe);
 		*cqe = *hw_cqe;
 		goto skip_cqe;
@@ -748,9 +748,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
 				c4iw_invalidate_mr(qhp->rhp,
 						   CQE_WRID_FR_STAG(&cqe));
 			break;
-		case C4IW_DRAIN_OPCODE:
-			wc->opcode = IB_WC_SEND;
-			break;
 		default:
 			pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n",
 			       CQE_OPCODE(&cqe), CQE_QPID(&cqe));
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 470f97a79ebb..65dd3726ca02 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -693,8 +693,6 @@ static inline int to_ib_qp_state(int c4iw_qp_state)
 	return IB_QPS_ERR;
 }
 
-#define C4IW_DRAIN_OPCODE FW_RI_SGE_EC_CR_RETURN
-
 static inline u32 c4iw_ib_to_tpt_access(int a)
 {
 	return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 38bddd02a943..21495f917bcc 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -790,21 +790,57 @@ static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
 	return 0;
 }
 
-static void complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
+static int ib_to_fw_opcode(int ib_opcode)
+{
+	int opcode;
+
+	switch (ib_opcode) {
+	case IB_WR_SEND_WITH_INV:
+		opcode = FW_RI_SEND_WITH_INV;
+		break;
+	case IB_WR_SEND:
+		opcode = FW_RI_SEND;
+		break;
+	case IB_WR_RDMA_WRITE:
+		opcode = FW_RI_RDMA_WRITE;
+		break;
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
+		opcode = FW_RI_READ_REQ;
+		break;
+	case IB_WR_REG_MR:
+		opcode = FW_RI_FAST_REGISTER;
+		break;
+	case IB_WR_LOCAL_INV:
+		opcode = FW_RI_LOCAL_INV;
+		break;
+	default:
+		opcode = -EINVAL;
+	}
+	return opcode;
+}
+
+static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
 {
 	struct t4_cqe cqe = {};
 	struct c4iw_cq *schp;
 	unsigned long flag;
 	struct t4_cq *cq;
+	int opcode;
 
 	schp = to_c4iw_cq(qhp->ibqp.send_cq);
 	cq = &schp->cq;
 
+	opcode = ib_to_fw_opcode(wr->opcode);
+	if (opcode < 0)
+		return opcode;
+
 	cqe.u.drain_cookie = wr->wr_id;
 	cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
-				 CQE_OPCODE_V(C4IW_DRAIN_OPCODE) |
+				 CQE_OPCODE_V(opcode) |
 				 CQE_TYPE_V(1) |
 				 CQE_SWCQE_V(1) |
+				 CQE_DRAIN_V(1) |
 				 CQE_QPID_V(qhp->wq.sq.qid));
 
 	spin_lock_irqsave(&schp->lock, flag);
@@ -819,6 +855,7 @@ static void complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
 					   schp->ibcq.cq_context);
 		spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
 	}
+	return 0;
 }
 
 static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
@@ -833,9 +870,10 @@ static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
 
 	cqe.u.drain_cookie = wr->wr_id;
 	cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
-				 CQE_OPCODE_V(C4IW_DRAIN_OPCODE) |
+				 CQE_OPCODE_V(FW_RI_SEND) |
 				 CQE_TYPE_V(0) |
 				 CQE_SWCQE_V(1) |
+				 CQE_DRAIN_V(1) |
 				 CQE_QPID_V(qhp->wq.sq.qid));
 
 	spin_lock_irqsave(&rchp->lock, flag);
@@ -875,7 +913,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	 */
 	if (qhp->wq.flushed) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
-		complete_sq_drain_wr(qhp, wr);
+		err = complete_sq_drain_wr(qhp, wr);
 		return err;
 	}
 	num_wrs = t4_sq_avail(&qhp->wq);
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index e9ea94268d51..79e8ee12c391 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -197,6 +197,11 @@ struct t4_cqe {
 #define CQE_SWCQE_G(x)    ((((x) >> CQE_SWCQE_S)) & CQE_SWCQE_M)
 #define CQE_SWCQE_V(x)	  ((x)<<CQE_SWCQE_S)
 
+#define CQE_DRAIN_S       10
+#define CQE_DRAIN_M       0x1
+#define CQE_DRAIN_G(x)    ((((x) >> CQE_DRAIN_S)) & CQE_DRAIN_M)
+#define CQE_DRAIN_V(x)	  ((x)<<CQE_DRAIN_S)
+
 #define CQE_STATUS_S      5
 #define CQE_STATUS_M      0x1F
 #define CQE_STATUS_G(x)   ((((x) >> CQE_STATUS_S)) & CQE_STATUS_M)
@@ -213,6 +218,7 @@ struct t4_cqe {
 #define CQE_OPCODE_V(x)   ((x)<<CQE_OPCODE_S)
 
 #define SW_CQE(x)         (CQE_SWCQE_G(be32_to_cpu((x)->header)))
+#define DRAIN_CQE(x)      (CQE_DRAIN_G(be32_to_cpu((x)->header)))
 #define CQE_QPID(x)       (CQE_QPID_G(be32_to_cpu((x)->header)))
 #define CQE_TYPE(x)       (CQE_TYPE_G(be32_to_cpu((x)->header)))
 #define SQ_TYPE(x)	  (CQE_TYPE((x)))

From d14587334580bc94d3ee11e8320e0c157f91ae8f Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Tue, 19 Dec 2017 14:02:10 -0800
Subject: [PATCH 372/808] iw_cxgb4: when flushing, complete all wrs in a chain

If a wr chain was posted and needed to be flushed, only the first
wr in the chain was completed with FLUSHED status.  The rest were
never completed.  This caused isert to hang on shutdown due to the
missing completions which left iscsi IO commands referenced, stalling
the shutdown.

Fixes: 4fe7c2962e11 ("iw_cxgb4: refactor sq/rq drain logic")

Cc: stable@vger.kernel.org
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/cxgb4/qp.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 21495f917bcc..d5c92fc520d6 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -858,6 +858,22 @@ static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
 	return 0;
 }
 
+static int complete_sq_drain_wrs(struct c4iw_qp *qhp, struct ib_send_wr *wr,
+				struct ib_send_wr **bad_wr)
+{
+	int ret = 0;
+
+	while (wr) {
+		ret = complete_sq_drain_wr(qhp, wr);
+		if (ret) {
+			*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	return ret;
+}
+
 static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
 {
 	struct t4_cqe cqe = {};
@@ -890,6 +906,14 @@ static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
 	}
 }
 
+static void complete_rq_drain_wrs(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
+{
+	while (wr) {
+		complete_rq_drain_wr(qhp, wr);
+		wr = wr->next;
+	}
+}
+
 int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		   struct ib_send_wr **bad_wr)
 {
@@ -913,7 +937,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	 */
 	if (qhp->wq.flushed) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
-		err = complete_sq_drain_wr(qhp, wr);
+		err = complete_sq_drain_wrs(qhp, wr, bad_wr);
 		return err;
 	}
 	num_wrs = t4_sq_avail(&qhp->wq);
@@ -1061,7 +1085,7 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 	 */
 	if (qhp->wq.flushed) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
-		complete_rq_drain_wr(qhp, wr);
+		complete_rq_drain_wrs(qhp, wr);
 		return err;
 	}
 	num_wrs = t4_rq_avail(&qhp->wq);

From 17748056ce123ee37fb7382bc698fc721e3c4a09 Mon Sep 17 00:00:00 2001
From: Bryan Tan <bryantan@vmware.com>
Date: Wed, 20 Dec 2017 09:49:03 -0800
Subject: [PATCH 373/808] RDMA/vmw_pvrdma: Call ib_umem_release on destroy QP
 path

The QP cleanup did not previously call ib_umem_release,
resulting in a user-triggerable kernel resource leak.

Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver")
Reviewed-by: Adit Ranadive <aditr@vmware.com>
Reviewed-by: Aditya Sarwade <asarwade@vmware.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Bryan Tan <bryantan@vmware.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 10420a18d02f..dceebc623d96 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -431,6 +431,13 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp)
 	atomic_dec(&qp->refcnt);
 	wait_event(qp->wait, !atomic_read(&qp->refcnt));
 
+	if (!qp->is_kernel) {
+		if (qp->rumem)
+			ib_umem_release(qp->rumem);
+		if (qp->sumem)
+			ib_umem_release(qp->sumem);
+	}
+
 	pvrdma_page_dir_cleanup(dev, &qp->pdir);
 
 	kfree(qp);

From 30a366a9dabd05a0d218288b7d732649886b6a53 Mon Sep 17 00:00:00 2001
From: Bryan Tan <bryantan@vmware.com>
Date: Wed, 20 Dec 2017 09:50:01 -0800
Subject: [PATCH 374/808] RDMA/vmw_pvrdma: Use refcount_dec_and_test to avoid
 warning

refcount_dec generates a warning when the operation
causes the refcount to hit zero. Avoid this by using
refcount_dec_and_test.

Fixes: 8b10ba783c9d ("RDMA/vmw_pvrdma: Add shared receive queue support")
Reviewed-by: Adit Ranadive <aditr@vmware.com>
Reviewed-by: Aditya Sarwade <asarwade@vmware.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Bryan Tan <bryantan@vmware.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index 826ccb864596..a2b1a3c115f2 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -236,8 +236,8 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
 	dev->srq_tbl[srq->srq_handle] = NULL;
 	spin_unlock_irqrestore(&dev->srq_tbl_lock, flags);
 
-	refcount_dec(&srq->refcnt);
-	wait_event(srq->wait, !refcount_read(&srq->refcnt));
+	if (!refcount_dec_and_test(&srq->refcnt))
+		wait_event(srq->wait, !refcount_read(&srq->refcnt));
 
 	/* There is no support for kernel clients, so this is safe. */
 	ib_umem_release(srq->umem);

From e3524b269e451cff68b19f32b15448933a53a4f4 Mon Sep 17 00:00:00 2001
From: Bryan Tan <bryantan@vmware.com>
Date: Wed, 20 Dec 2017 09:51:40 -0800
Subject: [PATCH 375/808] RDMA/vmw_pvrdma: Avoid use after free due to
 QP/CQ/SRQ destroy

The use of wait queues in vmw_pvrdma for handling concurrent
access to a resource leaves a race condition which can cause a use
after free bug.

Fix this by using the pattern from other drivers, complete() protected by
dec_and_test to ensure complete() is called only once.

Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver")
Signed-off-by: Bryan Tan <bryantan@vmware.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma.h      |  6 +++---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c   |  7 ++++---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 17 +++++++----------
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c   |  7 ++++---
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c  |  7 ++++---
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 63bc2efc34eb..4f7bd3b6a315 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -94,7 +94,7 @@ struct pvrdma_cq {
 	u32 cq_handle;
 	bool is_kernel;
 	atomic_t refcnt;
-	wait_queue_head_t wait;
+	struct completion free;
 };
 
 struct pvrdma_id_table {
@@ -175,7 +175,7 @@ struct pvrdma_srq {
 	u32 srq_handle;
 	int npages;
 	refcount_t refcnt;
-	wait_queue_head_t wait;
+	struct completion free;
 };
 
 struct pvrdma_qp {
@@ -197,7 +197,7 @@ struct pvrdma_qp {
 	bool is_kernel;
 	struct mutex mutex; /* QP state mutex. */
 	atomic_t refcnt;
-	wait_queue_head_t wait;
+	struct completion free;
 };
 
 struct pvrdma_dev {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 3562c0c30492..e529622cefad 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -179,7 +179,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 		pvrdma_page_dir_insert_umem(&cq->pdir, cq->umem, 0);
 
 	atomic_set(&cq->refcnt, 1);
-	init_waitqueue_head(&cq->wait);
+	init_completion(&cq->free);
 	spin_lock_init(&cq->cq_lock);
 
 	memset(cmd, 0, sizeof(*cmd));
@@ -230,8 +230,9 @@ err_cq:
 
 static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
 {
-	atomic_dec(&cq->refcnt);
-	wait_event(cq->wait, !atomic_read(&cq->refcnt));
+	if (atomic_dec_and_test(&cq->refcnt))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
 
 	if (!cq->is_kernel)
 		ib_umem_release(cq->umem);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 1f4e18717a00..e92681878c93 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -346,9 +346,8 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type)
 		ibqp->event_handler(&e, ibqp->qp_context);
 	}
 	if (qp) {
-		atomic_dec(&qp->refcnt);
-		if (atomic_read(&qp->refcnt) == 0)
-			wake_up(&qp->wait);
+		if (atomic_dec_and_test(&qp->refcnt))
+			complete(&qp->free);
 	}
 }
 
@@ -373,9 +372,8 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type)
 		ibcq->event_handler(&e, ibcq->cq_context);
 	}
 	if (cq) {
-		atomic_dec(&cq->refcnt);
-		if (atomic_read(&cq->refcnt) == 0)
-			wake_up(&cq->wait);
+		if (atomic_dec_and_test(&cq->refcnt))
+			complete(&cq->free);
 	}
 }
 
@@ -404,7 +402,7 @@ static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type)
 	}
 	if (srq) {
 		if (refcount_dec_and_test(&srq->refcnt))
-			wake_up(&srq->wait);
+			complete(&srq->free);
 	}
 }
 
@@ -539,9 +537,8 @@ static irqreturn_t pvrdma_intrx_handler(int irq, void *dev_id)
 		if (cq && cq->ibcq.comp_handler)
 			cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
 		if (cq) {
-			atomic_dec(&cq->refcnt);
-			if (atomic_read(&cq->refcnt))
-				wake_up(&cq->wait);
+			if (atomic_dec_and_test(&cq->refcnt))
+				complete(&cq->free);
 		}
 		pvrdma_idx_ring_inc(&ring->cons_head, ring_slots);
 	}
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index dceebc623d96..4059308e1454 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -246,7 +246,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 		spin_lock_init(&qp->rq.lock);
 		mutex_init(&qp->mutex);
 		atomic_set(&qp->refcnt, 1);
-		init_waitqueue_head(&qp->wait);
+		init_completion(&qp->free);
 
 		qp->state = IB_QPS_RESET;
 
@@ -428,8 +428,9 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp)
 
 	pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags);
 
-	atomic_dec(&qp->refcnt);
-	wait_event(qp->wait, !atomic_read(&qp->refcnt));
+	if (atomic_dec_and_test(&qp->refcnt))
+		complete(&qp->free);
+	wait_for_completion(&qp->free);
 
 	if (!qp->is_kernel) {
 		if (qp->rumem)
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index a2b1a3c115f2..5acebb1ef631 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -149,7 +149,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
 
 	spin_lock_init(&srq->lock);
 	refcount_set(&srq->refcnt, 1);
-	init_waitqueue_head(&srq->wait);
+	init_completion(&srq->free);
 
 	dev_dbg(&dev->pdev->dev,
 		"create shared receive queue from user space\n");
@@ -236,8 +236,9 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
 	dev->srq_tbl[srq->srq_handle] = NULL;
 	spin_unlock_irqrestore(&dev->srq_tbl_lock, flags);
 
-	if (!refcount_dec_and_test(&srq->refcnt))
-		wait_event(srq->wait, !refcount_read(&srq->refcnt));
+	if (refcount_dec_and_test(&srq->refcnt))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
 
 	/* There is no support for kernel clients, so this is safe. */
 	ib_umem_release(srq->umem);

From 71a0ff65a21bf3e2c4fde208c4a635ed2bbb4e81 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 21 Dec 2017 17:38:26 +0200
Subject: [PATCH 376/808] IB/mlx5: Fix congestion counters in LAG mode

Congestion counters are counted and queried per physical function.
When working in LAG mode, CNP packets can be sent or received on both
of the functions, thus congestion counters should be aggregated from
the two physical functions.

Fixes: e1f24a79f424 ("IB/mlx5: Support congestion related counters")
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Reviewed-by: Aviv Heller <avivh@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cmd.c              | 11 ----
 drivers/infiniband/hw/mlx5/cmd.h              |  2 -
 drivers/infiniband/hw/mlx5/main.c             | 35 ++----------
 drivers/net/ethernet/mellanox/mlx5/core/lag.c | 56 +++++++++++++++++++
 include/linux/mlx5/driver.h                   |  4 ++
 5 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 470995fa38d2..6f6712f87a73 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -47,17 +47,6 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey)
 	return err;
 }
 
-int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
-				bool reset, void *out, int out_size)
-{
-	u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };
-
-	MLX5_SET(query_cong_statistics_in, in, opcode,
-		 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
-	MLX5_SET(query_cong_statistics_in, in, clear, reset);
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
-}
-
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
 			       void *out, int out_size)
 {
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index af4c24596274..78ffded7cc2c 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -37,8 +37,6 @@
 #include <linux/mlx5/driver.h>
 
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
-int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
-				bool reset, void *out, int out_size);
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
 			       void *out, int out_size);
 int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 543d0a4c8bf3..b4ef4d9b6ce5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3737,34 +3737,6 @@ free:
 	return ret;
 }
 
-static int mlx5_ib_query_cong_counters(struct mlx5_ib_dev *dev,
-				       struct mlx5_ib_port *port,
-				       struct rdma_hw_stats *stats)
-{
-	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
-	void *out;
-	int ret, i;
-	int offset = port->cnts.num_q_counters;
-
-	out = kvzalloc(outlen, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-
-	ret = mlx5_cmd_query_cong_counter(dev->mdev, false, out, outlen);
-	if (ret)
-		goto free;
-
-	for (i = 0; i < port->cnts.num_cong_counters; i++) {
-		stats->value[i + offset] =
-			be64_to_cpup((__be64 *)(out +
-				     port->cnts.offsets[i + offset]));
-	}
-
-free:
-	kvfree(out);
-	return ret;
-}
-
 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
 				struct rdma_hw_stats *stats,
 				u8 port_num, int index)
@@ -3782,7 +3754,12 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
 	num_counters = port->cnts.num_q_counters;
 
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		ret = mlx5_ib_query_cong_counters(dev, port, stats);
+		ret = mlx5_lag_query_cong_counters(dev->mdev,
+						   stats->value +
+						   port->cnts.num_q_counters,
+						   port->cnts.num_cong_counters,
+						   port->cnts.offsets +
+						   port->cnts.num_q_counters);
 		if (ret)
 			return ret;
 		num_counters += port->cnts.num_cong_counters;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index f26f97fe4666..582b2f18010a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -137,6 +137,17 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
 
+static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
+				       bool reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };
+
+	MLX5_SET(query_cong_statistics_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
+	MLX5_SET(query_cong_statistics_in, in, clear, reset);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
+}
+
 static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
 {
 	return dev->priv.lag;
@@ -633,3 +644,48 @@ bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv)
 	/* If bonded, we do not add an IB device for PF1. */
 	return false;
 }
+
+int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+				 u64 *values,
+				 int num_counters,
+				 size_t *offsets)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
+	struct mlx5_core_dev *mdev[MLX5_MAX_PORTS];
+	struct mlx5_lag *ldev;
+	int num_ports;
+	int ret, i, j;
+	void *out;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	memset(values, 0, sizeof(*values) * num_counters);
+
+	mutex_lock(&lag_mutex);
+	ldev = mlx5_lag_dev_get(dev);
+	if (ldev && mlx5_lag_is_bonded(ldev)) {
+		num_ports = MLX5_MAX_PORTS;
+		mdev[0] = ldev->pf[0].dev;
+		mdev[1] = ldev->pf[1].dev;
+	} else {
+		num_ports = 1;
+		mdev[0] = dev;
+	}
+
+	for (i = 0; i < num_ports; ++i) {
+		ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen);
+		if (ret)
+			goto unlock;
+
+		for (j = 0; j < num_counters; ++j)
+			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
+	}
+
+unlock:
+	mutex_unlock(&lag_mutex);
+	kvfree(out);
+	return ret;
+}
+EXPORT_SYMBOL(mlx5_lag_query_cong_counters);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a886b51511ab..8846919356ca 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1164,6 +1164,10 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
+int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+				 u64 *values,
+				 int num_counters,
+				 size_t *offsets);
 struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
 void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
 

From 1f80bd6a6cc8358b81194e1f5fc16449947396ec Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 21 Dec 2017 17:38:27 +0200
Subject: [PATCH 377/808] IB/ipoib: Fix lockdep issue found on
 ipoib_ib_dev_heavy_flush

The locking order of vlan_rwsem (LOCK A) and then rtnl (LOCK B),
contradicts other flows such as ipoib_open possibly causing a deadlock.
To prevent this deadlock heavy flush is called with RTNL locked and
only then tries to acquire vlan_rwsem.
This deadlock is possible only when there are child interfaces.

[  140.941758] ======================================================
[  140.946276] WARNING: possible circular locking dependency detected
[  140.950950] 4.15.0-rc1+ #9 Tainted: G           O
[  140.954797] ------------------------------------------------------
[  140.959424] kworker/u32:1/146 is trying to acquire lock:
[  140.963450]  (rtnl_mutex){+.+.}, at: [<ffffffffc083516a>] __ipoib_ib_dev_flush+0x2da/0x4e0 [ib_ipoib]
[  140.970006]
but task is already holding lock:
[  140.975141]  (&priv->vlan_rwsem){++++}, at: [<ffffffffc0834ee1>] __ipoib_ib_dev_flush+0x51/0x4e0 [ib_ipoib]
[  140.982105]
which lock already depends on the new lock.
[  140.990023]
the existing dependency chain (in reverse order) is:
[  140.998650]
-> #1 (&priv->vlan_rwsem){++++}:
[  141.005276]        down_read+0x4d/0xb0
[  141.009560]        ipoib_open+0xad/0x120 [ib_ipoib]
[  141.014400]        __dev_open+0xcb/0x140
[  141.017919]        __dev_change_flags+0x1a4/0x1e0
[  141.022133]        dev_change_flags+0x23/0x60
[  141.025695]        devinet_ioctl+0x704/0x7d0
[  141.029156]        sock_do_ioctl+0x20/0x50
[  141.032526]        sock_ioctl+0x221/0x300
[  141.036079]        do_vfs_ioctl+0xa6/0x6d0
[  141.039656]        SyS_ioctl+0x74/0x80
[  141.042811]        entry_SYSCALL_64_fastpath+0x1f/0x96
[  141.046891]
-> #0 (rtnl_mutex){+.+.}:
[  141.051701]        lock_acquire+0xd4/0x220
[  141.055212]        __mutex_lock+0x88/0x970
[  141.058631]        __ipoib_ib_dev_flush+0x2da/0x4e0 [ib_ipoib]
[  141.063160]        __ipoib_ib_dev_flush+0x71/0x4e0 [ib_ipoib]
[  141.067648]        process_one_work+0x1f5/0x610
[  141.071429]        worker_thread+0x4a/0x3f0
[  141.074890]        kthread+0x141/0x180
[  141.078085]        ret_from_fork+0x24/0x30
[  141.081559]

other info that might help us debug this:
[  141.088967]  Possible unsafe locking scenario:
[  141.094280]        CPU0                    CPU1
[  141.097953]        ----                    ----
[  141.101640]   lock(&priv->vlan_rwsem);
[  141.104771]                                lock(rtnl_mutex);
[  141.109207]                                lock(&priv->vlan_rwsem);
[  141.114032]   lock(rtnl_mutex);
[  141.116800]
 *** DEADLOCK ***

Fixes: b4b678b06f6e ("IB/ipoib: Grab rtnl lock on heavy flush when calling ndo_open/stop")
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 3b96cdaf9a83..e6151a29c412 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -1236,13 +1236,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 		ipoib_ib_dev_down(dev);
 
 	if (level == IPOIB_FLUSH_HEAVY) {
-		rtnl_lock();
 		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
 			ipoib_ib_dev_stop(dev);
 
-		result = ipoib_ib_dev_open(dev);
-		rtnl_unlock();
-		if (result)
+		if (ipoib_ib_dev_open(dev))
 			return;
 
 		if (netif_queue_stopped(dev))
@@ -1282,7 +1279,9 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work)
 	struct ipoib_dev_priv *priv =
 		container_of(work, struct ipoib_dev_priv, flush_heavy);
 
+	rtnl_lock();
 	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
+	rtnl_unlock();
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)

From cd95a89282ef61458c3758d70ebfbd91f303033f Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 21 Dec 2017 08:52:50 -0800
Subject: [PATCH 378/808] selftests/bpf: fix Makefile for passing LLC to the
 command line

Makefile has a LLC variable that is initialised to "llc", but can
theoretically be overridden from the command line ("make LLC=llc-6.0").
However, this fails because for LLVM probe check, "llc" is called
directly. Use the $(LLC) variable instead to fix this.

Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 05fc4e2e7b3a..9316e648a880 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -39,7 +39,7 @@ $(BPFOBJ): force
 CLANG ?= clang
 LLC   ?= llc
 
-PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
 
 # Let newer LLVM versions transparently probe the kernel for availability
 # of full BPF instruction set.

From e7cdf5c82f1773c3386b93bbcf13b9bfff29fa31 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 19 Dec 2017 12:07:00 +0000
Subject: [PATCH 379/808] drm/syncobj: Stop reusing the same struct file for
 all syncobj -> fd

The vk cts test:
dEQP-VK.api.external.semaphore.opaque_fd.export_multiple_times_temporary

triggers a lot of
VFS: Close: file count is 0

Dave pointed out that clearing the syncobj->file from
drm_syncobj_file_release() was sufficient to silence the test, but that
opens a can of worm since we assumed that the syncobj->file was never
unset. Stop trying to reuse the same struct file for every fd pointing
to the drm_syncobj, and allocate one file for each fd instead.

v2: Fixup return handling of drm_syncobj_fd_to_handle
v2.1: [airlied: fix possible syncobj ref race]

Reported-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Tested-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/drm_syncobj.c | 77 +++++++++++++----------------------
 1 file changed, 29 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index f776fc1cc543..cb4d09c70fd4 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -369,40 +369,26 @@ static const struct file_operations drm_syncobj_file_fops = {
 	.release = drm_syncobj_file_release,
 };
 
-static int drm_syncobj_alloc_file(struct drm_syncobj *syncobj)
-{
-	struct file *file = anon_inode_getfile("syncobj_file",
-					       &drm_syncobj_file_fops,
-					       syncobj, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	drm_syncobj_get(syncobj);
-	if (cmpxchg(&syncobj->file, NULL, file)) {
-		/* lost the race */
-		fput(file);
-	}
-
-	return 0;
-}
-
 int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd)
 {
-	int ret;
+	struct file *file;
 	int fd;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0)
 		return fd;
 
-	if (!syncobj->file) {
-		ret = drm_syncobj_alloc_file(syncobj);
-		if (ret) {
-			put_unused_fd(fd);
-			return ret;
-		}
+	file = anon_inode_getfile("syncobj_file",
+				  &drm_syncobj_file_fops,
+				  syncobj, 0);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
 	}
-	fd_install(fd, syncobj->file);
+
+	drm_syncobj_get(syncobj);
+	fd_install(fd, file);
+
 	*p_fd = fd;
 	return 0;
 }
@@ -422,31 +408,24 @@ static int drm_syncobj_handle_to_fd(struct drm_file *file_private,
 	return ret;
 }
 
-static struct drm_syncobj *drm_syncobj_fdget(int fd)
-{
-	struct file *file = fget(fd);
-
-	if (!file)
-		return NULL;
-	if (file->f_op != &drm_syncobj_file_fops)
-		goto err;
-
-	return file->private_data;
-err:
-	fput(file);
-	return NULL;
-};
-
 static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
 				    int fd, u32 *handle)
 {
-	struct drm_syncobj *syncobj = drm_syncobj_fdget(fd);
+	struct drm_syncobj *syncobj;
+	struct file *file;
 	int ret;
 
-	if (!syncobj)
+	file = fget(fd);
+	if (!file)
 		return -EINVAL;
 
+	if (file->f_op != &drm_syncobj_file_fops) {
+		fput(file);
+		return -EINVAL;
+	}
+
 	/* take a reference to put in the idr */
+	syncobj = file->private_data;
 	drm_syncobj_get(syncobj);
 
 	idr_preload(GFP_KERNEL);
@@ -455,12 +434,14 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
 	spin_unlock(&file_private->syncobj_table_lock);
 	idr_preload_end();
 
-	if (ret < 0) {
-		fput(syncobj->file);
-		return ret;
-	}
-	*handle = ret;
-	return 0;
+	if (ret > 0) {
+		*handle = ret;
+		ret = 0;
+	} else
+		drm_syncobj_put(syncobj);
+
+	fput(file);
+	return ret;
 }
 
 static int drm_syncobj_import_sync_file_fence(struct drm_file *file_private,

From dc1c4165d189350cb51bdd3057deb6ecd164beda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Tue, 12 Dec 2017 12:02:04 +0000
Subject: [PATCH 380/808] KVM: PPC: Book3S: fix XIVE migration of pending
 interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When restoring a pending interrupt, we are setting the Q bit to force
a retrigger in xive_finish_unmask(). But we also need to force an EOI
in this case to reach the same initial state : P=1, Q=0.

This can be done by not setting 'old_p' for pending interrupts which
will inform xive_finish_unmask() that an EOI needs to be sent.

Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.12+
Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_xive.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index bf457843e032..b5e6d227a034 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1558,7 +1558,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 
 	/*
 	 * Restore P and Q. If the interrupt was pending, we
-	 * force both P and Q, which will trigger a resend.
+	 * force Q and !P, which will trigger a resend.
 	 *
 	 * That means that a guest that had both an interrupt
 	 * pending (queued) and Q set will restore with only
@@ -1566,7 +1566,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 	 * is perfectly fine as coalescing interrupts that haven't
 	 * been presented yet is always allowed.
 	 */
-	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
 		state->old_p = true;
 	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
 		state->old_q = true;

From 7333b5aca412d6ad02667b5a513485838a91b136 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Tue, 12 Dec 2017 18:23:56 +0100
Subject: [PATCH 381/808] KVM: PPC: Book3S HV: Fix pending_pri value in
 kvmppc_xive_get_icp()

When we migrate a VM from a POWER8 host (XICS) to a POWER9 host
(XICS-on-XIVE), we have an error:

qemu-kvm: Unable to restore KVM interrupt controller state \
          (0xff000000) for CPU 0: Invalid argument

This is because kvmppc_xics_set_icp() checks the new state
is internaly consistent, and especially:

...
   1129         if (xisr == 0) {
   1130                 if (pending_pri != 0xff)
   1131                         return -EINVAL;
...

On the other side, kvmppc_xive_get_icp() doesn't set
neither the pending_pri value, nor the xisr value (set to 0)
(and kvmppc_xive_set_icp() ignores the pending_pri value)

As xisr is 0, pending_pri must be set to 0xff.

Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_xive.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index b5e6d227a034..0d750d274c4e 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
 
 	/* Return the per-cpu state for state saving/migration */
 	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
-	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
 }
 
 int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)

From 506e8a912661c97b41adc8a286b875d01323ec45 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Dec 2017 22:35:19 +0100
Subject: [PATCH 382/808] ARM: dts: ls1021a: fix incorrect clock references

dtc warns about two 'clocks' properties that have an extraneous '1'
at the end:

arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a
arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a:clocks[1])
Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2190000/sgtl5000@a
arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2190000/sgtl5000@a:clocks[1])

The clocks that get referenced here are fixed-rate, so they do not
take any argument, and dtc interprets the next cell as a phandle, which
is invalid.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/ls1021a-qds.dts | 2 +-
 arch/arm/boot/dts/ls1021a-twr.dts | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/ls1021a-qds.dts b/arch/arm/boot/dts/ls1021a-qds.dts
index 940875316d0f..67b4de0e3439 100644
--- a/arch/arm/boot/dts/ls1021a-qds.dts
+++ b/arch/arm/boot/dts/ls1021a-qds.dts
@@ -215,7 +215,7 @@
 				reg = <0x2a>;
 				VDDA-supply = <&reg_3p3v>;
 				VDDIO-supply = <&reg_3p3v>;
-				clocks = <&sys_mclk 1>;
+				clocks = <&sys_mclk>;
 			};
 		};
 	};
diff --git a/arch/arm/boot/dts/ls1021a-twr.dts b/arch/arm/boot/dts/ls1021a-twr.dts
index a8b148ad1dd2..44715c8ef756 100644
--- a/arch/arm/boot/dts/ls1021a-twr.dts
+++ b/arch/arm/boot/dts/ls1021a-twr.dts
@@ -187,7 +187,7 @@
 		reg = <0x0a>;
 		VDDA-supply = <&reg_3p3v>;
 		VDDIO-supply = <&reg_3p3v>;
-		clocks = <&sys_mclk 1>;
+		clocks = <&sys_mclk>;
 	};
 };
 

From fbd90b4cae105fbd8364fa1ce3f41d0c06296f58 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 21 Dec 2017 22:45:24 +0100
Subject: [PATCH 383/808] ARM: dts: tango4: remove bogus interrupt-controller
 property

dtc points out that the parent node of the interrupt controllers is not
actually an interrupt controller itself, and lacks an #interrupt-cells
property:

arch/arm/boot/dts/tango4-vantage-1172.dtb: Warning (interrupts_property): Missing #interrupt-cells in interrupt-parent /soc/interrupt-controller@6e000

This removes the annotation.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/tango4-common.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/boot/dts/tango4-common.dtsi b/arch/arm/boot/dts/tango4-common.dtsi
index 0ec1b0a317b4..ff72a8efb73d 100644
--- a/arch/arm/boot/dts/tango4-common.dtsi
+++ b/arch/arm/boot/dts/tango4-common.dtsi
@@ -156,7 +156,6 @@
 			reg = <0x6e000 0x400>;
 			ranges = <0 0x6e000 0x400>;
 			interrupt-parent = <&gic>;
-			interrupt-controller;
 			#address-cells = <1>;
 			#size-cells = <1>;
 

From d042566d8c704e1ecec370300545d4a409222e39 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 5 Dec 2017 11:10:26 +0100
Subject: [PATCH 384/808] crypto: chelsio - select CRYPTO_GF128MUL

Without the gf128mul library support, we can run into a link
error:

drivers/crypto/chelsio/chcr_algo.o: In function `chcr_update_tweak':
chcr_algo.c:(.text+0x7e0): undefined reference to `gf128mul_x8_ble'

This adds a Kconfig select statement for it, next to the ones we
already have.

Cc: <stable@vger.kernel.org>
Fixes: b8fd1f4170e7 ("crypto: chcr - Add ctr mode and process large sg entries for cipher")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/chelsio/Kconfig b/drivers/crypto/chelsio/Kconfig
index 3e104f5aa0c2..b56b3f711d94 100644
--- a/drivers/crypto/chelsio/Kconfig
+++ b/drivers/crypto/chelsio/Kconfig
@@ -5,6 +5,7 @@ config CRYPTO_DEV_CHELSIO
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
 	select CRYPTO_AUTHENC
+	select CRYPTO_GF128MUL
 	---help---
 	  The Chelsio Crypto Co-processor driver for T6 adapters.
 

From e57121d08c38dabec15cf3e1e2ad46721af30cae Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 11 Dec 2017 12:15:17 -0800
Subject: [PATCH 385/808] crypto: chacha20poly1305 - validate the digest size

If the rfc7539 template was instantiated with a hash algorithm with
digest size larger than 16 bytes (POLY1305_DIGEST_SIZE), then the digest
overran the 'tag' buffer in 'struct chachapoly_req_ctx', corrupting the
subsequent memory, including 'cryptlen'.  This caused a crash during
crypto_skcipher_decrypt().

Fix it by, when instantiating the template, requiring that the
underlying hash algorithm has the digest size expected for Poly1305.

Reproducer:

    #include <linux/if_alg.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main()
    {
            int algfd, reqfd;
            struct sockaddr_alg addr = {
                    .salg_type = "aead",
                    .salg_name = "rfc7539(chacha20,sha256)",
            };
            unsigned char buf[32] = { 0 };

            algfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
            bind(algfd, (void *)&addr, sizeof(addr));
            setsockopt(algfd, SOL_ALG, ALG_SET_KEY, buf, sizeof(buf));
            reqfd = accept(algfd, 0, 0);
            write(reqfd, buf, 16);
            read(reqfd, buf, 16);
    }

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 71ebc4d1b27d ("crypto: chacha20poly1305 - Add a ChaCha20-Poly1305 AEAD construction, RFC7539")
Cc: <stable@vger.kernel.org> # v4.2+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/chacha20poly1305.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index db1bc3147bc4..600afa99941f 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -610,6 +610,11 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 						    algt->mask));
 	if (IS_ERR(poly))
 		return PTR_ERR(poly);
+	poly_hash = __crypto_hash_alg_common(poly);
+
+	err = -EINVAL;
+	if (poly_hash->digestsize != POLY1305_DIGEST_SIZE)
+		goto out_put_poly;
 
 	err = -ENOMEM;
 	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
@@ -618,7 +623,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 
 	ctx = aead_instance_ctx(inst);
 	ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize;
-	poly_hash = __crypto_hash_alg_common(poly);
 	err = crypto_init_ahash_spawn(&ctx->poly, poly_hash,
 				      aead_crypto_instance(inst));
 	if (err)

From af955bf15d2c27496b0269b1f05c26f758c68314 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Dec 2017 10:27:24 +0000
Subject: [PATCH 386/808] crypto: af_alg - Fix race around ctx->rcvused by
 making it atomic_t

This variable was increased and decreased without any protection.
Result was an occasional misscount and negative wrap around resulting
in false resource allocation failures.

Fixes: 7d2c3f54e6f6 ("crypto: af_alg - remove locking in async callback")
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/af_alg.c         | 4 ++--
 crypto/algif_aead.c     | 2 +-
 crypto/algif_skcipher.c | 2 +-
 include/crypto/if_alg.h | 5 +++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index f1a2caf1b59b..d3f1c431724b 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -664,7 +664,7 @@ void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
 	unsigned int i;
 
 	list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) {
-		ctx->rcvused -= rsgl->sg_num_bytes;
+		atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused);
 		af_alg_free_sg(&rsgl->sgl);
 		list_del(&rsgl->list);
 		if (rsgl != &areq->first_rsgl)
@@ -1162,7 +1162,7 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
 
 		areq->last_rsgl = rsgl;
 		len += err;
-		ctx->rcvused += err;
+		atomic_add(err, &ctx->rcvused);
 		rsgl->sg_num_bytes = err;
 		iov_iter_advance(&msg->msg_iter, err);
 	}
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index b73db2b27656..20df8c1b6851 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -571,7 +571,7 @@ static int aead_accept_parent_nokey(void *private, struct sock *sk)
 	INIT_LIST_HEAD(&ctx->tsgl_list);
 	ctx->len = len;
 	ctx->used = 0;
-	ctx->rcvused = 0;
+	atomic_set(&ctx->rcvused, 0);
 	ctx->more = 0;
 	ctx->merge = 0;
 	ctx->enc = 0;
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index baef9bfccdda..c5c47b680152 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -390,7 +390,7 @@ static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
 	INIT_LIST_HEAD(&ctx->tsgl_list);
 	ctx->len = len;
 	ctx->used = 0;
-	ctx->rcvused = 0;
+	atomic_set(&ctx->rcvused, 0);
 	ctx->more = 0;
 	ctx->merge = 0;
 	ctx->enc = 0;
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 38d9c5861ed8..f38227a78eae 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -18,6 +18,7 @@
 #include <linux/if_alg.h>
 #include <linux/scatterlist.h>
 #include <linux/types.h>
+#include <linux/atomic.h>
 #include <net/sock.h>
 
 #include <crypto/aead.h>
@@ -150,7 +151,7 @@ struct af_alg_ctx {
 	struct crypto_wait wait;
 
 	size_t used;
-	size_t rcvused;
+	atomic_t rcvused;
 
 	bool more;
 	bool merge;
@@ -215,7 +216,7 @@ static inline int af_alg_rcvbuf(struct sock *sk)
 	struct af_alg_ctx *ctx = ask->private;
 
 	return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) -
-			  ctx->rcvused, 0);
+		     atomic_read(&ctx->rcvused), 0);
 }
 
 /**

From 203f45003a3d03eea8fa28d74cfc74c354416fdb Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Tue, 19 Dec 2017 19:09:07 +0100
Subject: [PATCH 387/808] crypto: n2 - cure use after free

queue_cache_init is first called for the Control Word Queue
(n2_crypto_probe). At that time, queue_cache[0] is NULL and a new
kmem_cache will be allocated. If the subsequent n2_register_algs call
fails, the kmem_cache will be released in queue_cache_destroy, but
queue_cache_init[0] is not set back to NULL.

So when the Module Arithmetic Unit gets probed next (n2_mau_probe),
queue_cache_init will not allocate a kmem_cache again, but leave it
as its bogus value, causing a BUG() to trigger when queue_cache[0] is
eventually passed to kmem_cache_zalloc:

	n2_crypto: Found N2CP at /virtual-devices@100/n2cp@7
	n2_crypto: Registered NCS HVAPI version 2.0
	called queue_cache_init
	n2_crypto: md5 alg registration failed
	n2cp f028687c: /virtual-devices@100/n2cp@7: Unable to register algorithms.
	called queue_cache_destroy
	n2cp: probe of f028687c failed with error -22
	n2_crypto: Found NCP at /virtual-devices@100/ncp@6
	n2_crypto: Registered NCS HVAPI version 2.0
	called queue_cache_init
	kernel BUG at mm/slab.c:2993!
	Call Trace:
	 [0000000000604488] kmem_cache_alloc+0x1a8/0x1e0
                  (inlined) kmem_cache_zalloc
                  (inlined) new_queue
                  (inlined) spu_queue_setup
                  (inlined) handle_exec_unit
	 [0000000010c61eb4] spu_mdesc_scan+0x1f4/0x460 [n2_crypto]
	 [0000000010c62b80] n2_mau_probe+0x100/0x220 [n2_crypto]
	 [000000000084b174] platform_drv_probe+0x34/0xc0

Cc: <stable@vger.kernel.org>
Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/n2_core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index 48de52cf2ecc..662e709812cc 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -1625,6 +1625,7 @@ static int queue_cache_init(void)
 					  CWQ_ENTRY_SIZE, 0, NULL);
 	if (!queue_cache[HV_NCS_QTYPE_CWQ - 1]) {
 		kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
+		queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
 		return -ENOMEM;
 	}
 	return 0;
@@ -1634,6 +1635,8 @@ static void queue_cache_destroy(void)
 {
 	kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
 	kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
+	queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
+	queue_cache[HV_NCS_QTYPE_CWQ - 1] = NULL;
 }
 
 static long spu_queue_register_workfn(void *arg)

From d76c68109f37cb85b243a1cf0f40313afd2bae68 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 20 Dec 2017 14:28:25 -0800
Subject: [PATCH 388/808] crypto: pcrypt - fix freeing pcrypt instances

pcrypt is using the old way of freeing instances, where the ->free()
method specified in the 'struct crypto_template' is passed a pointer to
the 'struct crypto_instance'.  But the crypto_instance is being
kfree()'d directly, which is incorrect because the memory was actually
allocated as an aead_instance, which contains the crypto_instance at a
nonzero offset.  Thus, the wrong pointer was being kfree()'d.

Fix it by switching to the new way to free aead_instance's where the
->free() method is specified in the aead_instance itself.

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 0496f56065e0 ("crypto: pcrypt - Add support for new AEAD interface")
Cc: <stable@vger.kernel.org> # v4.2+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index ee9cfb99fe25..f8ec3d4ba4a8 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -254,6 +254,14 @@ static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm)
 	crypto_free_aead(ctx->child);
 }
 
+static void pcrypt_free(struct aead_instance *inst)
+{
+	struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst);
+
+	crypto_drop_aead(&ctx->spawn);
+	kfree(inst);
+}
+
 static int pcrypt_init_instance(struct crypto_instance *inst,
 				struct crypto_alg *alg)
 {
@@ -319,6 +327,8 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
 	inst->alg.encrypt = pcrypt_aead_encrypt;
 	inst->alg.decrypt = pcrypt_aead_decrypt;
 
+	inst->free = pcrypt_free;
+
 	err = aead_register_instance(tmpl, inst);
 	if (err)
 		goto out_drop_aead;
@@ -349,14 +359,6 @@ static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb)
 	return -EINVAL;
 }
 
-static void pcrypt_free(struct crypto_instance *inst)
-{
-	struct pcrypt_instance_ctx *ctx = crypto_instance_ctx(inst);
-
-	crypto_drop_aead(&ctx->spawn);
-	kfree(inst);
-}
-
 static int pcrypt_cpumask_change_notify(struct notifier_block *self,
 					unsigned long val, void *data)
 {
@@ -469,7 +471,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt)
 static struct crypto_template pcrypt_tmpl = {
 	.name = "pcrypt",
 	.create = pcrypt_create,
-	.free = pcrypt_free,
 	.module = THIS_MODULE,
 };
 

From 87c059e9c39dae20b8b9bd19d9ec55a6d6c10468 Mon Sep 17 00:00:00 2001
From: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Date: Thu, 21 Dec 2017 17:18:58 +0200
Subject: [PATCH 389/808] arm64: dts: renesas: salvator-x: Remove renesas,
 no-ether-link property

The present change is a bug fix for AVB link iteratively up/down.

Steps to reproduce:
- start AVB TX stream (Using aplay via MSE),
- disconnect+reconnect the eth cable,
- after a reconnection the eth connection goes iteratively up/down
  without user interaction,
- this may heal after some seconds or even stay for minutes.

As the documentation specifies, the "renesas,no-ether-link" option
should be used when a board does not provide a proper AVB_LINK signal.
There is no need for this option enabled on RCAR H3/M3 Salvator-X/XS
and ULCB starter kits since the AVB_LINK is correctly handled by HW.

Choosing to keep or remove the "renesas,no-ether-link" option will
have impact on the code flow in the following ways:
- keeping this option enabled may lead to unexpected behavior since
  the RX & TX are enabled/disabled directly from adjust_link function
  without any HW interrogation,
- removing this option, the RX & TX will only be enabled/disabled after
  HW interrogation. The HW check is made through the LMON pin in PSR
  register which specifies AVB_LINK signal value (0 - at low level;
  1 - at high level).

In conclusion, the present change is also a safety improvement because
it removes the "renesas,no-ether-link" option leading to a proper way
of detecting the link state based on HW interrogation and not on
software heuristic.

Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB")
Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X")
Signed-off-by: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm64/boot/dts/renesas/salvator-common.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
index a298df74ca6c..dbe2648649db 100644
--- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi
+++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
@@ -255,7 +255,6 @@
 &avb {
 	pinctrl-0 = <&avb_pins>;
 	pinctrl-names = "default";
-	renesas,no-ether-link;
 	phy-handle = <&phy0>;
 	status = "okay";
 

From bbc25bee37d2b32cf3a1fab9195b6da3a185614a Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Tue, 5 Dec 2017 23:31:35 +0000
Subject: [PATCH 390/808] lib/mpi: Fix umul_ppmm() for MIPS64r6

Current MIPS64r6 toolchains aren't able to generate efficient
DMULU/DMUHU based code for the C implementation of umul_ppmm(), which
performs an unsigned 64 x 64 bit multiply and returns the upper and
lower 64-bit halves of the 128-bit result. Instead it widens the 64-bit
inputs to 128-bits and emits a __multi3 intrinsic call to perform a 128
x 128 multiply. This is both inefficient, and it results in a link error
since we don't include __multi3 in MIPS linux.

For example commit 90a53e4432b1 ("cfg80211: implement regdb signature
checking") merged in v4.15-rc1 recently broke the 64r6_defconfig and
64r6el_defconfig builds by indirectly selecting MPILIB. The same build
errors can be reproduced on older kernels by enabling e.g. CRYPTO_RSA:

lib/mpi/generic_mpih-mul1.o: In function `mpihelp_mul_1':
lib/mpi/generic_mpih-mul1.c:50: undefined reference to `__multi3'
lib/mpi/generic_mpih-mul2.o: In function `mpihelp_addmul_1':
lib/mpi/generic_mpih-mul2.c:49: undefined reference to `__multi3'
lib/mpi/generic_mpih-mul3.o: In function `mpihelp_submul_1':
lib/mpi/generic_mpih-mul3.c:49: undefined reference to `__multi3'
lib/mpi/mpih-div.o In function `mpihelp_divrem':
lib/mpi/mpih-div.c:205: undefined reference to `__multi3'
lib/mpi/mpih-div.c:142: undefined reference to `__multi3'

Therefore add an efficient MIPS64r6 implementation of umul_ppmm() using
inline assembly and the DMULU/DMUHU instructions, to prevent __multi3
calls being emitted.

Fixes: 7fd08ca58ae6 ("MIPS: Add build support for the MIPS R6 ISA")
Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-mips@linux-mips.org
Cc: linux-crypto@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 lib/mpi/longlong.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index 57fd45ab7af1..08c60d10747f 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -671,7 +671,23 @@ do {						\
 	**************  MIPS/64  **************
 	***************************************/
 #if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
-#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+/*
+ * GCC ends up emitting a __multi3 intrinsic call for MIPS64r6 with the plain C
+ * code below, so we special case MIPS64r6 until the compiler can do better.
+ */
+#define umul_ppmm(w1, w0, u, v)						\
+do {									\
+	__asm__ ("dmulu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w0))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+	__asm__ ("dmuhu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w1))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+} while (0)
+#elif (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
 #define umul_ppmm(w1, w0, u, v) \
 do {									\
 	typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\

From 7d2901f809c110bd9a261e879d59efe62e3bc758 Mon Sep 17 00:00:00 2001
From: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Date: Thu, 21 Dec 2017 17:18:59 +0200
Subject: [PATCH 391/808] arm64: dts: renesas: ulcb: Remove renesas,
 no-ether-link property

The present change is a bug fix for AVB link iteratively up/down.

Steps to reproduce:
- start AVB TX stream (Using aplay via MSE),
- disconnect+reconnect the eth cable,
- after a reconnection the eth connection goes iteratively up/down
  without user interaction,
- this may heal after some seconds or even stay for minutes.

As the documentation specifies, the "renesas,no-ether-link" option
should be used when a board does not provide a proper AVB_LINK signal.
There is no need for this option enabled on RCAR H3/M3 Salvator-X/XS
and ULCB starter kits since the AVB_LINK is correctly handled by HW.

Choosing to keep or remove the "renesas,no-ether-link" option will
have impact on the code flow in the following ways:
- keeping this option enabled may lead to unexpected behavior since
  the RX & TX are enabled/disabled directly from adjust_link function
  without any HW interrogation,
- removing this option, the RX & TX will only be enabled/disabled after
  HW interrogation. The HW check is made through the LMON pin in PSR
  register which specifies AVB_LINK signal value (0 - at low level;
  1 - at high level).

In conclusion, the present change is also a safety improvement because
it removes the "renesas,no-ether-link" option leading to a proper way
of detecting the link state based on HW interrogation and not on
software heuristic.

Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB")
Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X")
Signed-off-by: Bogdan Mirea <Bogdan-Stefan_Mirea@mentor.com>
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 arch/arm64/boot/dts/renesas/ulcb.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi
index 0d85b315ce71..73439cf48659 100644
--- a/arch/arm64/boot/dts/renesas/ulcb.dtsi
+++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi
@@ -145,7 +145,6 @@
 &avb {
 	pinctrl-0 = <&avb_pins>;
 	pinctrl-names = "default";
-	renesas,no-ether-link;
 	phy-handle = <&phy0>;
 	status = "okay";
 

From 1eb7b40386c97f6c4d1c62931bf306f4535a4bd6 Mon Sep 17 00:00:00 2001
From: Ofer Heifetz <oferh@marvell.com>
Date: Mon, 11 Dec 2017 12:10:55 +0100
Subject: [PATCH 392/808] crypto: inside-secure - per request invalidation

When an invalidation request is needed we currently override the context
.send and .handle_result helpers. This is wrong as under high load other
requests can already be queued and overriding the context helpers will
make them execute the wrong .send and .handle_result functions.

This commit fixes this by adding a needs_inv flag in the request to
choose the action to perform when sending requests or handling their
results. This flag will be set when needed (i.e. when the context flag
will be set).

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Signed-off-by: Ofer Heifetz <oferh@marvell.com>
[Antoine: commit message, and removed non related changes from the
original commit]
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 .../crypto/inside-secure/safexcel_cipher.c    | 71 +++++++++++++++----
 drivers/crypto/inside-secure/safexcel_hash.c  | 67 +++++++++++++----
 2 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 5438552bc6d7..9ea24868d860 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -14,6 +14,7 @@
 
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
+#include <crypto/internal/skcipher.h>
 
 #include "safexcel.h"
 
@@ -33,6 +34,10 @@ struct safexcel_cipher_ctx {
 	unsigned int key_len;
 };
 
+struct safexcel_cipher_req {
+	bool needs_inv;
+};
+
 static void safexcel_cipher_token(struct safexcel_cipher_ctx *ctx,
 				  struct crypto_async_request *async,
 				  struct safexcel_command_desc *cdesc,
@@ -126,9 +131,9 @@ static int safexcel_context_control(struct safexcel_cipher_ctx *ctx,
 	return 0;
 }
 
-static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
-				  struct crypto_async_request *async,
-				  bool *should_complete, int *ret)
+static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int ring,
+				      struct crypto_async_request *async,
+				      bool *should_complete, int *ret)
 {
 	struct skcipher_request *req = skcipher_request_cast(async);
 	struct safexcel_result_desc *rdesc;
@@ -265,7 +270,6 @@ static int safexcel_aes_send(struct crypto_async_request *async,
 	spin_unlock_bh(&priv->ring[ring].egress_lock);
 
 	request->req = &req->base;
-	ctx->base.handle_result = safexcel_handle_result;
 
 	*commands = n_cdesc;
 	*results = n_rdesc;
@@ -341,8 +345,6 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 
 	ring = safexcel_select_ring(priv);
 	ctx->base.ring = ring;
-	ctx->base.needs_inv = false;
-	ctx->base.send = safexcel_aes_send;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	enq_ret = crypto_enqueue_request(&priv->ring[ring].queue, async);
@@ -359,6 +361,26 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 	return ndesc;
 }
 
+static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
+				  struct crypto_async_request *async,
+				  bool *should_complete, int *ret)
+{
+	struct skcipher_request *req = skcipher_request_cast(async);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
+	int err;
+
+	if (sreq->needs_inv) {
+		sreq->needs_inv = false;
+		err = safexcel_handle_inv_result(priv, ring, async,
+						 should_complete, ret);
+	} else {
+		err = safexcel_handle_req_result(priv, ring, async,
+						 should_complete, ret);
+	}
+
+	return err;
+}
+
 static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 				    int ring, struct safexcel_request *request,
 				    int *commands, int *results)
@@ -368,8 +390,6 @@ static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret;
 
-	ctx->base.handle_result = safexcel_handle_inv_result;
-
 	ret = safexcel_invalidate_cache(async, &ctx->base, priv,
 					ctx->base.ctxr_dma, ring, request);
 	if (unlikely(ret))
@@ -381,11 +401,29 @@ static int safexcel_cipher_send_inv(struct crypto_async_request *async,
 	return 0;
 }
 
+static int safexcel_send(struct crypto_async_request *async,
+			 int ring, struct safexcel_request *request,
+			 int *commands, int *results)
+{
+	struct skcipher_request *req = skcipher_request_cast(async);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
+	int ret;
+
+	if (sreq->needs_inv)
+		ret = safexcel_cipher_send_inv(async, ring, request,
+					       commands, results);
+	else
+		ret = safexcel_aes_send(async, ring, request,
+					commands, results);
+	return ret;
+}
+
 static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	struct skcipher_request req;
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(&req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
@@ -399,7 +437,7 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 	skcipher_request_set_tfm(&req, __crypto_skcipher_cast(tfm));
 	ctx = crypto_tfm_ctx(req.base.tfm);
 	ctx->base.exit_inv = true;
-	ctx->base.send = safexcel_cipher_send_inv;
+	sreq->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
@@ -424,19 +462,21 @@ static int safexcel_aes(struct skcipher_request *req,
 			enum safexcel_cipher_direction dir, u32 mode)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(req->base.tfm);
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret, ring;
 
+	sreq->needs_inv = false;
 	ctx->direction = dir;
 	ctx->mode = mode;
 
 	if (ctx->base.ctxr) {
-		if (ctx->base.needs_inv)
-			ctx->base.send = safexcel_cipher_send_inv;
+		if (ctx->base.needs_inv) {
+			sreq->needs_inv = true;
+			ctx->base.needs_inv = false;
+		}
 	} else {
 		ctx->base.ring = safexcel_select_ring(priv);
-		ctx->base.send = safexcel_aes_send;
-
 		ctx->base.ctxr = dma_pool_zalloc(priv->context_pool,
 						 EIP197_GFP_FLAGS(req->base),
 						 &ctx->base.ctxr_dma);
@@ -476,6 +516,11 @@ static int safexcel_skcipher_cra_init(struct crypto_tfm *tfm)
 			     alg.skcipher.base);
 
 	ctx->priv = tmpl->priv;
+	ctx->base.send = safexcel_send;
+	ctx->base.handle_result = safexcel_handle_result;
+
+	crypto_skcipher_set_reqsize(__crypto_skcipher_cast(tfm),
+				    sizeof(struct safexcel_cipher_req));
 
 	return 0;
 }
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 74feb6227101..79fe149804d3 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -32,6 +32,7 @@ struct safexcel_ahash_req {
 	bool last_req;
 	bool finish;
 	bool hmac;
+	bool needs_inv;
 
 	u8 state_sz;    /* expected sate size, only set once */
 	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)];
@@ -119,9 +120,9 @@ static void safexcel_context_control(struct safexcel_ahash_ctx *ctx,
 	}
 }
 
-static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
-				  struct crypto_async_request *async,
-				  bool *should_complete, int *ret)
+static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int ring,
+				      struct crypto_async_request *async,
+				      bool *should_complete, int *ret)
 {
 	struct safexcel_result_desc *rdesc;
 	struct ahash_request *areq = ahash_request_cast(async);
@@ -165,9 +166,9 @@ static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
 	return 1;
 }
 
-static int safexcel_ahash_send(struct crypto_async_request *async, int ring,
-			       struct safexcel_request *request, int *commands,
-			       int *results)
+static int safexcel_ahash_send_req(struct crypto_async_request *async, int ring,
+				   struct safexcel_request *request,
+				   int *commands, int *results)
 {
 	struct ahash_request *areq = ahash_request_cast(async);
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(areq);
@@ -292,7 +293,6 @@ send_command:
 
 	req->processed += len;
 	request->req = &areq->base;
-	ctx->base.handle_result = safexcel_handle_result;
 
 	*commands = n_cdesc;
 	*results = 1;
@@ -374,8 +374,6 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 
 	ring = safexcel_select_ring(priv);
 	ctx->base.ring = ring;
-	ctx->base.needs_inv = false;
-	ctx->base.send = safexcel_ahash_send;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	enq_ret = crypto_enqueue_request(&priv->ring[ring].queue, async);
@@ -392,6 +390,26 @@ static int safexcel_handle_inv_result(struct safexcel_crypto_priv *priv,
 	return 1;
 }
 
+static int safexcel_handle_result(struct safexcel_crypto_priv *priv, int ring,
+				  struct crypto_async_request *async,
+				  bool *should_complete, int *ret)
+{
+	struct ahash_request *areq = ahash_request_cast(async);
+	struct safexcel_ahash_req *req = ahash_request_ctx(areq);
+	int err;
+
+	if (req->needs_inv) {
+		req->needs_inv = false;
+		err = safexcel_handle_inv_result(priv, ring, async,
+						 should_complete, ret);
+	} else {
+		err = safexcel_handle_req_result(priv, ring, async,
+						 should_complete, ret);
+	}
+
+	return err;
+}
+
 static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 				   int ring, struct safexcel_request *request,
 				   int *commands, int *results)
@@ -400,7 +418,6 @@ static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 	struct safexcel_ahash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(areq));
 	int ret;
 
-	ctx->base.handle_result = safexcel_handle_inv_result;
 	ret = safexcel_invalidate_cache(async, &ctx->base, ctx->priv,
 					ctx->base.ctxr_dma, ring, request);
 	if (unlikely(ret))
@@ -412,11 +429,29 @@ static int safexcel_ahash_send_inv(struct crypto_async_request *async,
 	return 0;
 }
 
+static int safexcel_ahash_send(struct crypto_async_request *async,
+			       int ring, struct safexcel_request *request,
+			       int *commands, int *results)
+{
+	struct ahash_request *areq = ahash_request_cast(async);
+	struct safexcel_ahash_req *req = ahash_request_ctx(areq);
+	int ret;
+
+	if (req->needs_inv)
+		ret = safexcel_ahash_send_inv(async, ring, request,
+					      commands, results);
+	else
+		ret = safexcel_ahash_send_req(async, ring, request,
+					      commands, results);
+	return ret;
+}
+
 static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	struct ahash_request req;
+	struct safexcel_ahash_req *rctx = ahash_request_ctx(&req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
@@ -430,7 +465,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 	ahash_request_set_tfm(&req, __crypto_ahash_cast(tfm));
 	ctx = crypto_tfm_ctx(req.base.tfm);
 	ctx->base.exit_inv = true;
-	ctx->base.send = safexcel_ahash_send_inv;
+	rctx->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
 	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
@@ -481,14 +516,16 @@ static int safexcel_ahash_enqueue(struct ahash_request *areq)
 	struct safexcel_crypto_priv *priv = ctx->priv;
 	int ret, ring;
 
-	ctx->base.send = safexcel_ahash_send;
+	req->needs_inv = false;
 
 	if (req->processed && ctx->digest == CONTEXT_CONTROL_DIGEST_PRECOMPUTED)
 		ctx->base.needs_inv = safexcel_ahash_needs_inv_get(areq);
 
 	if (ctx->base.ctxr) {
-		if (ctx->base.needs_inv)
-			ctx->base.send = safexcel_ahash_send_inv;
+		if (ctx->base.needs_inv) {
+			ctx->base.needs_inv = false;
+			req->needs_inv = true;
+		}
 	} else {
 		ctx->base.ring = safexcel_select_ring(priv);
 		ctx->base.ctxr = dma_pool_zalloc(priv->context_pool,
@@ -622,6 +659,8 @@ static int safexcel_ahash_cra_init(struct crypto_tfm *tfm)
 			     struct safexcel_alg_template, alg.ahash);
 
 	ctx->priv = tmpl->priv;
+	ctx->base.send = safexcel_ahash_send;
+	ctx->base.handle_result = safexcel_handle_result;
 
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct safexcel_ahash_req));

From 0a02dcca126280595950f3ea809f77c9cb0a235c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:56 +0100
Subject: [PATCH 393/808] crypto: inside-secure - free requests even if their
 handling failed

This patch frees the request private data even if its handling failed,
as it would never be freed otherwise.

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Suggested-by: Ofer Heifetz <oferh@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c
index 89ba9e85c0f3..4bcef78a08aa 100644
--- a/drivers/crypto/inside-secure/safexcel.c
+++ b/drivers/crypto/inside-secure/safexcel.c
@@ -607,6 +607,7 @@ static inline void safexcel_handle_result_descriptor(struct safexcel_crypto_priv
 		ndesc = ctx->handle_result(priv, ring, sreq->req,
 					   &should_complete, &ret);
 		if (ndesc < 0) {
+			kfree(sreq);
 			dev_err(priv->dev, "failed to handle result (%d)", ndesc);
 			return;
 		}

From 7cad2fabd5691dbb17762877d4e7f236fe4bc181 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:57 +0100
Subject: [PATCH 394/808] crypto: inside-secure - fix request allocations in
 invalidation path

This patch makes use of the SKCIPHER_REQUEST_ON_STACK and
AHASH_REQUEST_ON_STACK helpers to allocate enough memory to contain both
the crypto request structures and their embedded context (__ctx).

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Suggested-by: Ofer Heifetz <oferh@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel_cipher.c | 16 ++++++++--------
 drivers/crypto/inside-secure/safexcel_hash.c   | 14 +++++++-------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 9ea24868d860..fcc0a606d748 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -422,25 +422,25 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
-	struct skcipher_request req;
-	struct safexcel_cipher_req *sreq = skcipher_request_ctx(&req);
+	SKCIPHER_REQUEST_ON_STACK(req, __crypto_skcipher_cast(tfm));
+	struct safexcel_cipher_req *sreq = skcipher_request_ctx(req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
-	memset(&req, 0, sizeof(struct skcipher_request));
+	memset(req, 0, sizeof(struct skcipher_request));
 
 	/* create invalidation request */
 	init_completion(&result.completion);
-	skcipher_request_set_callback(&req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-					safexcel_inv_complete, &result);
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      safexcel_inv_complete, &result);
 
-	skcipher_request_set_tfm(&req, __crypto_skcipher_cast(tfm));
-	ctx = crypto_tfm_ctx(req.base.tfm);
+	skcipher_request_set_tfm(req, __crypto_skcipher_cast(tfm));
+	ctx = crypto_tfm_ctx(req->base.tfm);
 	ctx->base.exit_inv = true;
 	sreq->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
-	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
+	crypto_enqueue_request(&priv->ring[ring].queue, &req->base);
 	spin_unlock_bh(&priv->ring[ring].queue_lock);
 
 	if (!priv->ring[ring].need_dequeue)
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 79fe149804d3..55ff8a340b11 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -450,25 +450,25 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm)
 {
 	struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct safexcel_crypto_priv *priv = ctx->priv;
-	struct ahash_request req;
-	struct safexcel_ahash_req *rctx = ahash_request_ctx(&req);
+	AHASH_REQUEST_ON_STACK(req, __crypto_ahash_cast(tfm));
+	struct safexcel_ahash_req *rctx = ahash_request_ctx(req);
 	struct safexcel_inv_result result = {};
 	int ring = ctx->base.ring;
 
-	memset(&req, 0, sizeof(struct ahash_request));
+	memset(req, 0, sizeof(struct ahash_request));
 
 	/* create invalidation request */
 	init_completion(&result.completion);
-	ahash_request_set_callback(&req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   safexcel_inv_complete, &result);
 
-	ahash_request_set_tfm(&req, __crypto_ahash_cast(tfm));
-	ctx = crypto_tfm_ctx(req.base.tfm);
+	ahash_request_set_tfm(req, __crypto_ahash_cast(tfm));
+	ctx = crypto_tfm_ctx(req->base.tfm);
 	ctx->base.exit_inv = true;
 	rctx->needs_inv = true;
 
 	spin_lock_bh(&priv->ring[ring].queue_lock);
-	crypto_enqueue_request(&priv->ring[ring].queue, &req.base);
+	crypto_enqueue_request(&priv->ring[ring].queue, &req->base);
 	spin_unlock_bh(&priv->ring[ring].queue_lock);
 
 	if (!priv->ring[ring].need_dequeue)

From 2973633e9f09311e849f975d969737af81a521ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20T=C3=A9nart?= <antoine.tenart@free-electrons.com>
Date: Mon, 11 Dec 2017 12:10:58 +0100
Subject: [PATCH 395/808] crypto: inside-secure - do not use areq->result for
 partial results

This patches update the SafeXcel driver to stop using the crypto
ahash_request result field for partial results (i.e. on updates).
Instead the driver local safexcel_ahash_req state field is used, and
only on final operations the ahash_request result buffer is updated.

Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver")
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel_hash.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 55ff8a340b11..0c5a5820b06e 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -35,7 +35,7 @@ struct safexcel_ahash_req {
 	bool needs_inv;
 
 	u8 state_sz;    /* expected sate size, only set once */
-	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)];
+	u32 state[SHA256_DIGEST_SIZE / sizeof(u32)] __aligned(sizeof(u32));
 
 	u64 len;
 	u64 processed;
@@ -128,7 +128,7 @@ static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int rin
 	struct ahash_request *areq = ahash_request_cast(async);
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(areq);
 	struct safexcel_ahash_req *sreq = ahash_request_ctx(areq);
-	int cache_len, result_sz = sreq->state_sz;
+	int cache_len;
 
 	*ret = 0;
 
@@ -149,8 +149,8 @@ static int safexcel_handle_req_result(struct safexcel_crypto_priv *priv, int rin
 	spin_unlock_bh(&priv->ring[ring].egress_lock);
 
 	if (sreq->finish)
-		result_sz = crypto_ahash_digestsize(ahash);
-	memcpy(sreq->state, areq->result, result_sz);
+		memcpy(areq->result, sreq->state,
+		       crypto_ahash_digestsize(ahash));
 
 	dma_unmap_sg(priv->dev, areq->src,
 		     sg_nents_for_len(areq->src, areq->nbytes), DMA_TO_DEVICE);
@@ -274,7 +274,7 @@ send_command:
 	/* Add the token */
 	safexcel_hash_token(first_cdesc, len, req->state_sz);
 
-	ctx->base.result_dma = dma_map_single(priv->dev, areq->result,
+	ctx->base.result_dma = dma_map_single(priv->dev, req->state,
 					      req->state_sz, DMA_FROM_DEVICE);
 	if (dma_mapping_error(priv->dev, ctx->base.result_dma)) {
 		ret = -EINVAL;

From 322f74ede933b3e2cb78768b6a6fdbfbf478a0c1 Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Fri, 22 Dec 2017 11:17:44 +0800
Subject: [PATCH 396/808] ALSA: hda - Add MIC_NO_PRESENCE fixup for 2 HP
 machines

There is a headset jack on the front panel, when we plug a headset
into it, the headset mic can't trigger unsol events, and
read_pin_sense() can't detect its presence too. So add this fixup
to fix this issue.

Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_conexant.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index a81aacf684b2..37e1cf8218ff 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -271,6 +271,8 @@ enum {
 	CXT_FIXUP_HP_SPECTRE,
 	CXT_FIXUP_HP_GATE_MIC,
 	CXT_FIXUP_MUTE_LED_GPIO,
+	CXT_FIXUP_HEADSET_MIC,
+	CXT_FIXUP_HP_MIC_NO_PRESENCE,
 };
 
 /* for hda_fixup_thinkpad_acpi() */
@@ -350,6 +352,18 @@ static void cxt_fixup_headphone_mic(struct hda_codec *codec,
 	}
 }
 
+static void cxt_fixup_headset_mic(struct hda_codec *codec,
+				    const struct hda_fixup *fix, int action)
+{
+	struct conexant_spec *spec = codec->spec;
+
+	switch (action) {
+	case HDA_FIXUP_ACT_PRE_PROBE:
+		spec->parse_flags |= HDA_PINCFG_HEADSET_MIC;
+		break;
+	}
+}
+
 /* OPLC XO 1.5 fixup */
 
 /* OLPC XO-1.5 supports DC input mode (e.g. for use with analog sensors)
@@ -880,6 +894,19 @@ static const struct hda_fixup cxt_fixups[] = {
 		.type = HDA_FIXUP_FUNC,
 		.v.func = cxt_fixup_mute_led_gpio,
 	},
+	[CXT_FIXUP_HEADSET_MIC] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = cxt_fixup_headset_mic,
+	},
+	[CXT_FIXUP_HP_MIC_NO_PRESENCE] = {
+		.type = HDA_FIXUP_PINS,
+		.v.pins = (const struct hda_pintbl[]) {
+			{ 0x1a, 0x02a1113c },
+			{ }
+		},
+		.chained = true,
+		.chain_id = CXT_FIXUP_HEADSET_MIC,
+	},
 };
 
 static const struct snd_pci_quirk cxt5045_fixups[] = {
@@ -934,6 +961,8 @@ static const struct snd_pci_quirk cxt5066_fixups[] = {
 	SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC),
 	SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO),
 	SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO),
+	SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE),
+	SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
 	SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
 	SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),

From 285d5ddcffafa5d5e68c586f4c9eaa8b24a2897d Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Fri, 22 Dec 2017 11:17:45 +0800
Subject: [PATCH 397/808] ALSA: hda - fix headset mic detection issue on a Dell
 machine

It has the codec alc256, and add its pin definition to pin quirk
table to let it apply ALC255_FIXUP_DELL1_MIC_NO_PRESENCE.

Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 6a4db00511ab..682858548b9b 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6585,6 +6585,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
 	SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
 		{0x1b, 0x01011020},
 		{0x21, 0x02211010}),
+	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+		{0x12, 0x90a60130},
+		{0x14, 0x90170110},
+		{0x1b, 0x01011020},
+		{0x21, 0x0221101f}),
 	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
 		{0x12, 0x90a60160},
 		{0x14, 0x90170120},

From 8da5bbfc7cbba909f4f32d5e1dda3750baa5d853 Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Fri, 22 Dec 2017 11:17:46 +0800
Subject: [PATCH 398/808] ALSA: hda - change the location for one mic on a
 Lenovo machine

There are two front mics on this machine, and current driver assign
the same name Mic to both of them, but pulseaudio can't handle them.
As a workaround, we change the location for one of them, then the
driver will assign "Front Mic" and "Mic" for them.

Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 682858548b9b..1522ba31e16d 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6328,6 +6328,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
+	SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
 	SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
 	SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),

From a36c2638380c0a4676647a1f553b70b20d3ebce1 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 22 Dec 2017 10:45:07 +0100
Subject: [PATCH 399/808] ALSA: hda: Drop useless WARN_ON()

Since the commit 97cc2ed27e5a ("ALSA: hda - Fix yet another i915
pointer leftover in error path") cleared hdac_acomp pointer, the
WARN_ON() non-NULL check in snd_hdac_i915_register_notifier() may give
a false-positive warning, as the function gets called no matter
whether the component is registered or not.  For fixing it, let's get
rid of the spurious WARN_ON().

Fixes: 97cc2ed27e5a ("ALSA: hda - Fix yet another i915 pointer leftover in error path")
Cc: <stable@vger.kernel.org>
Reported-by: Kouta Okamoto <kouta.okamoto@toshiba.co.jp>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/hda/hdac_i915.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
index 038a180d3f81..cbe818eda336 100644
--- a/sound/hda/hdac_i915.c
+++ b/sound/hda/hdac_i915.c
@@ -325,7 +325,7 @@ static int hdac_component_master_match(struct device *dev, void *data)
  */
 int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops)
 {
-	if (WARN_ON(!hdac_acomp))
+	if (!hdac_acomp)
 		return -ENODEV;
 
 	hdac_acomp->audio_ops = aops;

From 32aa144fc32abfcbf7140f473dfbd94c5b9b4105 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 15 Dec 2017 13:14:31 +0100
Subject: [PATCH 400/808] KVM: s390: fix cmma migration for multiple memory
 slots

When multiple memory slots are present the cmma migration code
does not allocate enough memory for the bitmap. The memory slots
are sorted in reverse order, so we must use gfn and size of
slot[0] instead of the last one.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org # 4.13+
Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode)
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index efa439f6ffb3..abcd24fdde3f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -792,11 +792,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
 
 	if (kvm->arch.use_cmma) {
 		/*
-		 * Get the last slot. They should be sorted by base_gfn, so the
-		 * last slot is also the one at the end of the address space.
-		 * We have verified above that at least one slot is present.
+		 * Get the first slot. They are reverse sorted by base_gfn, so
+		 * the first slot is also the one at the end of the address
+		 * space. We have verified above that at least one slot is
+		 * present.
 		 */
-		ms = slots->memslots + slots->used_slots - 1;
+		ms = slots->memslots;
 		/* round up so we only use full longs */
 		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
 		/* allocate enough bytes to store all the bits */

From c2cf265d860882b51a200e4a7553c17827f2b730 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 21 Dec 2017 09:18:22 +0100
Subject: [PATCH 401/808] KVM: s390: prevent buffer overrun on memory hotplug
 during migration

We must not go beyond the pre-allocated buffer. This can happen when
a new memory slot is added during migration.

Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: stable@vger.kernel.org # 4.13+
Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode)
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/kvm/priv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 572496c688cc..0714bfa56da0 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1006,7 +1006,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
 		cbrlo[entries] = gfn << PAGE_SHIFT;
 	}
 
-	if (orc) {
+	if (orc && gfn < ms->bitmap_size) {
 		/* increment only if we are really flipping the bit to 1 */
 		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
 			atomic64_inc(&ms->dirty_pages);

From 8bb65fc06c08f027980a917648e1cf6e4d51c5ad Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Wed, 6 Dec 2017 11:37:45 -0600
Subject: [PATCH 402/808] gpio: gpio-reg: fix build

Revert changes introduced by commit f0fbe7bce733 ("gpio: Move irqdomain
into struct gpio_irq_chip") as they are not aplicable to this driver.

Reported-by: Russell King - ARM Linux <linux@armlinux.org.uk>
Fixes: f0fbe7bce733 ("gpio: Move irqdomain into struct gpio_irq_chip")
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-reg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c
index 23e771dba4c1..e85903eddc68 100644
--- a/drivers/gpio/gpio-reg.c
+++ b/drivers/gpio/gpio-reg.c
@@ -103,8 +103,8 @@ static int gpio_reg_to_irq(struct gpio_chip *gc, unsigned offset)
 	struct gpio_reg *r = to_gpio_reg(gc);
 	int irq = r->irqs[offset];
 
-	if (irq >= 0 && r->irq.domain)
-		irq = irq_find_mapping(r->irq.domain, irq);
+	if (irq >= 0 && r->irqdomain)
+		irq = irq_find_mapping(r->irqdomain, irq);
 
 	return irq;
 }

From 822703354774ec935169cbbc8d503236bcb54fda Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 15 Dec 2017 15:02:33 +0100
Subject: [PATCH 403/808] gpio: fix "gpio-line-names" property retrieval

Following commit 9427ecbed46cc ("gpio: Rework of_gpiochip_set_names()
to use device property accessors"), "gpio-line-names" DT property is
not retrieved anymore when chip->parent is not set by the driver.
This is due to OF based property reads having been replaced by device
based property reads.

This patch fixes that by making use of
fwnode_property_read_string_array() instead of
device_property_read_string_array() and handing over either
of_fwnode_handle(chip->of_node) or dev_fwnode(chip->parent)
to that function.

Fixes: 9427ecbed46cc ("gpio: Rework of_gpiochip_set_names() to use device property accessors")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-acpi.c    |  2 +-
 drivers/gpio/gpiolib-devprop.c | 17 +++++++----------
 drivers/gpio/gpiolib-of.c      |  3 ++-
 drivers/gpio/gpiolib.h         |  3 ++-
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index eb4528c87c0b..d6f3d9ee1350 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
 	}
 
 	if (!chip->names)
-		devprop_gpiochip_set_names(chip);
+		devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
 
 	acpi_gpiochip_request_regions(acpi_gpio);
 	acpi_gpiochip_scan_gpios(acpi_gpio);
diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c
index 27f383bda7d9..f748aa3e77f7 100644
--- a/drivers/gpio/gpiolib-devprop.c
+++ b/drivers/gpio/gpiolib-devprop.c
@@ -19,30 +19,27 @@
 /**
  * devprop_gpiochip_set_names - Set GPIO line names using device properties
  * @chip: GPIO chip whose lines should be named, if possible
+ * @fwnode: Property Node containing the gpio-line-names property
  *
  * Looks for device property "gpio-line-names" and if it exists assigns
  * GPIO line names for the chip. The memory allocated for the assigned
  * names belong to the underlying firmware node and should not be released
  * by the caller.
  */
-void devprop_gpiochip_set_names(struct gpio_chip *chip)
+void devprop_gpiochip_set_names(struct gpio_chip *chip,
+				const struct fwnode_handle *fwnode)
 {
 	struct gpio_device *gdev = chip->gpiodev;
 	const char **names;
 	int ret, i;
 
-	if (!chip->parent) {
-		dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
-		return;
-	}
-
-	ret = device_property_read_string_array(chip->parent, "gpio-line-names",
+	ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
 						NULL, 0);
 	if (ret < 0)
 		return;
 
 	if (ret != gdev->ngpio) {
-		dev_warn(chip->parent,
+		dev_warn(&gdev->dev,
 			 "names %d do not match number of GPIOs %d\n", ret,
 			 gdev->ngpio);
 		return;
@@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
 	if (!names)
 		return;
 
-	ret = device_property_read_string_array(chip->parent, "gpio-line-names",
+	ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
 						names, gdev->ngpio);
 	if (ret < 0) {
-		dev_warn(chip->parent, "failed to read GPIO line names\n");
+		dev_warn(&gdev->dev, "failed to read GPIO line names\n");
 		kfree(names);
 		return;
 	}
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index e0d59e61b52f..72a0695d2ac3 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
 
 	/* If the chip defines names itself, these take precedence */
 	if (!chip->names)
-		devprop_gpiochip_set_names(chip);
+		devprop_gpiochip_set_names(chip,
+					   of_fwnode_handle(chip->of_node));
 
 	of_node_get(chip->of_node);
 
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index af48322839c3..6c44d1652139 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -228,7 +228,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
 	return desc - &desc->gdev->descs[0];
 }
 
-void devprop_gpiochip_set_names(struct gpio_chip *chip);
+void devprop_gpiochip_set_names(struct gpio_chip *chip,
+				const struct fwnode_handle *fwnode);
 
 /* With descriptor prefix */
 

From 4c009af473b2026caaa26107e34d7cc68dad7756 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
Date: Fri, 22 Dec 2017 08:47:20 -0800
Subject: [PATCH 404/808] IB/hfi: Only read capability registers if the
 capability exists

During driver init, various registers are saved to allow restoration
after an FLR or gen3 bump.  Some of these registers are not available
in some circumstances (i.e. Virtual machines).

This bug makes the driver unusable when the PCI device is passed into
a VM, it fails during probe.

Delete unnecessary register read/write, and only access register if
the capability exists.

Cc: <stable@vger.kernel.org> # 4.14.x
Fixes: a618b7e40af2 ("IB/hfi1: Move saving PCI values to a separate function")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/hfi.h  |  1 -
 drivers/infiniband/hw/hfi1/pcie.c | 30 ++++++++++++------------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 4a9b4d7efe63..8ce9118d4a7f 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1131,7 +1131,6 @@ struct hfi1_devdata {
 	u16 pcie_lnkctl;
 	u16 pcie_devctl2;
 	u32 pci_msix0;
-	u32 pci_lnkctl3;
 	u32 pci_tph2;
 
 	/*
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 09e50fd2a08f..8c7e7a60b715 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
 	if (ret)
 		goto error;
 
-	ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
-				     dd->pci_lnkctl3);
-	if (ret)
-		goto error;
-
-	ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
-	if (ret)
-		goto error;
-
+	if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
+		ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
+					     dd->pci_tph2);
+		if (ret)
+			goto error;
+	}
 	return 0;
 
 error:
@@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
 	if (ret)
 		goto error;
 
-	ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
-				    &dd->pci_lnkctl3);
-	if (ret)
-		goto error;
-
-	ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
-	if (ret)
-		goto error;
-
+	if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
+		ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
+					    &dd->pci_tph2);
+		if (ret)
+			goto error;
+	}
 	return 0;
 
 error:

From 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Dec 2017 18:02:34 +0100
Subject: [PATCH 405/808] x86/Kconfig: Limit NR_CPUS on 32-bit to a sane amount

The recent cpu_entry_area changes fail to compile on 32-bit when BIGSMP=y
and NR_CPUS=512, because the fixmap area becomes too big.

Limit the number of CPUs with BIGSMP to 64, which is already way to big for
32-bit, but it's at least a working limitation.

We performed a quick survey of 32-bit-only machines that might be affected
by this change negatively, but found none.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 665eba1b6103..cd5199de231e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -925,7 +925,8 @@ config MAXSMP
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
 	range 2 8 if SMP && X86_32 && !X86_BIGSMP
-	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+	range 2 64 if SMP && X86_32 && X86_BIGSMP
+	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
 	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 	default "1" if !SMP
 	default "8192" if MAXSMP

From c05344947b37f7cda726e802457370bc6eac4d26 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Dec 2017 01:14:39 +0100
Subject: [PATCH 406/808] x86/mm/dump_pagetables: Check PAGE_PRESENT for real

The check for a present page in printk_prot():

       if (!pgprot_val(prot)) {
                /* Not present */

is bogus. If a PTE is set to PAGE_NONE then the pgprot_val is not zero and
the entry is decoded in bogus ways, e.g. as RX GLB. That is confusing when
analyzing mapping correctness. Check for the present bit to make an
informed decision.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/dump_pagetables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e3ac6fe6c9e..1014cfb21c2c 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -140,7 +140,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 	static const char * const level_name[] =
 		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
-	if (!pgprot_val(prot)) {
+	if (!(pr & _PAGE_PRESENT)) {
 		/* Not present */
 		pt_dump_cont_printf(m, dmsg, "                              ");
 	} else {

From 146122e24bdf208015d629babba673e28d090709 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Dec 2017 18:07:42 +0100
Subject: [PATCH 407/808] x86/mm/dump_pagetables: Make the address hints
 correct and readable

The address hints are a trainwreck. The array entry numbers have to kept
magically in sync with the actual hints, which is doomed as some of the
array members are initialized at runtime via the entry numbers.

Designated initializers have been around before this code was
implemented....

Use the entry numbers to populate the address hints array and add the
missing bits and pieces. Split 32 and 64 bit for readability sake.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/dump_pagetables.c | 90 +++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 37 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 1014cfb21c2c..fdf09d8f98da 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -44,10 +44,12 @@ struct addr_marker {
 	unsigned long max_lines;
 };
 
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space markers hints */
+
+#ifdef CONFIG_X86_64
+
 enum address_markers_idx {
 	USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
 	VMALLOC_START_NR,
@@ -56,56 +58,70 @@ enum address_markers_idx {
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
-# ifdef CONFIG_X86_ESPFIX64
+#ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+	EFI_END_NR,
+#endif
 	HIGH_KERNEL_NR,
 	MODULES_VADDR_NR,
 	MODULES_END_NR,
-#else
+	FIXADDR_START_NR,
+	END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+	[USER_SPACE_NR]		= { 0,			"User Space" },
+	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
+	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
+	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
+	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
+#ifdef CONFIG_KASAN
+	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
+	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_X86_ESPFIX64
+	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
+#endif
+	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
+	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
+	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
+	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
+	[END_OF_SPACE_NR]	= { -1,			NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	VMALLOC_START_NR,
 	VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
 	PKMAP_BASE_NR,
-# endif
-	FIXADDR_START_NR,
 #endif
+	FIXADDR_START_NR,
+	END_OF_SPACE_NR,
 };
 
-/* Address space markers hints */
 static struct addr_marker address_markers[] = {
-	{ 0, "User Space" },
-#ifdef CONFIG_X86_64
-	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
-	{ 0/* VMALLOC_START */, "vmalloc() Area" },
-	{ 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
-	{ KASAN_SHADOW_START,	"KASAN shadow" },
-	{ KASAN_SHADOW_END,	"KASAN shadow end" },
+	[USER_SPACE_NR]		= { 0,			"User Space" },
+	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
+	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
+	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
 #endif
-# ifdef CONFIG_X86_ESPFIX64
-	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
-	{ EFI_VA_END,		"EFI Runtime Services" },
-# endif
-	{ __START_KERNEL_map,   "High Kernel Mapping" },
-	{ MODULES_VADDR,        "Modules" },
-	{ MODULES_END,          "End Modules" },
-#else
-	{ PAGE_OFFSET,          "Kernel Mapping" },
-	{ 0/* VMALLOC_START */, "vmalloc() Area" },
-	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
-	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
-# endif
-	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
-#endif
-	{ -1, NULL }		/* End of list */
+	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
+	[END_OF_SPACE_NR]	= { -1,			NULL }
 };
 
+#endif /* !CONFIG_X86_64 */
+
 /* Multipliers for offsets within the PTEs */
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)

From 49275fef986abfb8b476e4708aaecc07e7d3e087 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 10 Dec 2017 22:47:19 -0800
Subject: [PATCH 408/808] x86/vsyscall/64: Explicitly set _PAGE_USER in the
 pagetable hierarchy

The kernel is very erratic as to which pagetables have _PAGE_USER set.  The
vsyscall page gets lucky: it seems that all of the relevant pagetables are
among the apparently arbitrary ones that set _PAGE_USER.  Rather than
relying on chance, just explicitly set _PAGE_USER.

This will let us clean up pagetable setup to stop setting _PAGE_USER.  The
added code can also be reused by pagetable isolation to manage the
_PAGE_USER bit in the usermode tables.

[ tglx: Folded paravirt fix from Juergen Gross ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vsyscall/vsyscall_64.c | 34 ++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index f279ba2643dc..daad57c76e42 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -37,6 +37,7 @@
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
 #include <asm/traps.h>
+#include <asm/paravirt.h>
 
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
@@ -329,16 +330,47 @@ int in_gate_area_no_mm(unsigned long addr)
 	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 }
 
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present.  If so, we skip calling
+ * this.
+ */
+static void __init set_vsyscall_pgtable_user_bits(void)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+	p4d->p4d |= _PAGE_USER;
+#endif
+	pud = pud_offset(p4d, VSYSCALL_ADDR);
+	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+	pmd = pmd_offset(pud, VSYSCALL_ADDR);
+	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
 void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 
-	if (vsyscall_mode != NONE)
+	if (vsyscall_mode != NONE) {
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
+		set_vsyscall_pgtable_user_bits();
+	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 		     (unsigned long)VSYSCALL_ADDR);

From 4831b779403a836158917d59a7ca880483c67378 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 10 Dec 2017 22:47:20 -0800
Subject: [PATCH 409/808] x86/vsyscall/64: Warn and fail vsyscall emulation in
 NATIVE mode

If something goes wrong with pagetable setup, vsyscall=native will
accidentally fall back to emulation.  Make it warn and fail so that we
notice.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index daad57c76e42..1faf40f2dda9 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -139,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	WARN_ON_ONCE(address != regs->ip);
 
+	/* This should be unreachable in NATIVE mode. */
+	if (WARN_ON(vsyscall_mode == NATIVE))
+		return false;
+
 	if (vsyscall_mode == NONE) {
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall attempted with vsyscall=none");

From c10e83f598d08046dd1ebc8360d4bb12d802d51b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Dec 2017 12:27:29 +0100
Subject: [PATCH 410/808] arch, mm: Allow arch_dup_mmap() to fail

In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be
allowed to fail. Fix up all instances.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: dan.j.williams@intel.com
Cc: hughd@google.com
Cc: keescook@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/include/asm/mmu_context.h   | 5 +++--
 arch/um/include/asm/mmu_context.h        | 3 ++-
 arch/unicore32/include/asm/mmu_context.h | 5 +++--
 arch/x86/include/asm/mmu_context.h       | 4 ++--
 include/asm-generic/mm_hooks.h           | 5 +++--
 kernel/fork.c                            | 3 +--
 6 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 492d8140a395..44fdf4786638 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -114,9 +114,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index b668e351fd6c..fca34b2177e2 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
 /*
  * Needed since we do not use the asm-generic/mm_hooks.h:
  */
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	uml_setup_stubs(mm);
+	return 0;
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 59b06b48f27d..5c205a9cb5a6 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -81,9 +81,10 @@ do { \
 	} \
 } while (0)
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }
 
 static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 6d16d15d09a0..c76162439c8a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -176,10 +176,10 @@ do {						\
 } while (0)
 #endif
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	paravirt_arch_dup_mmap(oldmm, mm);
+	return 0;
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
index ea189d88a3cc..8ac4e68a12f0 100644
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -7,9 +7,10 @@
 #ifndef _ASM_GENERIC_MM_HOOKS_H
 #define _ASM_GENERIC_MM_HOOKS_H
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..500ce64517d9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			goto out;
 	}
 	/* a new mm has just been created */
-	arch_dup_mmap(oldmm, mm);
-	retval = 0;
+	retval = arch_dup_mmap(oldmm, mm);
 out:
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);

From c2b3496bb30bd159e9de42e5c952e1f1f33c9a77 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 14 Dec 2017 12:27:30 +0100
Subject: [PATCH 411/808] x86/ldt: Rework locking

The LDT is duplicated on fork() and on exec(), which is wrong as exec()
should start from a clean state, i.e. without LDT. To fix this the LDT
duplication code will be moved into arch_dup_mmap() which is only called
for fork().

This introduces a locking problem. arch_dup_mmap() holds mmap_sem of the
parent process, but the LDT duplication code needs to acquire
mm->context.lock to access the LDT data safely, which is the reverse lock
order of write_ldt() where mmap_sem nests into context.lock.

Solve this by introducing a new rw semaphore which serializes the
read/write_ldt() syscall operations and use context.lock to protect the
actual installment of the LDT descriptor.

So context.lock stabilizes mm->context.ldt and can nest inside of the new
semaphore or mmap_sem.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: dan.j.williams@intel.com
Cc: hughd@google.com
Cc: keescook@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu.h         |  4 +++-
 arch/x86/include/asm/mmu_context.h |  2 ++
 arch/x86/kernel/ldt.c              | 35 +++++++++++++++++++-----------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 9ea26f167497..5ff3e8af2c20 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_MMU_H
 
 #include <linux/spinlock.h>
+#include <linux/rwsem.h>
 #include <linux/mutex.h>
 #include <linux/atomic.h>
 
@@ -27,7 +28,8 @@ typedef struct {
 	atomic64_t tlb_gen;
 
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
-	struct ldt_struct *ldt;
+	struct rw_semaphore	ldt_usr_sem;
+	struct ldt_struct	*ldt;
 #endif
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c76162439c8a..4fdbe5efe535 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -132,6 +132,8 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+	mutex_init(&mm->context.lock);
+
 	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
 	atomic64_set(&mm->context.tlb_gen, 0);
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1c1eae961340..1600aebc1ec7 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -5,6 +5,11 @@
  * Copyright (C) 2002 Andi Kleen
  *
  * This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ *	contex.ldt_usr_sem
+ *	  mmap_sem
+ *	    context.lock
  */
 
 #include <linux/errno.h>
@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)
 #endif
 }
 
-/* context.lock is held for us, so we don't need any locking. */
+/* context.lock is held by the task which issued the smp function call */
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
 	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
 }
 
-/* context.lock is held */
-static void install_ldt(struct mm_struct *current_mm,
-			struct ldt_struct *ldt)
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
 {
-	/* Synchronizes with READ_ONCE in load_mm_ldt. */
-	smp_store_release(&current_mm->context.ldt, ldt);
+	mutex_lock(&mm->context.lock);
 
-	/* Activate the LDT for all CPUs using current_mm. */
-	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+	/* Synchronizes with READ_ONCE in load_mm_ldt. */
+	smp_store_release(&mm->context.ldt, ldt);
+
+	/* Activate the LDT for all CPUs using currents mm. */
+	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+	mutex_unlock(&mm->context.lock);
 }
 
 static void free_ldt_struct(struct ldt_struct *ldt)
@@ -133,7 +140,8 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
 	struct mm_struct *old_mm;
 	int retval = 0;
 
-	mutex_init(&mm->context.lock);
+	init_rwsem(&mm->context.ldt_usr_sem);
+
 	old_mm = current->mm;
 	if (!old_mm) {
 		mm->context.ldt = NULL;
@@ -180,7 +188,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
 	unsigned long entries_size;
 	int retval;
 
-	mutex_lock(&mm->context.lock);
+	down_read(&mm->context.ldt_usr_sem);
 
 	if (!mm->context.ldt) {
 		retval = 0;
@@ -209,7 +217,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
 	retval = bytecount;
 
 out_unlock:
-	mutex_unlock(&mm->context.lock);
+	up_read(&mm->context.ldt_usr_sem);
 	return retval;
 }
 
@@ -269,7 +277,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 			ldt.avl = 0;
 	}
 
-	mutex_lock(&mm->context.lock);
+	if (down_write_killable(&mm->context.ldt_usr_sem))
+		return -EINTR;
 
 	old_ldt       = mm->context.ldt;
 	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -291,7 +300,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	error = 0;
 
 out_unlock:
-	mutex_unlock(&mm->context.lock);
+	up_write(&mm->context.ldt_usr_sem);
 out:
 	return error;
 }

From a4828f81037f491b2cc986595e3a969a6eeb2fb5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Dec 2017 12:27:31 +0100
Subject: [PATCH 412/808] x86/ldt: Prevent LDT inheritance on exec

The LDT is inherited across fork() or exec(), but that makes no sense
at all because exec() is supposed to start the process clean.

The reason why this happens is that init_new_context_ldt() is called from
init_new_context() which obviously needs to be called for both fork() and
exec().

It would be surprising if anything relies on that behaviour, so it seems to
be safe to remove that misfeature.

Split the context initialization into two parts. Clear the LDT pointer and
initialize the mutex from the general context init and move the LDT
duplication to arch_dup_mmap() which is only called on fork().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: dan.j.williams@intel.com
Cc: hughd@google.com
Cc: keescook@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h    | 21 ++++++++++++++-------
 arch/x86/kernel/ldt.c                 | 18 +++++-------------
 tools/testing/selftests/x86/ldt_gdt.c |  9 +++------
 3 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4fdbe5efe535..5e25423bf9bb 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -57,11 +57,17 @@ struct ldt_struct {
 /*
  * Used for LDT copy/destruction.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+	mm->context.ldt = NULL;
+	init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
-				       struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
 {
 	return 0;
 }
@@ -137,15 +143,16 @@ static inline int init_new_context(struct task_struct *tsk,
 	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
 	atomic64_set(&mm->context.tlb_gen, 0);
 
-	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
 		/* pkey 0 is the default and always allocated */
 		mm->context.pkey_allocation_map = 0x1;
 		/* -1 means unallocated or invalid */
 		mm->context.execute_only_pkey = -1;
 	}
-	#endif
-	return init_new_context_ldt(tsk, mm);
+#endif
+	init_new_context_ldt(mm);
+	return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
 {
@@ -181,7 +188,7 @@ do {						\
 static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	paravirt_arch_dup_mmap(oldmm, mm);
-	return 0;
+	return ldt_dup_context(oldmm, mm);
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1600aebc1ec7..a6b5d62f45a7 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -131,28 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 }
 
 /*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 {
 	struct ldt_struct *new_ldt;
-	struct mm_struct *old_mm;
 	int retval = 0;
 
-	init_rwsem(&mm->context.ldt_usr_sem);
-
-	old_mm = current->mm;
-	if (!old_mm) {
-		mm->context.ldt = NULL;
+	if (!old_mm)
 		return 0;
-	}
 
 	mutex_lock(&old_mm->context.lock);
-	if (!old_mm->context.ldt) {
-		mm->context.ldt = NULL;
+	if (!old_mm->context.ldt)
 		goto out_unlock;
-	}
 
 	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
 	if (!new_ldt) {
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 66e5ce5b91f0..0304ffb714f2 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -627,13 +627,10 @@ static void do_multicpu_tests(void)
 static int finish_exec_test(void)
 {
 	/*
-	 * In a sensible world, this would be check_invalid_segment(0, 1);
-	 * For better or for worse, though, the LDT is inherited across exec.
-	 * We can probably change this safely, but for now we test it.
+	 * Older kernel versions did inherit the LDT on exec() which is
+	 * wrong because exec() starts from a clean state.
 	 */
-	check_valid_segment(0, 1,
-			    AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
-			    42, true);
+	check_invalid_segment(0, 1);
 
 	return nerrs ? 1 : 0;
 }

From 5a7ccf4754fb3660569a6de52ba7f7fc3dfaf280 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:43 -0800
Subject: [PATCH 413/808] x86/mm/64: Improve the memory map documentation

The old docs had the vsyscall range wrong and were missing the fixmap.
Fix both.

There used to be 8 MB reserved for future vsyscalls, but that's long gone.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 3448e675b462..83ca5a3b90ac 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -19,8 +19,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -41,8 +42,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support

From e8ffe96e5933d417195268478479933d56213a3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:54 +0100
Subject: [PATCH 414/808] x86/doc: Remove obvious weirdnesses from the x86 MM
 layout documentation

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 83ca5a3b90ac..63a41671d25b 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,6 +1,4 @@
 
-<previous description obsolete, deleted>
-
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -49,8 +47,9 @@ ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
 
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
@@ -60,9 +59,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.
 
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
@@ -74,5 +70,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004

From 4fe2d8b11a370af286287a2661de9d4e6c9a145a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 17:25:07 -0800
Subject: [PATCH 415/808] x86/entry: Rename SYSENTER_stack to
 CPU_ENTRY_AREA_entry_stack

If the kernel oopses while on the trampoline stack, it will print
"<SYSENTER>" even if SYSENTER is not involved.  That is rather confusing.

The "SYSENTER" stack is used for a lot more than SYSENTER now.  Give it a
better string to display in stack dumps, and rename the kernel code to
match.

Also move the 32-bit code over to the new naming even though it still uses
the entry stack only for SYSENTER.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S         | 12 ++++++------
 arch/x86/entry/entry_64.S         |  4 ++--
 arch/x86/include/asm/fixmap.h     |  8 ++++----
 arch/x86/include/asm/processor.h  |  6 +++---
 arch/x86/include/asm/stacktrace.h |  4 ++--
 arch/x86/kernel/asm-offsets.c     |  4 ++--
 arch/x86/kernel/asm-offsets_32.c  |  2 +-
 arch/x86/kernel/cpu/common.c      | 14 +++++++-------
 arch/x86/kernel/dumpstack.c       | 10 +++++-----
 arch/x86/kernel/dumpstack_32.c    |  6 +++---
 arch/x86/kernel/dumpstack_64.c    | 12 +++++++++---
 11 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bd8b57a5c874..ace8f321a5a1 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -942,9 +942,9 @@ ENTRY(debug)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack
 
 	TRACE_IRQS_OFF
@@ -986,9 +986,9 @@ ENTRY(nmi)
 
 	/* Are we currently on the SYSENTER stack? */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
-	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack
 
 	/* Not on SYSENTER stack. */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2812ce043a7a..87cebe78bbef 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -154,8 +154,8 @@ END(native_usergs_sysret64)
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \
-			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
+			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
 	UNWIND_HINT_EMPTY
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 94fc4fa14127..8153b8d86a3c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -56,10 +56,10 @@ struct cpu_entry_area {
 	char gdt[PAGE_SIZE];
 
 	/*
-	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+	 * The GDT is just below entry_stack and thus serves (on x86_64) as
 	 * a a read-only guard page.
 	 */
-	struct SYSENTER_stack_page SYSENTER_stack_page;
+	struct entry_stack_page entry_stack_page;
 
 	/*
 	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
@@ -250,9 +250,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
 	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
 }
 
-static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+static inline struct entry_stack *cpu_entry_stack(int cpu)
 {
-	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
 }
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index da943411d3d8..9e482d8b0b97 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -336,12 +336,12 @@ struct x86_hw_tss {
 #define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET	0x8000
 
-struct SYSENTER_stack {
+struct entry_stack {
 	unsigned long		words[64];
 };
 
-struct SYSENTER_stack_page {
-	struct SYSENTER_stack stack;
+struct entry_stack_page {
+	struct entry_stack stack;
 } __aligned(PAGE_SIZE);
 
 struct tss_struct {
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f8062bfd43a0..f73706878772 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -16,7 +16,7 @@ enum stack_type {
 	STACK_TYPE_TASK,
 	STACK_TYPE_IRQ,
 	STACK_TYPE_SOFTIRQ,
-	STACK_TYPE_SYSENTER,
+	STACK_TYPE_ENTRY,
 	STACK_TYPE_EXCEPTION,
 	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -29,7 +29,7 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info);
 
-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
 
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cd360a5e0dca..676b7cf4b62b 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -97,6 +97,6 @@ void common(void) {
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
-	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 7d20d9c0b3d6..fa1261eefa16 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -48,7 +48,7 @@ void foo(void)
 
 	/* Offset from the sysenter stack to tss.sp0 */
 	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
-	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
+	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 034900623adf..ed4acbce37a8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
-				   SYSENTER_stack_storage);
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
+				   entry_stack_storage);
 
 static void __init
 set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
@@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu)
 #endif
 
 	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
-				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
+				per_cpu_ptr(&entry_stack_storage, cpu), 1,
 				PAGE_KERNEL);
 
 	/*
@@ -1323,7 +1323,7 @@ void enable_sep_cpu(void)
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
+	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 	put_cpu();
@@ -1440,7 +1440,7 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1655,7 +1655,7 @@ void cpu_init(void)
 	 */
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
-	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index bbd6d986e2d0..1dd3f533d78c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 	return true;
 }
 
-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
 {
-	struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
 
 	void *begin = ss;
 	void *end = ss + 1;
@@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
 	if ((void *)stack < begin || (void *)stack >= end)
 		return false;
 
-	info->type	= STACK_TYPE_SYSENTER;
+	info->type	= STACK_TYPE_ENTRY;
 	info->begin	= begin;
 	info->end	= end;
 	info->next_sp	= NULL;
@@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - task stack
 	 * - interrupt stack
 	 * - HW exception stacks (double fault, nmi, debug, mce)
-	 * - SYSENTER stack
+	 * - entry stack
 	 *
 	 * x86-32 can have up to four stacks:
 	 * - task stack
 	 * - softirq stack
 	 * - hardirq stack
-	 * - SYSENTER stack
+	 * - entry stack
 	 */
 	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5ff13a6b3680..04170f63e3a1 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_SOFTIRQ)
 		return "SOFTIRQ";
 
-	if (type == STACK_TYPE_SYSENTER)
-		return "SYSENTER";
+	if (type == STACK_TYPE_ENTRY)
+		return "ENTRY_TRAMPOLINE";
 
 	return NULL;
 }
@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (task != current)
 		goto unknown;
 
-	if (in_sysenter_stack(stack, info))
+	if (in_entry_stack(stack, info))
 		goto recursion_check;
 
 	if (in_hardirq_stack(stack, info))
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index abc828f8c297..563e28d14f2c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
-	if (type == STACK_TYPE_SYSENTER)
-		return "SYSENTER";
+	if (type == STACK_TYPE_ENTRY) {
+		/*
+		 * On 64-bit, we have a generic entry stack that we
+		 * use for all the kernel entry points, including
+		 * SYSENTER.
+		 */
+		return "ENTRY_TRAMPOLINE";
+	}
 
 	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
 		return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_irq_stack(stack, info))
 		goto recursion_check;
 
-	if (in_sysenter_stack(stack, info))
+	if (in_entry_stack(stack, info))
 		goto recursion_check;
 
 	goto unknown;

From 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:50 +0100
Subject: [PATCH 416/808] x86/uv: Use the right TLB-flush API

Since uv_flush_tlb_others() implements flush_tlb_others() which is
about flushing user mappings, we should use __flush_tlb_single(),
which too is about flushing user mappings.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andrew Banman <abanman@hpe.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/uv/tlb_uv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index f44c0bc95aa2..8538a6723171 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
 		local_flush_tlb();
 		stat->d_alltlb++;
 	} else {
-		__flush_tlb_one(msg->address);
+		__flush_tlb_single(msg->address);
 		stat->d_onetlb++;
 	}
 	stat->d_requestee++;

From 23cb7d46f371844c004784ad9552a57446f73e5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:51 +0100
Subject: [PATCH 417/808] x86/microcode: Dont abuse the TLB-flush interface

Commit:

  ec400ddeff20 ("x86/microcode_intel_early.c: Early update ucode on Intel's CPU")

... grubbed into tlbflush internals without coherent explanation.

Since it says its a precaution and the SDM doesn't mention anything like
this, take it out back.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: fenghua.yu@intel.com
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h       | 19 ++++++-------------
 arch/x86/kernel/cpu/microcode/intel.c | 13 -------------
 2 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 509046cfa5ce..c2e45da4e540 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -246,20 +246,9 @@ static inline void __native_flush_tlb(void)
 	preempt_enable();
 }
 
-static inline void __native_flush_tlb_global_irq_disabled(void)
-{
-	unsigned long cr4;
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* clear PGE */
-	native_write_cr4(cr4 & ~X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
-}
-
 static inline void __native_flush_tlb_global(void)
 {
-	unsigned long flags;
+	unsigned long cr4, flags;
 
 	if (static_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
@@ -277,7 +266,11 @@ static inline void __native_flush_tlb_global(void)
 	 */
 	raw_local_irq_save(flags);
 
-	__native_flush_tlb_global_irq_disabled();
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	/* toggle PGE */
+	native_write_cr4(cr4 ^ X86_CR4_PGE);
+	/* write old PGE again and flush TLBs */
+	native_write_cr4(cr4);
 
 	raw_local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 7dbcb7adf797..8ccdca6d3f9e 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
 }
 #else
 
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
-	__native_flush_tlb_global_irq_disabled();
-}
-
 static inline void print_ucode(struct ucode_cpu_info *uci)
 {
 	struct microcode_intel *mc;
@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	if (rev != mc->hdr.rev)
 		return -1;
 
-#ifdef CONFIG_X86_64
-	/* Flush global tlb. This is precaution. */
-	flush_tlb_early();
-#endif
 	uci->cpu_sig.rev = rev;
 
 	if (early)

From a501686b2923ce6f2ff2b1d0d50682c6411baf72 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:49 +0100
Subject: [PATCH 418/808] x86/mm: Use __flush_tlb_one() for kernel memory

__flush_tlb_single() is for user mappings, __flush_tlb_one() for
kernel mappings.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3118392cdf75..0569987f6da6 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info)
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_single(addr);
+		__flush_tlb_one(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)

From b5fc6d943808b570bdfbec80f40c6b3855f1c48b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:46 +0100
Subject: [PATCH 419/808] x86/mm: Remove superfluous barriers

atomic64_inc_return() already implies smp_mb() before and after.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c2e45da4e540..3e2227386abe 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -60,19 +60,13 @@ static inline void invpcid_flush_all_nonglobals(void)
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
-	u64 new_tlb_gen;
-
 	/*
 	 * Bump the generation count.  This also serves as a full barrier
 	 * that synchronizes with switch_mm(): callers are required to order
 	 * their read of mm_cpumask after their writes to the paging
 	 * structures.
 	 */
-	smp_mb__before_atomic();
-	new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
-	smp_mb__after_atomic();
-
-	return new_tlb_gen;
+	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
 #ifdef CONFIG_PARAVIRT

From 3f67af51e56f291d7417d77c4f67cd774633c5e1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:52 +0100
Subject: [PATCH 420/808] x86/mm: Add comments to clarify which TLB-flush
 functions are supposed to flush what

Per popular request..

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 3e2227386abe..552d581c8f9f 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -228,6 +228,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 extern void initialize_tlbstate_and_flush(void);
 
+/*
+ * flush the entire current user mapping
+ */
 static inline void __native_flush_tlb(void)
 {
 	/*
@@ -240,6 +243,9 @@ static inline void __native_flush_tlb(void)
 	preempt_enable();
 }
 
+/*
+ * flush everything
+ */
 static inline void __native_flush_tlb_global(void)
 {
 	unsigned long cr4, flags;
@@ -269,17 +275,27 @@ static inline void __native_flush_tlb_global(void)
 	raw_local_irq_restore(flags);
 }
 
+/*
+ * flush one page in the user mapping
+ */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 }
 
+/*
+ * flush everything
+ */
 static inline void __flush_tlb_all(void)
 {
-	if (boot_cpu_has(X86_FEATURE_PGE))
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		__flush_tlb_global();
-	else
+	} else {
+		/*
+		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+		 */
 		__flush_tlb();
+	}
 
 	/*
 	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
@@ -290,6 +306,9 @@ static inline void __flush_tlb_all(void)
 	 */
 }
 
+/*
+ * flush one page in the kernel mapping
+ */
 static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);

From 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:54 +0100
Subject: [PATCH 421/808] x86/mm: Move the CR3 construction functions to
 tlbflush.h

For flushing the TLB, the ASID which has been programmed into the hardware
must be known.  That differs from what is in 'cpu_tlbstate'.

Add functions to transform the 'cpu_tlbstate' values into to the one
programmed into the hardware (CR3).

It's not easy to include mmu_context.h into tlbflush.h, so just move the
CR3 building over to tlbflush.h.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 29 +----------------------------
 arch/x86/include/asm/tlbflush.h    | 26 ++++++++++++++++++++++++++
 arch/x86/mm/tlb.c                  |  8 ++++----
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e25423bf9bb..5ede7cae1d67 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -290,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
-/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits.  This serves two purposes.  It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero.  It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
-	if (static_cpu_has(X86_FEATURE_PCID)) {
-		VM_WARN_ON_ONCE(asid > 4094);
-		return __sme_pa(mm->pgd) | (asid + 1);
-	} else {
-		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(mm->pgd);
-	}
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > 4094);
-	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
 /*
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
@@ -326,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
  */
 static inline unsigned long __get_current_cr3_fast(void)
 {
-	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
 		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
 	/* For now, be very restrictive about when this can be called. */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 552d581c8f9f..ee7925adfb57 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -69,6 +69,32 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits.
+ * This serves two purposes.  It prevents a nasty situation in which
+ * PCID-unaware code saves CR3, loads some other value (with PCID == 0),
+ * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved
+ * ASID was nonzero.  It also means that any bugs involving loading a
+ * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically.
+ */
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		VM_WARN_ON_ONCE(asid > 4094);
+		return __sme_pa(pgd) | (asid + 1);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > 4094);
+	return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0569987f6da6..0a1be3adc97e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next, new_asid));
+			write_cr3(build_cr3(next->pgd, new_asid));
 
 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next, new_asid));
+			write_cr3(build_cr3_noflush(next->pgd, new_asid));
 
 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(build_cr3(mm, 0));
+	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);

From cb0a9144a744e55207e24dcef812f05cd15a499a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:55 +0100
Subject: [PATCH 422/808] x86/mm: Remove hard-coded ASID limit checks

First, it's nice to remove the magic numbers.

Second, PAGE_TABLE_ISOLATION is going to consume half of the available ASID
space.  The space is currently unused, but add a comment to spell out this
new restriction.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index ee7925adfb57..f88ccd3ae466 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -69,6 +69,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS		12
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#define PTI_CONSUMED_ASID_BITS		0
+
+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
+ * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+
 /*
  * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits.
  * This serves two purposes.  It prevents a nasty situation in which
@@ -81,7 +97,7 @@ struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
 	if (static_cpu_has(X86_FEATURE_PCID)) {
-		VM_WARN_ON_ONCE(asid > 4094);
+		VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 		return __sme_pa(pgd) | (asid + 1);
 	} else {
 		VM_WARN_ON_ONCE(asid != 0);
@@ -91,7 +107,7 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 
 static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 {
-	VM_WARN_ON_ONCE(asid > 4094);
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 	return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH;
 }
 

From dd95f1a4b5ca904c78e6a097091eb21436478abb Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:56 +0100
Subject: [PATCH 423/808] x86/mm: Put MMU to hardware ASID translation in one
 place

There are effectively two ASID types:

 1. The one stored in the mmu_context that goes from 0..5
 2. The one programmed into the hardware that goes from 1..6

This consolidates the locations where converting between the two (by doing
a +1) to a single place which gives us a nice place to comment.
PAGE_TABLE_ISOLATION will also need to, given an ASID, know which hardware
ASID to flush for the userspace mapping.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index f88ccd3ae466..8b27daff7a7f 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -85,20 +85,26 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
  */
 #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
 
-/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits.
- * This serves two purposes.  It prevents a nasty situation in which
- * PCID-unaware code saves CR3, loads some other value (with PCID == 0),
- * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved
- * ASID was nonzero.  It also means that any bugs involving loading a
- * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically.
- */
+static inline u16 kern_pcid(u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+	/*
+	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
+	 * PCID bits.  This serves two purposes.  It prevents a nasty
+	 * situation in which PCID-unaware code saves CR3, loads some other
+	 * value (with PCID == 0), and then restores CR3, thus corrupting
+	 * the TLB for ASID 0 if the saved ASID was nonzero.  It also means
+	 * that any bugs involving loading a PCID-enabled CR3 with
+	 * CR4.PCIDE off will trigger deterministically.
+	 */
+	return asid + 1;
+}
+
 struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
 	if (static_cpu_has(X86_FEATURE_PCID)) {
-		VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-		return __sme_pa(pgd) | (asid + 1);
+		return __sme_pa(pgd) | kern_pcid(asid);
 	} else {
 		VM_WARN_ON_ONCE(asid != 0);
 		return __sme_pa(pgd);
@@ -108,7 +114,8 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-	return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH;
+	VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
 }
 
 #ifdef CONFIG_PARAVIRT

From 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:47 +0100
Subject: [PATCH 424/808] x86/mm: Create asm/invpcid.h

Unclutter tlbflush.h a little.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/invpcid.h  | 53 +++++++++++++++++++++++++++++++++
 arch/x86/include/asm/tlbflush.h | 49 +-----------------------------
 2 files changed, 54 insertions(+), 48 deletions(-)
 create mode 100644 arch/x86/include/asm/invpcid.h

diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644
index 000000000000..989cfa86de85
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+			     unsigned long type)
+{
+	struct { u64 d[2]; } desc = { { pcid, addr } };
+
+	/*
+	 * The memory clobber is because the whole point is to invalidate
+	 * stale TLB entries and, especially if we're flushing global
+	 * mappings, we don't want the compiler to reorder any subsequent
+	 * memory accesses before the TLB flush.
+	 *
+	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+	 * invpcid (%rcx), %rax in long mode.
+	 */
+	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR		0
+#define INVPCID_TYPE_SINGLE_CTXT	1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL	2
+#define INVPCID_TYPE_ALL_NON_GLOBAL	3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+				     unsigned long addr)
+{
+	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8b27daff7a7f..171b429f43a2 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -9,54 +9,7 @@
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 #include <asm/smp.h>
-
-static inline void __invpcid(unsigned long pcid, unsigned long addr,
-			     unsigned long type)
-{
-	struct { u64 d[2]; } desc = { { pcid, addr } };
-
-	/*
-	 * The memory clobber is because the whole point is to invalidate
-	 * stale TLB entries and, especially if we're flushing global
-	 * mappings, we don't want the compiler to reorder any subsequent
-	 * memory accesses before the TLB flush.
-	 *
-	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
-	 * invpcid (%rcx), %rax in long mode.
-	 */
-	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
-		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
-}
-
-#define INVPCID_TYPE_INDIV_ADDR		0
-#define INVPCID_TYPE_SINGLE_CTXT	1
-#define INVPCID_TYPE_ALL_INCL_GLOBAL	2
-#define INVPCID_TYPE_ALL_NON_GLOBAL	3
-
-/* Flush all mappings for a given pcid and addr, not including globals. */
-static inline void invpcid_flush_one(unsigned long pcid,
-				     unsigned long addr)
-{
-	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
-}
-
-/* Flush all mappings for a given PCID, not including globals. */
-static inline void invpcid_flush_single_context(unsigned long pcid)
-{
-	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
-}
-
-/* Flush all mappings, including globals, for all PCIDs. */
-static inline void invpcid_flush_all(void)
-{
-	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
-}
-
-/* Flush all mappings for all PCIDs except globals. */
-static inline void invpcid_flush_all_nonglobals(void)
-{
-	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
-}
+#include <asm/invpcid.h>
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {

From ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Dec 2017 18:28:54 +0100
Subject: [PATCH 425/808] x86/cpu_entry_area: Move it to a separate unit

Separate the cpu_entry_area code out of cpu/common.c and the fixmap.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpu_entry_area.h |  52 +++++++++++++
 arch/x86/include/asm/fixmap.h         |  41 +---------
 arch/x86/kernel/cpu/common.c          |  94 -----------------------
 arch/x86/kernel/traps.c               |   1 +
 arch/x86/mm/Makefile                  |   2 +-
 arch/x86/mm/cpu_entry_area.c          | 104 ++++++++++++++++++++++++++
 6 files changed, 159 insertions(+), 135 deletions(-)
 create mode 100644 arch/x86/include/asm/cpu_entry_area.h
 create mode 100644 arch/x86/mm/cpu_entry_area.c

diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644
index 000000000000..5471826803af
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code.  Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below entry_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct entry_stack_page entry_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_PAGES	(CPU_ENTRY_AREA_SIZE / PAGE_SIZE)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+
+#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8153b8d86a3c..fb801662a230 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -25,6 +25,7 @@
 #else
 #include <uapi/asm/vsyscall.h>
 #endif
+#include <asm/cpu_entry_area.h>
 
 /*
  * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
@@ -44,46 +45,6 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 #endif
 
-/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code.  Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
-	char gdt[PAGE_SIZE];
-
-	/*
-	 * The GDT is just below entry_stack and thus serves (on x86_64) as
-	 * a a read-only guard page.
-	 */
-	struct entry_stack_page entry_stack_page;
-
-	/*
-	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
-	 * we need task switches to work, and task switches write to the TSS.
-	 */
-	struct tss_struct tss;
-
-	char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
-	/*
-	 * Exception stacks used for IST entries.
-	 *
-	 * In the future, this should have a separate slot for each stack
-	 * with guard pages between them.
-	 */
-	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);
-
 /*
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ed4acbce37a8..8ddcfa4d4165 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
 	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
-				   entry_stack_storage);
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
-	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
-	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
-	pgprot_t gdt_prot = PAGE_KERNEL_RO;
-	pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
-	/*
-	 * On native 32-bit systems, the GDT cannot be read-only because
-	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy.  If the
-	 * GDT is read-only, that will triple fault.  The TSS cannot be
-	 * read-only because the CPU writes to it on task switches.
-	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor
-	 * requires it.
-	 */
-	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-		PAGE_KERNEL_RO : PAGE_KERNEL;
-	pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
-				per_cpu_ptr(&entry_stack_storage, cpu), 1,
-				PAGE_KERNEL);
-
-	/*
-	 * The Intel SDM says (Volume 3, 7.2.1):
-	 *
-	 *  Avoid placing a page boundary in the part of the TSS that the
-	 *  processor reads during a task switch (the first 104 bytes). The
-	 *  processor may not correctly perform address translations if a
-	 *  boundary occurs in this area. During a task switch, the processor
-	 *  reads and writes into the first 104 bytes of each TSS (using
-	 *  contiguous physical addresses beginning with the physical address
-	 *  of the first byte of the TSS). So, after TSS access begins, if
-	 *  part of the 104 bytes is not physically contiguous, the processor
-	 *  will access incorrect information without generating a page-fault
-	 *  exception.
-	 *
-	 * There are also a lot of errata involving the TSS spanning a page
-	 * boundary.  Assert that we're not doing that.
-	 */
-	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
-		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
-	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss_rw, cpu),
-				sizeof(struct tss_struct) / PAGE_SIZE,
-				tss_prot);
-
-#ifdef CONFIG_X86_32
-	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
-	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
-	BUILD_BUG_ON(sizeof(exception_stacks) !=
-		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-				&per_cpu(exception_stacks, cpu),
-				sizeof(exception_stacks) / PAGE_SIZE,
-				PAGE_KERNEL);
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
-		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-}
-
-void __init setup_cpu_entry_areas(void)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu)
-		setup_cpu_entry_area(cpu);
-}
-
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 74136fd16f49..464daed6894f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -52,6 +52,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..2e0017af8f9b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg
 endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000000..235ff9cfaaf4
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+	extern char _entry_trampoline[];
+
+	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+	pgprot_t gdt_prot = PAGE_KERNEL_RO;
+	pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+	/*
+	 * On native 32-bit systems, the GDT cannot be read-only because
+	 * our double fault handler uses a task gate, and entering through
+	 * a task gate needs to change an available TSS to busy.  If the
+	 * GDT is read-only, that will triple fault.  The TSS cannot be
+	 * read-only because the CPU writes to it on task switches.
+	 *
+	 * On Xen PV, the GDT must be read-only because the hypervisor
+	 * requires it.
+	 */
+	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+		PAGE_KERNEL_RO : PAGE_KERNEL;
+	pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
+				per_cpu_ptr(&entry_stack_storage, cpu), 1,
+				PAGE_KERNEL);
+
+	/*
+	 * The Intel SDM says (Volume 3, 7.2.1):
+	 *
+	 *  Avoid placing a page boundary in the part of the TSS that the
+	 *  processor reads during a task switch (the first 104 bytes). The
+	 *  processor may not correctly perform address translations if a
+	 *  boundary occurs in this area. During a task switch, the processor
+	 *  reads and writes into the first 104 bytes of each TSS (using
+	 *  contiguous physical addresses beginning with the physical address
+	 *  of the first byte of the TSS). So, after TSS access begins, if
+	 *  part of the 104 bytes is not physically contiguous, the processor
+	 *  will access incorrect information without generating a page-fault
+	 *  exception.
+	 *
+	 * There are also a lot of errata involving the TSS spanning a page
+	 * boundary.  Assert that we're not doing that.
+	 */
+	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+				&per_cpu(cpu_tss_rw, cpu),
+				sizeof(struct tss_struct) / PAGE_SIZE,
+				tss_prot);
+
+#ifdef CONFIG_X86_32
+	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+	BUILD_BUG_ON(sizeof(exception_stacks) !=
+		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+				&per_cpu(exception_stacks, cpu),
+				sizeof(exception_stacks) / PAGE_SIZE,
+				PAGE_KERNEL);
+
+	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		setup_cpu_entry_area(cpu);
+}

From 92a0f81d89571e3e8759366e050ee05cc545ef99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Dec 2017 18:51:31 +0100
Subject: [PATCH 426/808] x86/cpu_entry_area: Move it out of the fixmap

Put the cpu_entry_area into a separate P4D entry. The fixmap gets too big
and 0-day already hit a case where the fixmap PTEs were cleared by
cleanup_highmap().

Aside of that the fixmap API is a pain as it's all backwards.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |  2 +
 arch/x86/include/asm/cpu_entry_area.h   | 18 ++++++-
 arch/x86/include/asm/desc.h             |  1 +
 arch/x86/include/asm/fixmap.h           | 32 +-----------
 arch/x86/include/asm/pgtable_32_types.h | 15 ++++--
 arch/x86/include/asm/pgtable_64_types.h | 47 ++++++++++-------
 arch/x86/kernel/dumpstack.c             |  1 +
 arch/x86/kernel/traps.c                 |  5 +-
 arch/x86/mm/cpu_entry_area.c            | 68 ++++++++++++++++++-------
 arch/x86/mm/dump_pagetables.c           |  6 ++-
 arch/x86/mm/init_32.c                   |  6 +++
 arch/x86/mm/kasan_init_64.c             | 37 +++++++-------
 arch/x86/mm/pgtable_32.c                |  1 +
 arch/x86/xen/mmu_pv.c                   |  2 -
 14 files changed, 148 insertions(+), 93 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 63a41671d25b..51101708a03a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -35,6 +36,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 5471826803af..2fbc69a0916e 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -43,10 +43,26 @@ struct cpu_entry_area {
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
-#define CPU_ENTRY_AREA_PAGES	(CPU_ENTRY_AREA_SIZE / PAGE_SIZE)
+#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)
 
 DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 
 extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE			\
+	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
 
 #endif
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 2ace1f90d138..bc359dd2f7f6 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index fb801662a230..64c4a30e0d39 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -25,7 +25,6 @@
 #else
 #include <uapi/asm/vsyscall.h>
 #endif
-#include <asm/cpu_entry_area.h>
 
 /*
  * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
@@ -84,7 +83,6 @@ enum fixed_addresses {
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 #endif
-	FIX_RO_IDT,	/* Virtual mapping for read-only IDT */
 #ifdef CONFIG_X86_32
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -100,9 +98,6 @@ enum fixed_addresses {
 #ifdef	CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
-	/* Fixmap entries to remap the GDTs, one per processor. */
-	FIX_CPU_ENTRY_AREA_TOP,
-	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
 	/* Used for GHES mapping from assorted contexts */
@@ -143,7 +138,7 @@ enum fixed_addresses {
 extern void reserve_top_address(unsigned long reserve);
 
 #define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START		(FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)
 
 extern int fixmaps_set;
 
@@ -191,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 void __early_set_fixmap(enum fixed_addresses idx,
 			phys_addr_t phys, pgprot_t flags);
 
-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
-	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
-	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\
-	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\
-	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\
-	})
-
-#define get_cpu_entry_area_index(cpu, field)				\
-	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
-	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct entry_stack *cpu_entry_stack(int cpu)
-{
-	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index f2ca9b28fd68..ce245b0cdfca 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))	\
-		    & PMD_MASK)
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE				\
+	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+
+#define PKMAP_BASE		\
+	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END	(FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END	(CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR	VMALLOC_START
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..3d27831bc58d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM		_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+
 #ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE	_AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE	_AC(0xffd4000000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(16384, UL)
+# define __VMALLOC_BASE		_AC(0xff92000000000000, UL)
+# define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
 #else
-#define VMALLOC_SIZE_TB	_AC(32, UL)
-#define __VMALLOC_BASE	_AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE	_AC(0xffffea0000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(32, UL)
+# define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
+# define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
 #endif
+
 #ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START	vmalloc_base
-#define VMEMMAP_START	vmemmap_base
+# define VMALLOC_START		vmalloc_base
+# define VMEMMAP_START		vmemmap_base
 #else
-#define VMALLOC_START	__VMALLOC_BASE
-#define VMEMMAP_START	__VMEMMAP_BASE
+# define VMALLOC_START		__VMALLOC_BASE
+# define VMEMMAP_START		__VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END	(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END		(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
+#define MODULES_END		__fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN		(MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY	_AC(-2, UL)
+#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD	_AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START		( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END		(-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1dd3f533d78c..36b17e0febe8 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 464daed6894f..7c16fe0b60c2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -951,8 +951,9 @@ void __init trap_init(void)
 	 * "sidt" instruction will not leak the location of the kernel, and
 	 * to defend the IDT against arbitrary memory write vulnerabilities.
 	 * It will be reloaded in cpu_init() */
-	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-	idt_descr.address = fix_to_virt(FIX_RO_IDT);
+	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+		    PAGE_KERNEL_RO);
+	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
 	/*
 	 * Should be a barrier for any external CPU state:
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 235ff9cfaaf4..21e8b595cbb1 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -15,11 +15,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
 {
-	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+	return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+	unsigned long va = (unsigned long) cea_vaddr;
+
+	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }
 
 /* Setup the fixmap mappings only once per-processor */
@@ -47,10 +63,12 @@ static void __init setup_cpu_entry_area(int cpu)
 	pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
-	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
-				per_cpu_ptr(&entry_stack_storage, cpu), 1,
-				PAGE_KERNEL);
+	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+		    gdt_prot);
+
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+			     per_cpu_ptr(&entry_stack_storage, cpu), 1,
+			     PAGE_KERNEL);
 
 	/*
 	 * The Intel SDM says (Volume 3, 7.2.1):
@@ -72,10 +90,9 @@ static void __init setup_cpu_entry_area(int cpu)
 	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
 		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
 	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss_rw, cpu),
-				sizeof(struct tss_struct) / PAGE_SIZE,
-				tss_prot);
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+			     &per_cpu(cpu_tss_rw, cpu),
+			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
 
 #ifdef CONFIG_X86_32
 	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
@@ -85,20 +102,37 @@ static void __init setup_cpu_entry_area(int cpu)
 	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
 	BUILD_BUG_ON(sizeof(exception_stacks) !=
 		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-				&per_cpu(exception_stacks, cpu),
-				sizeof(exception_stacks) / PAGE_SIZE,
-				PAGE_KERNEL);
+	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+			     &per_cpu(exception_stacks, cpu),
+			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
 
-	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
 }
 
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+	unsigned long start, end;
+
+	BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+	start = CPU_ENTRY_AREA_BASE;
+	end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+	for (; start < end; start += PMD_SIZE)
+		populate_extra_pte(start);
+#endif
+}
+
 void __init setup_cpu_entry_areas(void)
 {
 	unsigned int cpu;
 
+	setup_cpu_entry_area_ptes();
+
 	for_each_possible_cpu(cpu)
 		setup_cpu_entry_area(cpu);
 }
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index fdf09d8f98da..43dedbfb7257 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -58,6 +58,7 @@ enum address_markers_idx {
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
+	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
 #endif
@@ -81,6 +82,7 @@ static struct addr_marker address_markers[] = {
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
 #endif
+	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
 	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 #endif
@@ -104,6 +106,7 @@ enum address_markers_idx {
 #ifdef CONFIG_HIGHMEM
 	PKMAP_BASE_NR,
 #endif
+	CPU_ENTRY_AREA_NR,
 	FIXADDR_START_NR,
 	END_OF_SPACE_NR,
 };
@@ -116,6 +119,7 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_HIGHMEM
 	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
 #endif
+	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
 	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
 	[END_OF_SPACE_NR]	= { -1,			NULL }
 };
@@ -541,8 +545,8 @@ static int __init pt_dump_init(void)
 	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
 # endif
 	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
 #endif
-
 	return 0;
 }
 __initcall(pt_dump_init);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/init.h>
 
 #include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 	printk(KERN_INFO "virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
 		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
 		FIXADDR_START, FIXADDR_TOP,
 		(FIXADDR_TOP - FIXADDR_START) >> 10,
 
+		CPU_ENTRY_AREA_BASE,
+		CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+		CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
 #ifdef CONFIG_HIGHMEM
 		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
 		(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ec70d780f1f..47388f0c0e59 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -322,31 +323,33 @@ void __init kasan_init(void)
 		map_range(&pfn_mapped[i]);
 	}
 
+	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+						PAGE_SIZE);
+
+	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+					CPU_ENTRY_AREA_MAP_SIZE);
+	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+					PAGE_SIZE);
+
 	kasan_populate_zero_shadow(
 		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-		kasan_mem_to_shadow((void *)__START_KERNEL_map));
+		shadow_cpu_entry_begin);
+
+	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+			      (unsigned long)shadow_cpu_entry_end, 0);
+
+	kasan_populate_zero_shadow(shadow_cpu_entry_end,
+				kasan_mem_to_shadow((void *)__START_KERNEL_map));
 
 	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
 			      (unsigned long)kasan_mem_to_shadow(_end),
 			      early_pfn_to_nid(__pa(_stext)));
 
-	shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
-	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
-	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
-						PAGE_SIZE);
-
-	shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
-	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
-	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
-					PAGE_SIZE);
-
 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-				   shadow_cpu_entry_begin);
-
-	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
-			      (unsigned long)shadow_cpu_entry_end, 0);
-
-	kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
+				(void *)KASAN_SHADOW_END);
 
 	load_cr3(init_top_pgt);
 	__flush_tlb_all();
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6b9bf023a700..c3c5274410a9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index c2454237fa67..a0e2b8c6e5c7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
 	switch (idx) {
 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-	case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
 	case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
 	case FIX_TEXT_POKE0:
 	case FIX_TEXT_POKE1:
-	case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
 		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;

From 613e396bc0d4c7604fba23256644e78454c68cf6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 17 Dec 2017 10:56:29 +0100
Subject: [PATCH 427/808] init: Invoke init_espfix_bsp() from mm_init()

init_espfix_bsp() needs to be invoked before the page table isolation
initialization. Move it into mm_init() which is the place where pti_init()
will be added.

While at it get rid of the #ifdeffery and provide proper stub functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/espfix.h | 7 ++++---
 arch/x86/kernel/smpboot.c     | 6 +-----
 include/asm-generic/pgtable.h | 5 +++++
 init/main.c                   | 6 ++----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 0211029076ea..6777480d8a42 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_ESPFIX_H
 #define _ASM_X86_ESPFIX_H
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
 
 #include <asm/percpu.h>
 
@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 extern void init_espfix_bsp(void);
 extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
 
 #endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d56c1d209283..33d6000265aa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -990,12 +990,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	initial_code = (unsigned long)start_secondary;
 	initial_stack  = idle->thread.sp;
 
-	/*
-	 * Enable the espfix hack for this CPU
-	*/
-#ifdef CONFIG_X86_ESPFIX64
+	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
-#endif
 
 	/* So we see what's up */
 	announce_cpu(cpu, apicid);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 757dc6ffc7ba..231b35a76dd9 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1017,6 +1017,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 struct file;
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 			unsigned long size, pgprot_t *vma_prot);
+
+#ifndef CONFIG_X86_ESPFIX64
+static inline void init_espfix_bsp(void) { }
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifndef io_remap_pfn_range
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..8a390f60ec81 100644
--- a/init/main.c
+++ b/init/main.c
@@ -504,6 +504,8 @@ static void __init mm_init(void)
 	pgtable_init();
 	vmalloc_init();
 	ioremap_huge_init();
+	/* Should be run before the first non-init thread is created */
+	init_espfix_bsp();
 }
 
 asmlinkage __visible void __init start_kernel(void)
@@ -673,10 +675,6 @@ asmlinkage __visible void __init start_kernel(void)
 #ifdef CONFIG_X86
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
 		efi_enter_virtual_mode();
-#endif
-#ifdef CONFIG_X86_ESPFIX64
-	/* Should be run before the first non-init thread is created */
-	init_espfix_bsp();
 #endif
 	thread_stack_cache_init();
 	cred_init();

From b26a2319be3dd26edb3013504992a037a5902520 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Sat, 23 Dec 2017 08:54:28 +1000
Subject: [PATCH 428/808] drm/nouveau: fix race when adding delayed work items

kernel.org bz#198221.

Reported-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_drm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 8d4a5be3b913..56fe261b6268 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -152,9 +152,9 @@ nouveau_cli_work_queue(struct nouveau_cli *cli, struct dma_fence *fence,
 	work->cli = cli;
 	mutex_lock(&cli->lock);
 	list_add_tail(&work->head, &cli->worker);
-	mutex_unlock(&cli->lock);
 	if (dma_fence_add_callback(fence, &work->cb, nouveau_cli_work_fence))
 		nouveau_cli_work_fence(fence, &work->cb);
+	mutex_unlock(&cli->lock);
 }
 
 static void

From b3b1b6532890c70987821946f90c22b8021aaaf8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 22 Dec 2017 11:36:05 -0800
Subject: [PATCH 429/808] tools: bpftool: maps: close json array on error paths
 of show

We can't return from the middle of do_show(), because
json_array will not be closed.  Break out of the loop.
Note that the error handling after the loop depends on
errno, so no need to set err.

Fixes: 831a0aafe5c3 ("tools: bpftool: add JSON output for `bpftool map *` commands")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index e2450c8e88e6..8368b7ea31b5 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -523,21 +523,21 @@ static int do_show(int argc, char **argv)
 				break;
 			p_err("can't get next map: %s%s", strerror(errno),
 			      errno == EINVAL ? " -- kernel too old?" : "");
-			return -1;
+			break;
 		}
 
 		fd = bpf_map_get_fd_by_id(id);
 		if (fd < 0) {
 			p_err("can't get map by id (%u): %s",
 			      id, strerror(errno));
-			return -1;
+			break;
 		}
 
 		err = bpf_obj_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get map info: %s", strerror(errno));
 			close(fd);
-			return -1;
+			break;
 		}
 
 		if (json_output)

From 8207c6dd4746c345b689684c4cd0ce00a18c7ef2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 22 Dec 2017 11:36:06 -0800
Subject: [PATCH 430/808] tools: bpftool: protect against races with
 disappearing objects

On program/map show we may get an ID of an object from GETNEXT,
but the object may disappear before we call GET_FD_BY_ID.  If
that happens, ignore the object and continue.

Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c  | 2 ++
 tools/bpf/bpftool/prog.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 8368b7ea31b5..a8c3a33dd185 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -528,6 +528,8 @@ static int do_show(int argc, char **argv)
 
 		fd = bpf_map_get_fd_by_id(id);
 		if (fd < 0) {
+			if (errno == ENOENT)
+				continue;
 			p_err("can't get map by id (%u): %s",
 			      id, strerror(errno));
 			break;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index ad619b96c276..dded77345bfb 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -382,6 +382,8 @@ static int do_show(int argc, char **argv)
 
 		fd = bpf_prog_get_fd_by_id(id);
 		if (fd < 0) {
+			if (errno == ENOENT)
+				continue;
 			p_err("can't get prog by id (%u): %s",
 			      id, strerror(errno));
 			err = -1;

From 8a42d3fc9dfccbf601c5f58f46dc3cdbc1a4b923 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Fri, 15 Dec 2017 13:42:04 +0000
Subject: [PATCH 431/808] nvmem: meson-mx-efuse: fix reading from an offset
 other than 0

meson_mx_efuse_read calculates the address internal to the eFuse based
on the offset and the word size. This works fine with any given offset.
However, the offset is also included when writing to the output buffer.
This means that reading 4 bytes at offset 500 tries to write beyond the
array allocated by the nvmem core as it wants to write the 4 bytes to
"buffer address + offset (500)".
This issue did not show up in the previous tests since no driver uses
any value from the eFuse yet and reading the eFuse via sysfs simply
reads the whole eFuse, starting at offset 0.

Fix this by only including the offset in the internal address
calculation.

Fixes: 8caef1fa9176 ("nvmem: add a driver for the Amlogic Meson6/Meson8/Meson8b SoCs")
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/meson-mx-efuse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvmem/meson-mx-efuse.c b/drivers/nvmem/meson-mx-efuse.c
index a346b4923550..41d3a3c1104e 100644
--- a/drivers/nvmem/meson-mx-efuse.c
+++ b/drivers/nvmem/meson-mx-efuse.c
@@ -156,8 +156,8 @@ static int meson_mx_efuse_read(void *context, unsigned int offset,
 				 MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE,
 				 MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE);
 
-	for (i = offset; i < offset + bytes; i += efuse->config.word_size) {
-		addr = i / efuse->config.word_size;
+	for (i = 0; i < bytes; i += efuse->config.word_size) {
+		addr = (offset + i) / efuse->config.word_size;
 
 		err = meson_mx_efuse_read_addr(efuse, addr, &tmp);
 		if (err)

From f6c4fd506cb626e4346aa81688f255e593a7c5a0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 23 Dec 2017 19:45:11 +0100
Subject: [PATCH 432/808] x86/cpu_entry_area: Prevent wraparound in
 setup_cpu_entry_area_ptes() on 32bit

The loop which populates the CPU entry area PMDs can wrap around on 32bit
machines when the number of CPUs is small.

It worked wonderful for NR_CPUS=64 for whatever reason and the moron who
wrote that code did not bother to test it with !SMP.

Check for the wraparound to fix it.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas "Feels stupid" Gleixner <tglx@linutronix.de>
Tested-by: Borislav Petkov <bp@alien8.de>
---
 arch/x86/mm/cpu_entry_area.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 21e8b595cbb1..fe814fd5e014 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -122,7 +122,8 @@ static __init void setup_cpu_entry_area_ptes(void)
 	start = CPU_ENTRY_AREA_BASE;
 	end = start + CPU_ENTRY_AREA_MAP_SIZE;
 
-	for (; start < end; start += PMD_SIZE)
+	/* Careful here: start + PMD_SIZE might wrap around */
+	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
 		populate_extra_pte(start);
 #endif
 }

From a89f040fa34ec9cd682aed98b8f04e3c47d998bd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:33 +0100
Subject: [PATCH 433/808] x86/cpufeatures: Add X86_BUG_CPU_INSECURE

Many x86 CPUs leak information to user space due to missing isolation of
user space and kernel space page tables. There are many well documented
ways to exploit that.

The upcoming software migitation of isolating the user and kernel space
page tables needs a misfeature flag so code can be made runtime
conditional.

Add the BUG bits which indicates that the CPU is affected and add a feature
bit which indicates that the software migitation is enabled.

Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be
made later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h       | 3 ++-
 arch/x86/include/asm/disabled-features.h | 8 +++++++-
 arch/x86/kernel/cpu/common.c             | 4 ++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..d8ec834ea884 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -201,7 +201,7 @@
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +340,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c10c9128f54e..e428e16dd822 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,6 +44,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -54,7 +60,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8ddcfa4d4165..a9210f9b7cf8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	}
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+	/* Assume for now that ALL x86 CPUs are insecure */
+	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

From c313ec66317d421fb5768d78c56abed2dc862264 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:34 +0100
Subject: [PATCH 434/808] x86/mm/pti: Disable global pages if
 PAGE_TABLE_ISOLATION=y

Global pages stay in the TLB across context switches.  Since all contexts
share the same kernel mapping, these mappings are marked as global pages
so kernel entries in the TLB are not flushed out on a context switch.

But, even having these entries in the TLB opens up something that an
attacker can use, such as the double-page-fault attack:

   http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf

That means that even when PAGE_TABLE_ISOLATION switches page tables
on return to user space the global pages would stay in the TLB cache.

Disable global pages so that kernel TLB entries can be flushed before
returning to user space. This way, all accesses to kernel addresses from
userspace result in a TLB miss independent of the existence of a kernel
mapping.

Suppress global pages via the __supported_pte_mask. The user space
mappings set PAGE_GLOBAL for the minimal kernel mappings which are
required for entry/exit. These mappings are set up manually so the
filtering does not take place.

[ The __supported_pte_mask simplification was written by Thomas Gleixner. ]
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..020223420308 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,6 +161,12 @@ struct map_range {
 
 static int page_size_mask;
 
+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -179,11 +185,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}
 
 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {

From 8a09317b895f073977346779df52f67c1056d81d Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:35 +0100
Subject: [PATCH 435/808] x86/mm/pti: Prepare the x86/entry assembly code for
 entry/exit CR3 switching

PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it
enters the kernel and switch back when it exits.  This essentially needs to
be done before leaving assembly code.

This is extra challenging because the switching context is tricky: the
registers that can be clobbered can vary.  It is also hard to store things
on the stack because there is an established ABI (ptregs) or the stack is
entirely unsafe to use.

Establish a set of macros that allow changing to the user and kernel CR3
values.

Interactions with SWAPGS:

  Previous versions of the PAGE_TABLE_ISOLATION code relied on having
  per-CPU scratch space to save/restore a register that can be used for the
  CR3 MOV.  The %GS register is used to index into our per-CPU space, so
  SWAPGS *had* to be done before the CR3 switch.  That scratch space is gone
  now, but the semantic that SWAPGS must be done before the CR3 MOV is
  retained.  This is good to keep because it is not that hard to do and it
  allows to do things like add per-CPU debugging information.

What this does in the NMI code is worth pointing out.  NMIs can interrupt
*any* context and they can also be nested with NMIs interrupting other
NMIs.  The comments below ".Lnmi_from_kernel" explain the format of the
stack during this situation.  Changing the format of this stack is hard.
Instead of storing the old CR3 value on the stack, this depends on the
*regular* register save/restore mechanism and then uses %r14 to keep CR3
during the NMI.  It is callee-saved and will not be clobbered by the C NMI
handlers that get called.

[ PeterZ: ESPFIX optimization ]

Based-on-code-from: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h         | 66 ++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64.S        | 45 +++++++++++++++++++---
 arch/x86/entry/entry_64_compat.S | 24 +++++++++++-
 3 files changed, 128 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..a9d17a7686ab 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
 
 /*
 
@@ -187,6 +189,70 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/* PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two halves: */
+#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq	$(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro ADJUST_USER_CR3 reg:req
+	/* Move CR3 up a page to the user page tables: */
+	orq	$(PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+	mov	%cr3, \scratch_reg
+	ADJUST_USER_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the switch bit zero?  This means the address is
+	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 save_reg:req
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 87cebe78bbef..2ad7ad4d3dd6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -164,6 +164,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 
@@ -203,6 +206,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -399,6 +406,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 */
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
 
 	popq	%rdi
 	popq	%rsp
@@ -736,6 +744,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 */
 
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+
 	/* Restore RDI. */
 	popq	%rdi
 	SWAPGS
@@ -818,7 +828,9 @@ native_irq_return_ldt:
 	 */
 
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -834,7 +846,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -845,7 +856,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3 scratch_reg=%rdi	/* to user CR3 */
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 
@@ -945,6 +960,8 @@ ENTRY(switch_to_thread_stack)
 	UNWIND_HINT_FUNC
 
 	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1244,7 +1261,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 
 /*
@@ -1266,6 +1287,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1293,6 +1315,8 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/* Put us onto the real thread stack. */
@@ -1339,6 +1363,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 
 .Lbstep_iret:
@@ -1348,10 +1373,11 @@ ENTRY(error_entry)
 
 .Lerror_bad_iret:
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1383,6 +1409,10 @@ END(error_exit)
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
@@ -1446,6 +1476,7 @@ ENTRY(nmi)
 
 	swapgs
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1698,6 +1729,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	do_nmi
 
+	RESTORE_CR3 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 95ad40eb7eff..05238b29895e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,6 +49,10 @@
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
+	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
          */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after after the last reference to the process
+	 * stack.
+	 *
+	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3 scratch_reg=%r8
+
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	sysretl
 END(entry_SYSCALL_compat)

From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:36 +0100
Subject: [PATCH 436/808] x86/mm/pti: Add infrastructure for page table
 isolation

Add the initial files for kernel page table isolation, with a minimal init
function and the boot time detection for this misfeature.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  2 +
 arch/x86/boot/compressed/pagetable.c          |  3 +
 arch/x86/entry/calling.h                      |  7 ++
 arch/x86/include/asm/pti.h                    | 14 ++++
 arch/x86/mm/Makefile                          |  7 +-
 arch/x86/mm/init.c                            |  2 +
 arch/x86/mm/pti.c                             | 84 +++++++++++++++++++
 include/linux/pti.h                           | 11 +++
 init/main.c                                   |  3 +
 9 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/include/asm/pti.h
 create mode 100644 arch/x86/mm/pti.c
 create mode 100644 include/linux/pti.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..5dfd26265484 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2685,6 +2685,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 972319ff5b01..e691ff734cb5 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT
 
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a9d17a7686ab..3d3389a92c33 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -205,18 +205,23 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
 	ADJUST_KERNEL_CR3 \scratch_reg
 	mov	\scratch_reg, %cr3
+.Lend_\@:
 .endm
 
 .macro SWITCH_TO_USER_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
 	ADJUST_USER_CR3 \scratch_reg
 	mov	\scratch_reg, %cr3
+.Lend_\@:
 .endm
 
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
@@ -233,11 +238,13 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 .macro RESTORE_CR3 save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	/*
 	 * The CR3 write could be avoided when not changing its value,
 	 * but would require a CR3 read *and* a scratch register.
 	 */
 	movq	\save_reg, %cr3
+.Lend_\@:
 .endm
 
 #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..0b5ef05b2d2d
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2e0017af8f9b..52906808e277 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 020223420308..af75069fb116 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -630,6 +631,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;
 
+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();
 
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..375f23a758bc
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ *	https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ *		       Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+		pti_print_if_insecure("disabled on XEN PV.");
+		return;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+		pti_print_if_insecure("disabled on command line.");
+		return;
+	}
+
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		return;
+
+	setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("enabled\n");
+}
diff --git a/include/linux/pti.h b/include/linux/pti.h
new file mode 100644
index 000000000000..0174883a935a
--- /dev/null
+++ b/include/linux/pti.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _INCLUDE_PTI_H
+#define _INCLUDE_PTI_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/pti.h>
+#else
+static inline void pti_init(void) { }
+#endif
+
+#endif
diff --git a/init/main.c b/init/main.c
index 8a390f60ec81..b32ec72cdf3d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
+#include <linux/pti.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
 	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
+	/* Should be run after espfix64 is set up. */
+	pti_init();
 }
 
 asmlinkage __visible void __init start_kernel(void)

From 41f4c20b57a4890ea7f56ff8717cc83fefb8d537 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 12 Dec 2017 14:39:52 +0100
Subject: [PATCH 437/808] x86/pti: Add the pti= cmdline option and
 documentation

Keep the "nopti" optional for traditional reasons.

[ tglx: Don't allow force on when running on XEN PV and made 'on'
	printout conditional ]

Requested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  6 +++++
 arch/x86/mm/pti.c                             | 26 ++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5dfd26265484..520fdec15bbb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3255,6 +3255,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 375f23a758bc..a13f6b109865 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -54,21 +54,45 @@ static void __init pti_print_if_insecure(const char *reason)
 		pr_info("%s\n", reason);
 }
 
+static void __init pti_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
 void __init pti_check_boottime_disable(void)
 {
+	char arg[5];
+	int ret;
+
 	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
 		pti_print_if_insecure("disabled on XEN PV.");
 		return;
 	}
 
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0)  {
+		if (ret == 3 && !strncmp(arg, "off", 3)) {
+			pti_print_if_insecure("disabled on command line.");
+			return;
+		}
+		if (ret == 2 && !strncmp(arg, "on", 2)) {
+			pti_print_if_secure("force enabled on command line.");
+			goto enable;
+		}
+		if (ret == 4 && !strncmp(arg, "auto", 4))
+			goto autosel;
+	}
+
 	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
 		pti_print_if_insecure("disabled on command line.");
 		return;
 	}
 
+autosel:
 	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
 		return;
-
+enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);
 }
 

From 61e9b3671007a5da8127955a1a3bda7e0d5f42e8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:37 +0100
Subject: [PATCH 438/808] x86/mm/pti: Add mapping helper functions

Add the pagetable helper functions do manage the separate user space page
tables.

[ tglx: Split out from the big combo kaiser patch. Folded Andys
	simplification and made it out of line as Boris suggested ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h    |  6 +-
 arch/x86/include/asm/pgtable_64.h | 92 +++++++++++++++++++++++++++++++
 arch/x86/mm/pti.c                 | 41 ++++++++++++++
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f735c3016325..af38d93c4fbb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -909,7 +909,11 @@ static inline int pgd_none(pgd_t pgd)
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e9f05331e732..81462e9a34f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 }
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
+ * the user one is in the last 4k.  To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr |= BIT(bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr &= ~BIT(bit);
+	return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return pgd;
+	return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+#endif
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
 	*p4dp = p4d;
+#endif
 }
 
 static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	*pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a13f6b109865..69a983365392 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -96,6 +96,47 @@ enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);
 }
 
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Changes to the high (kernel) portion of the kernelmode page
+	 * tables are not automatically propagated to the usermode tables.
+	 *
+	 * Users should keep in mind that, unlike the kernelmode tables,
+	 * there is no vmalloc_fault equivalent for the usermode tables.
+	 * Top-level entries added to init_mm's usermode pgd after boot
+	 * will not be automatically propagated to other mms.
+	 */
+	if (!pgdp_maps_userspace(pgdp))
+		return pgd;
+
+	/*
+	 * The user page tables get the full PGD, accessible from
+	 * userspace:
+	 */
+	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+	/*
+	 * If this is normal user memory, make it NX in the kernel
+	 * pagetables so that, if we somehow screw up and return to
+	 * usermode with the kernel CR3 loaded, we'll get a page fault
+	 * instead of allowing user code to execute with the wrong CR3.
+	 *
+	 * As exceptions, we don't set NX if:
+	 *  - _PAGE_USER is not set.  This could be an executable
+	 *     EFI runtime mapping or something similar, and the kernel
+	 *     may execute from it
+	 *  - we don't have NX support
+	 *  - we're clearing the PGD (i.e. the new pgd is not present).
+	 */
+	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+	    (__supported_pte_mask & _PAGE_NX))
+		pgd.pgd |= _PAGE_NX;
+
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
 /*
  * Initialize kernel page table isolation
  */

From 1c4de1ff4fe50453b968579ee86fac3da80dd783 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:38 +0100
Subject: [PATCH 439/808] x86/mm/pti: Allow NX poison to be set in p4d/pgd

With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is
poisoned with the NX bit so if the entry code exits with the kernel page
tables selected in CR3, userspace crashes.

But doing so trips the p4d/pgd_bad() checks.  Make sure it does not do
that.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af38d93c4fbb..2d2d07300b4a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -846,7 +846,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 static inline int p4d_bad(p4d_t p4d)
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
@@ -880,7 +885,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)

From d9e9a6418065bb376e5de8d93ce346939b9a37a6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:39 +0100
Subject: [PATCH 440/808] x86/mm/pti: Allocate a separate user PGD

Kernel page table isolation requires to have two PGDs. One for the kernel,
which contains the full kernel mapping plus the user space mapping and one
for user space which contains the user space mappings and the minimal set
of kernel mappings which are required by the architecture to be able to
transition from and to user space.

Add the necessary preliminaries.

[ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgalloc.h | 11 +++++++++++
 arch/x86/kernel/head_64.S      | 30 +++++++++++++++++++++++++++---
 arch/x86/mm/pgtable.c          |  5 +++--
 arch/x86/platform/efi/efi_64.c |  5 ++++-
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
  */
 extern gfp_t __userpte_alloc_gfp;
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
  * Allocate and free page tables.
  */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7dca675fe78d..04a625f0fcda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -350,13 +371,14 @@ GLOBAL(name)
 	.endr
 
 	__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data
 
 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	PTI_USER_PGD_FILL,8,0
 #endif
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..9b7bcbd33cc2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
 
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 20fb31579b69..39c4b35ac7a4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
  * because we want to avoid inserting EFI region mappings (EFI_VA_END
  * to EFI_VA_START) into the standard kernel page tables. Everything
  * else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
  */
 int __init efi_alloc_page_tables(void)
 {
@@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
 		return 0;
 
 	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
 	if (!efi_pgd)
 		return -ENOMEM;
 

From fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:40 +0100
Subject: [PATCH 441/808] x86/mm/pti: Populate user PGD

In clone_pgd_range() copy the init user PGDs which cover the kernel half of
the address space, so a process has all the required kernel mappings
visible.

[ tglx: Split out from the big kaiser dump and folded Andys simplification ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2d2d07300b4a..cc6fa75884e9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1119,7 +1119,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)

From 03f4424f348e8be95eb1bbeba09461cd7b867828 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:42 +0100
Subject: [PATCH 442/808] x86/mm/pti: Add functions to clone kernel PMDs

Provide infrastructure to:

 - find a kernel PMD for a mapping which must be visible to user space for
   the entry/exit code to work.

 - walk an address range and share the kernel PMD with it.

This reuses a small part of the original KAISER patches to populate the
user space page table.

[ tglx: Made it universally usable so it can be used for any kind of shared
	mapping. Add a mechanism to clear specific bits in the user space
	visible PMD entry. Folded Andys simplifactions ]

Originally-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 69a983365392..d58bcee470fc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -48,6 +48,11 @@
 #undef pr_fmt
 #define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
 
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK	0
+#endif
+
 static void __init pti_print_if_insecure(const char *reason)
 {
 	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
@@ -137,6 +142,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
 	return pgd;
 }
 
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (address < PAGE_OFFSET) {
+		WARN_ONCE(1, "attempt to walk user address\n");
+		return NULL;
+	}
+
+	if (pgd_none(*pgd)) {
+		unsigned long new_p4d_page = __get_free_page(gfp);
+		if (!new_p4d_page)
+			return NULL;
+
+		if (pgd_none(*pgd)) {
+			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+			new_p4d_page = 0;
+		}
+		if (new_p4d_page)
+			free_page(new_p4d_page);
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+	pud_t *pud;
+
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		if (p4d_none(*p4d)) {
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+			new_pud_page = 0;
+		}
+		if (new_pud_page)
+			free_page(new_pud_page);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The user page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+			new_pmd_page = 0;
+		}
+		if (new_pmd_page)
+			free_page(new_pmd_page);
+	}
+
+	return pmd_offset(pud, address);
+}
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+	unsigned long addr;
+
+	/*
+	 * Clone the populated PMDs which cover start to end. These PMD areas
+	 * can have holes.
+	 */
+	for (addr = start; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd, *target_pmd;
+		pgd_t *pgd;
+		p4d_t *p4d;
+		pud_t *pud;
+
+		pgd = pgd_offset_k(addr);
+		if (WARN_ON(pgd_none(*pgd)))
+			return;
+		p4d = p4d_offset(pgd, addr);
+		if (WARN_ON(p4d_none(*p4d)))
+			return;
+		pud = pud_offset(p4d, addr);
+		if (pud_none(*pud))
+			continue;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd))
+			continue;
+
+		target_pmd = pti_user_pagetable_walk_pmd(addr);
+		if (WARN_ON(!target_pmd))
+			return;
+
+		/*
+		 * Copy the PMD.  That is, the kernelmode and usermode
+		 * tables will share the last-level page tables of this
+		 * address range
+		 */
+		*target_pmd = pmd_clear_flags(*pmd, clear);
+	}
+}
+
 /*
  * Initialize kernel page table isolation
  */

From 8d4b067895791ab9fdb1aadfc505f64d71239dd2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:43 +0100
Subject: [PATCH 443/808] x86/mm/pti: Force entry through trampoline when PTI
 active

Force the entry through the trampoline only when PTI is active. Otherwise
go through the normal entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a9210f9b7cf8..f2a94dfb434e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1339,7 +1339,10 @@ void syscall_init(void)
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	if (static_cpu_has(X86_FEATURE_PTI))
+		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	else
+		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

From f7cfbee91559ca7e3e961a00ffac921208a115ad Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Dec 2017 15:07:45 +0100
Subject: [PATCH 444/808] x86/mm/pti: Share cpu_entry_area with user space page
 tables

Share the cpu entry area so the user space and kernel space page tables
have the same P4D page.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index d58bcee470fc..59290356f19f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -264,6 +264,29 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
 	}
 }
 
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
+
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
+	*user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
 /*
  * Initialize kernel page table isolation
  */
@@ -273,4 +296,6 @@ void __init pti_init(void)
 		return;
 
 	pr_info("enabled\n");
+
+	pti_clone_user_shared();
 }

From 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:46 +0100
Subject: [PATCH 445/808] x86/entry: Align entry text section to PMD boundary

The (irq)entry text must be visible in the user space page tables. To allow
simple PMD based sharing, make the entry text PMD aligned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/vmlinux.lds.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d2a8b5a24a44..1e413a9326aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
 		. = ALIGN(HPAGE_SIZE);				\
 		__end_rodata_hpage_align = .;
 
+#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
+
 #else
 
 #define X64_ALIGN_RODATA_BEGIN
 #define X64_ALIGN_RODATA_END
 
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
 #endif
 
 PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ALIGN_ENTRY_TEXT_BEGIN
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)

From 6dc72c3cbca0580642808d677181cad4c6433893 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:47 +0100
Subject: [PATCH 446/808] x86/mm/pti: Share entry text PMD

Share the entry text PMD of the kernel mapping with the user space
mapping. If large pages are enabled this is a single PMD entry and at the
point where it is copied into the user page table the RW bit has not been
cleared yet. Clear it right away so the user space visible map becomes RX.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 59290356f19f..0e78797650a7 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -287,6 +287,15 @@ static void __init pti_clone_user_shared(void)
 	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+	pti_clone_pmds((unsigned long) __entry_text_start,
+			(unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
 /*
  * Initialize kernel page table isolation
  */
@@ -298,4 +307,5 @@ void __init pti_init(void)
 	pr_info("enabled\n");
 
 	pti_clone_user_shared();
+	pti_clone_entry_text();
 }

From 4b6bbe95b87966ba08999574db65c93c5e925a36 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 15 Dec 2017 22:08:18 +0100
Subject: [PATCH 447/808] x86/mm/pti: Map ESPFIX into user space

Map the ESPFIX pages into user space when PTI is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pti.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 0e78797650a7..b1c38ef9fbbb 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -287,6 +287,16 @@ static void __init pti_clone_user_shared(void)
 	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
+/*
+ * Clone the ESPFIX P4D into the user space visinble page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
 /*
  * Clone the populated PMDs of the entry and irqentry text and force it RO.
  */
@@ -308,4 +318,5 @@ void __init pti_init(void)
 
 	pti_clone_user_shared();
 	pti_clone_entry_text();
+	pti_setup_espfix64();
 }

From 10043e02db7f8a4161f76434931051e7d797a5f6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:07:49 +0100
Subject: [PATCH 448/808] x86/cpu_entry_area: Add debugstore entries to
 cpu_entry_area

The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual
addresses which must be visible in any execution context.

So it is required to make these mappings visible to user space when kernel
page table isolation is active.

Provide enough room for the buffer mappings in the cpu_entry_area so the
buffers are available in the user space visible page tables.

At the point where the kernel side entry area is populated there is no
buffer available yet, but the kernel PMD must be populated. To achieve this
set the entries for these buffers to non present.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/ds.c            |  5 ++--
 arch/x86/events/perf_event.h          | 21 ++--------------
 arch/x86/include/asm/cpu_entry_area.h | 13 ++++++++++
 arch/x86/include/asm/intel_ds.h       | 36 +++++++++++++++++++++++++++
 arch/x86/mm/cpu_entry_area.c          | 27 ++++++++++++++++++++
 5 files changed, 81 insertions(+), 21 deletions(-)
 create mode 100644 arch/x86/include/asm/intel_ds.h

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..6522f0279cb8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -8,11 +8,12 @@
 
 #include "../perf_event.h"
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..373f9eda80b1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
 
 #include <linux/perf_event.h>
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 2fbc69a0916e..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -5,6 +5,7 @@
 
 #include <linux/percpu-defs.h>
 #include <asm/processor.h>
+#include <asm/intel_ds.h>
 
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
 	 */
 	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index fe814fd5e014..b9283cc27622 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
 		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }
 
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
+	percpu_setup_debug_store(cpu);
 }
 
 static __init void setup_cpu_entry_area_ptes(void)

From c1961a4631daef4aeabee8e368b1b13e8f173c91 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 4 Dec 2017 15:07:50 +0100
Subject: [PATCH 449/808] x86/events/intel/ds: Map debug buffers in
 cpu_entry_area

The BTS and PEBS buffers both have their virtual addresses programmed into
the hardware.  This means that any access to them is performed via the page
tables.  The times that the hardware accesses these are entirely dependent
on how the performance monitoring hardware events are set up.  In other
words, there is no way for the kernel to tell when the hardware might
access these buffers.

To avoid perf crashes, place 'debug_store' allocate pages and map them into
the cpu_entry_area.

The PEBS fixup buffer does not need this treatment.

[ tglx: Got rid of the kaiser_add_mapping() complication ]

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/ds.c   | 125 ++++++++++++++++++++++-------------
 arch/x86/events/perf_event.h |   2 +
 2 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 6522f0279cb8..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
@@ -280,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
+{
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
+	int node = cpu_to_node(cpu);
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -301,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 }
 
 static void release_pebs_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.pebs)
 		return;
@@ -327,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 
 static int alloc_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 }
 
 static void release_bts_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 	if (!ds || !x86_pmu.bts)
 		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 }
 
 static void release_ds_buffer(int cpu)
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 373f9eda80b1..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -199,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 */
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_large_pebs;

From 9f449772a3106bcdd4eb8fdeb281147b0e99fb30 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:44 -0800
Subject: [PATCH 450/808] x86/mm/64: Make a full PGD-entry size hole in the
 memory map

Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting
at 0xff90000000000000 to be a full PGD entry.

A subsequent patch will use this hole for the pagetable isolation LDT
alias.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         | 4 ++--
 arch/x86/include/asm/pgtable_64_types.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 51101708a03a..496a1dbf139d 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -29,8 +29,8 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3d27831bc58d..83e9489ae944 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -79,8 +79,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB	_AC(16384, UL)
-# define __VMALLOC_BASE		_AC(0xff92000000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(12800, UL)
+# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)

From f55f0501cbf65ec41cca5058513031b711730b1d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:45 -0800
Subject: [PATCH 451/808] x86/pti: Put the LDT in its own PGD if PTI is on

With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
The LDT is per process, i.e. per mm.

An earlier approach mapped the LDT on context switch into a fixmap area,
but that's a big overhead and exhausted the fixmap space when NR_CPUS got
big.

Take advantage of the fact that there is an address space hole which
provides a completely unused pgd. Use this pgd to manage per-mm LDT
mappings.

This has a down side: the LDT isn't (currently) randomized, and an attack
that can write the LDT is instant root due to call gates (thanks, AMD, for
leaving call gates in AMD64 but designing them wrong so they're only useful
for exploits).  This can be mitigated by making the LDT read-only or
randomizing the mapping, either of which is strightforward on top of this
patch.

This will significantly slow down LDT users, but that shouldn't matter for
important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
old libc implementations.

[ tglx: Cleaned it up. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |   3 +-
 arch/x86/include/asm/mmu_context.h      |  59 +++++++++-
 arch/x86/include/asm/pgtable_64_types.h |   4 +
 arch/x86/include/asm/processor.h        |  23 ++--
 arch/x86/kernel/ldt.c                   | 139 +++++++++++++++++++++++-
 arch/x86/mm/dump_pagetables.c           |   9 ++
 6 files changed, 220 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 496a1dbf139d..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
 ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5ede7cae1d67..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the addressed
+	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
 	return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 83e9489ae944..b97a539bcdee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB	_AC(12800, UL)
 # define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-112, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 #else
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a6b5d62f45a7..9629c5d8267a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);
 
 	refresh_ldt_segments();
 }
@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 		return NULL;
 	}
 
+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }
 
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated?  We can't
+	 * use pgd_none() for this because it doens't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT.  The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
 
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 	mm->context.ldt = new_ldt;
 
 out_unlock:
@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);
 
+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
 	error = 0;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 43dedbfb7257..690eaf31ca34 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,11 +52,17 @@ enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
 #endif
 	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
 #endif
 	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64

From 85900ea51577e31b186e523c8f4e068c79ecc7d3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Dec 2017 07:56:42 -0800
Subject: [PATCH 452/808] x86/pti: Map the vsyscall page if needed

Make VSYSCALLs work fully in PTI mode by mapping them properly to the user
space visible page tables.

[ tglx: Hide unused functions (Patch by Arnd Bergmann) ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  6 +--
 arch/x86/include/asm/vsyscall.h       |  1 +
 arch/x86/mm/pti.c                     | 65 +++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1faf40f2dda9..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
  * vsyscalls but leave the page not present.  If so, we skip calling
  * this.
  */
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
 #if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
-		set_vsyscall_pgtable_user_bits();
+		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d9a7c659009c..b986b2ca688a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -7,6 +7,7 @@
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
 
 /*
  * Called on instruction fetch fault in vsyscall page.
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index b1c38ef9fbbb..bce8aea65606 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -38,6 +38,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
 #include <asm/cmdline.h>
 #include <asm/pti.h>
 #include <asm/pgtable.h>
@@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 	return pmd_offset(pud, address);
 }
 
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.  Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables.  It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+	pte_t *pte;
+
+	/* We can't do anything sensible if we hit a large mapping. */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+			new_pte_page = 0;
+		}
+		if (new_pte_page)
+			free_page(new_pte_page);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	if (pte_flags(*pte) & _PAGE_USER) {
+		WARN_ONCE(1, "attempt to walk to user pte\n");
+		return NULL;
+	}
+	return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+	pte_t *pte, *target_pte;
+	unsigned int level;
+
+	pte = lookup_address(VSYSCALL_ADDR, &level);
+	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+		return;
+
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	if (WARN_ON(!target_pte))
+		return;
+
+	*target_pte = *pte;
+	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
 static void __init
 pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
 {
@@ -319,4 +383,5 @@ void __init pti_init(void)
 	pti_clone_user_shared();
 	pti_clone_entry_text();
 	pti_setup_espfix64();
+	pti_setup_vsyscall();
 }

From 2ea907c4fe7b78e5840c1dc07800eae93248cad1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:57 +0100
Subject: [PATCH 453/808] x86/mm: Allow flushing for future ASID switches

If changing the page tables in such a way that an invalidation of all
contexts (aka. PCIDs / ASIDs) is required, they can be actively invalidated
by:

 1. INVPCID for each PCID (works for single pages too).

 2. Load CR3 with each PCID without the NOFLUSH bit set

 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address.

But, none of these are really feasible since there are ~6 ASIDs (12 with
PAGE_TABLE_ISOLATION) at the time that invalidation is required.
Instead of actively invalidating them, invalidate the *current* context and
also mark the cpu_tlbstate _quickly_ to indicate future invalidation to be
required.

At the next context-switch, look for this indicator
('invalidate_other' being set) invalidate all of the
cpu_tlbstate.ctxs[] entries.

This ensures that any future context switches will do a full flush
of the TLB, picking up the previous changes.

[ tglx: Folded more fixups from Peter ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 37 ++++++++++++++++++++++++++-------
 arch/x86/mm/tlb.c               | 35 +++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 171b429f43a2..490a706fdba8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -134,6 +134,17 @@ struct tlb_state {
 	 */
 	bool is_lazy;
 
+	/*
+	 * If set we changed the page tables in such a way that we
+	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+	 * This tells us to go invalidate all the non-loaded ctxs[]
+	 * on the next context switch.
+	 *
+	 * The current ctx was kept up-to-date as it ran and does not
+	 * need to be invalidated.
+	 */
+	bool invalidate_other;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -211,6 +222,14 @@ static inline unsigned long cr4_read_shadow(void)
 	return this_cpu_read(cpu_tlbstate.cr4);
 }
 
+/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
 /*
  * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
@@ -298,14 +317,6 @@ static inline void __flush_tlb_all(void)
 		 */
 		__flush_tlb();
 	}
-
-	/*
-	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-	 * we'd end up flushing kernel translations for the current ASID but
-	 * we might fail to flush kernel translations for other cached ASIDs.
-	 *
-	 * To avoid this issue, we force PCID off if PGE is off.
-	 */
 }
 
 /*
@@ -315,6 +326,16 @@ static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+	 * but since kernel space is replicated across all, we must also
+	 * invalidate all others.
+	 */
+	invalidate_other_asid();
 }
 
 #define TLB_FLUSH_ALL	-1UL
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0a1be3adc97e..254c9eb79fe5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}
 
+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)

From 48e111982cda033fec832c6b0592c2acedd85d04 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:07:58 +0100
Subject: [PATCH 454/808] x86/mm: Abstract switching CR3

In preparation to adding additional PCID flushing, abstract the
loading of a new ASID into CR3.

[ PeterZ: Split out from big combo patch ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/tlb.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 254c9eb79fe5..42a8875f73fe 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -100,6 +100,24 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }
 
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -230,7 +248,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, true);
 
 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -243,7 +261,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, false);
 
 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);

From 6fd166aae78c0ab738d49bda653cbd9e3b1491cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Dec 2017 15:07:59 +0100
Subject: [PATCH 455/808] x86/mm: Use/Fix PCID to optimize user/kernel switches

We can use PCID to retain the TLBs across CR3 switches; including those now
part of the user/kernel switch. This increases performance of kernel
entry/exit at the cost of more expensive/complicated TLB flushing.

Now that we have two address spaces, one for kernel and one for user space,
we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID
(just like we use the PFN LSB for the PGD). Since we do TLB invalidation
from kernel space, the existing code will only invalidate the kernel PCID,
we augment that by marking the corresponding user PCID invalid, and upon
switching back to userspace, use a flushing CR3 write for the switch.

In order to access the user_pcid_flush_mask we use PER_CPU storage, which
means the previously established SWAPGS vs CR3 ordering is now mandatory
and required.

Having to do this memory access does require additional registers, most
sites have a functioning stack and we can spill one (RAX), sites without
functional stack need to otherwise provide the second scratch register.

Note: PCID is generally available on Intel Sandybridge and later CPUs.
Note: Up until this point TLB flushing was broken in this series.

Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h                    | 72 +++++++++++++---
 arch/x86/entry/entry_64.S                   |  9 +-
 arch/x86/entry/entry_64_compat.S            |  4 +-
 arch/x86/include/asm/processor-flags.h      |  5 ++
 arch/x86/include/asm/tlbflush.h             | 91 ++++++++++++++++++---
 arch/x86/include/uapi/asm/processor-flags.h |  7 +-
 arch/x86/kernel/asm-offsets.c               |  4 +
 arch/x86/mm/init.c                          |  2 +-
 arch/x86/mm/tlb.c                           |  1 +
 9 files changed, 162 insertions(+), 33 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3d3389a92c33..7894e5c0eef7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -3,6 +3,9 @@
 #include <asm/unwind_hints.h>
 #include <asm/cpufeatures.h>
 #include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
 
@@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
 
-/* PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two halves: */
-#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
 
-.macro ADJUST_KERNEL_CR3 reg:req
-	/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq	$(~PTI_SWITCH_MASK), \reg
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
 .endm
 
-.macro ADJUST_USER_CR3 reg:req
-	/* Move CR3 up a page to the user page tables: */
-	orq	$(PTI_SWITCH_MASK), \reg
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq    $(~PTI_SWITCH_MASK), \reg
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -212,21 +219,58 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm
 
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+#define THIS_CPU_user_pcid_flush_mask   \
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 	mov	%cr3, \scratch_reg
-	ADJUST_USER_CR3 \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq     $(PTI_SWITCH_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .endm
 
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
-	 * Is the switch bit zero?  This means the address is
-	 * up in real PAGE_TABLE_ISOLATION patches in a moment.
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
 	 */
 	testq	$(PTI_SWITCH_MASK), \scratch_reg
 	jz	.Ldone_\@
@@ -251,7 +295,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .endm
-.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2ad7ad4d3dd6..fd501844af1f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>
 
+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
 
@@ -406,7 +407,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 	popq	%rdi
 	popq	%rsp
@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 */
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 	/* Restore RDI. */
 	popq	%rdi
@@ -857,7 +858,7 @@ native_irq_return_ldt:
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
 
-	SWITCH_TO_USER_CR3 scratch_reg=%rdi	/* to user CR3 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 	SWAPGS					/* to user GS */
 	popq	%rdi				/* Restore user RDI */
 
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 05238b29895e..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -275,9 +275,9 @@ sysret32_from_system_call:
 	 * switch until after after the last reference to the process
 	 * stack.
 	 *
-	 * %r8 is zeroed before the sysret, thus safe to clobber.
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
 	 */
-	SWITCH_TO_USER_CR3 scratch_reg=%r8
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
 
 	xorq	%r8, %r8
 	xorq	%r9, %r9
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 43212a43ee69..6a60fea90b9d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 490a706fdba8..5dcc38b16604 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,8 @@
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
 
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
@@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
+
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
 
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
  * for them being zero-based.  Another -1 is because ASID 0 is reserved for
  * use by non-PCID-aware users.
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6
 
 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
 	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -95,12 +127,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -145,6 +171,13 @@ struct tlb_state {
 	 */
 	bool invalidate_other;
 
+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -249,15 +282,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 extern void initialize_tlbstate_and_flush(void);
 
+/*
+ * Given an ASID, flush the corresponding user ASID.  We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
 	 */
 	preempt_disable();
 	native_write_cr3(__native_read_cr3());
@@ -301,7 +361,14 @@ static inline void __native_flush_tlb_global(void)
  */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	invalidate_user_asid(loaded_mm_asid);
 }
 
 /*
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 53b4ca55ebb6..97abdaab9535 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 676b7cf4b62b..76417a9aab73 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af75069fb116..caeb8a7bf0a4 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -855,7 +855,7 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 42a8875f73fe..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 	unsigned long new_mm_cr3;
 
 	if (need_flush) {
+		invalidate_user_asid(new_asid);
 		new_mm_cr3 = build_cr3(pgdir, new_asid);
 	} else {
 		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);

From 21e94459110252d41b45c0c8ba50fd72a664d50c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Dec 2017 15:08:00 +0100
Subject: [PATCH 456/808] x86/mm: Optimize RESTORE_CR3

Most NMI/paranoid exceptions will not in fact change pagetables and would
thus not require TLB flushing, however RESTORE_CR3 uses flushing CR3
writes.

Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the
kernel mappings and now that we track which user PCIDs need flushing we can
avoid those too when possible.

This does mean RESTORE_CR3 needs an additional scratch_reg, luckily both
sites have plenty available.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 30 ++++++++++++++++++++++++++++--
 arch/x86/entry/entry_64.S |  4 ++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 7894e5c0eef7..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -281,8 +281,34 @@ For 32-bit we have the following conventions - kernel is built with
 .Ldone_\@:
 .endm
 
-.macro RESTORE_CR3 save_reg:req
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
 	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
 	/*
 	 * The CR3 write could be avoided when not changing its value,
 	 * but would require a CR3 read *and* a scratch register.
@@ -301,7 +327,7 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
 .endm
-.macro RESTORE_CR3 save_reg:req
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
 .endm
 
 #endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fd501844af1f..ed31d00dc5ee 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1288,7 +1288,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
-	RESTORE_CR3	save_reg=%r14
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1730,7 +1730,7 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	do_nmi
 
-	RESTORE_CR3 save_reg=%r14
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
 
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore

From 6cff64b86aaaa07f89f50498055a20e45754b0c1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:08:01 +0100
Subject: [PATCH 457/808] x86/mm: Use INVPCID for __native_flush_tlb_single()

This uses INVPCID to shoot down individual lines of the user mapping
instead of marking the entire user map as invalid. This
could/might/possibly be faster.

This for sure needs tlb_single_page_flush_ceiling to be redetermined;
esp. since INVPCID is _slow_.

A detailed performance analysis is available here:

  https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com

[ Peterz: Split out from big combo patch ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/tlbflush.h    | 23 ++++++++++-
 arch/x86/mm/init.c                 | 64 +++++++++++++++++-------------
 3 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d8ec834ea884..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,6 +197,7 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5dcc38b16604..57072a1052fe 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -85,6 +85,18 @@ static inline u16 kern_pcid(u16 asid)
 	return asid + 1;
 }
 
+/*
+ * The user PCID is just the kernel one, plus the "switch bit".
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+	return ret;
+}
+
 struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
@@ -335,6 +347,8 @@ static inline void __native_flush_tlb_global(void)
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -368,7 +382,14 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	if (!static_cpu_has(X86_FEATURE_PTI))
 		return;
 
-	invalidate_user_asid(loaded_mm_asid);
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
 }
 
 /*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index caeb8a7bf0a4..80259ad8c386 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -203,34 +203,44 @@ static void __init probe_page_size_mask(void)
 
 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }
 
 #ifdef CONFIG_X86_32

From 0a126abd576ebc6403f063dbe20cf7416c9d9393 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Dec 2017 13:34:53 +0100
Subject: [PATCH 458/808] x86/mm: Clarify the whole ASID/kernel PCID/user PCID
 naming

Ideally we'd also use sparse to enforce this separation so it becomes much
more difficult to mess up.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 55 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 57072a1052fe..b519da4fc03c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,16 +13,33 @@
 #include <asm/pti.h>
 #include <asm/processor-flags.h>
 
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
-	/*
-	 * Bump the generation count.  This also serves as a full barrier
-	 * that synchronizes with switch_mm(): callers are required to order
-	 * their read of mm_cpumask after their writes to the paging
-	 * structures.
-	 */
-	return atomic64_inc_return(&mm->context.tlb_gen);
-}
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
 
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
@@ -41,7 +58,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
- * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * for them being zero-based.  Another -1 is because PCID 0 is reserved for
  * use by non-PCID-aware users.
  */
 #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
@@ -52,6 +69,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
  */
 #define TLB_NR_DYN_ASIDS	6
 
+/*
+ * Given @asid, compute kPCID
+ */
 static inline u16 kern_pcid(u16 asid)
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
@@ -86,7 +106,7 @@ static inline u16 kern_pcid(u16 asid)
 }
 
 /*
- * The user PCID is just the kernel one, plus the "switch bit".
+ * Given @asid, compute uPCID
  */
 static inline u16 user_pcid(u16 asid)
 {
@@ -484,6 +504,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {

From 5f26d76c3fd67c48806415ef8b1116c97beff8ba Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 19 Dec 2017 22:33:46 +0100
Subject: [PATCH 459/808] x86/dumpstack: Indicate in Oops whether PTI is
 configured and enabled

CONFIG_PAGE_TABLE_ISOLATION is relatively new and intrusive feature that may
still have some corner cases which could take some time to manifest and be
fixed. It would be useful to have Oops messages indicate whether it was
enabled for building the kernel, and whether it was disabled during boot.

Example of fully enabled:

	Oops: 0001 [#1] SMP PTI

Example of enabled during build, but disabled during boot:

	Oops: 0001 [#1] SMP NOPTI

We can decide to remove this after the feature has been tested in the field
long enough.

[ tglx: Made it use boot_cpu_has() as requested by Borislav ]

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: bpetkov@suse.de
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: jkosina@suse.cz
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 36b17e0febe8..5fa110699ed2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	unsigned long sp;
 #endif
 	printk(KERN_DEFAULT
-	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
-	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "");
+	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
 
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

From 385ce0ea4c078517fa51c261882c4e72fba53005 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Mon, 4 Dec 2017 15:08:03 +0100
Subject: [PATCH 460/808] x86/mm/pti: Add Kconfig

Finally allow CONFIG_PAGE_TABLE_ISOLATION to be enabled.

PARAVIRT generally requires that the kernel not manage its own page tables.
It also means that the hypervisor and kernel must agree wholeheartedly
about what format the page tables are in and what they contain.
PAGE_TABLE_ISOLATION, unfortunately, changes the rules and they
can not be used together.

I've seen conflicting feedback from maintainers lately about whether they
want the Kconfig magic to go first or last in a patch series.  It's going
last here because the partially-applied series leads to kernels that can
not boot in a bunch of cases.  I did a run through the entire series with
CONFIG_PAGE_TABLE_ISOLATION=y to look for build errors, though.

[ tglx: Removed SMP and !PARAVIRT dependencies as they not longer exist ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 security/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/security/Kconfig b/security/Kconfig
index e8e449444e65..a623d13bf288 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.
 
+config PAGE_TABLE_ISOLATION
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64 && !UML
+	help
+	  This feature reduces the number of hardware side channels by
+	  ensuring that the majority of kernel addresses are not mapped
+	  into userspace.
+
+	  See Documentation/x86/pagetable-isolation.txt for more details.
+
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
 	depends on SECURITY && INFINIBAND

From 75298aa179d56cd64f54e58a19fffc8ab922b4c0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 4 Dec 2017 15:08:04 +0100
Subject: [PATCH 461/808] x86/mm/dump_pagetables: Add page table directory to
 the debugfs VFS hierarchy

The upcoming support for dumping the kernel and the user space page tables
of the current process would create more random files in the top level
debugfs directory.

Add a page table directory and move the existing file to it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/debug_pagetables.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..d1449fb6dc7a 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -22,21 +22,26 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *pe;
+static struct dentry *dir, *pe;
 
 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;
 
+	pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
+	if (!pe)
+		goto err;
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }
 
 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }
 
 module_init(pt_dump_debug_init);

From b4bf4f924b1d7bade38fd51b2e401d20d0956e4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:08:05 +0100
Subject: [PATCH 462/808] x86/mm/dump_pagetables: Check user space page table
 for WX pages

ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages,
but does not check the PAGE_TABLE_ISOLATION user space page table.

Restructure the code so that dmesg output is selected by an explicit
argument and not implicit via checking the pgd argument for !NULL.

Add the check for the user space page table.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h |  1 +
 arch/x86/mm/debug_pagetables.c |  2 +-
 arch/x86/mm/dump_pagetables.c  | 30 +++++++++++++++++++++++++-----
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index cc6fa75884e9..03780d5c41c5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
 #ifdef CONFIG_DEBUG_WX
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index d1449fb6dc7a..8e70c1599e51 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL);
 	return 0;
 }
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 690eaf31ca34..17f5b417f95e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -476,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -489,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}
 
 	st.check_wx = checkwx;
@@ -527,13 +527,33 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
+{
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
 
 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }
 
 static int __init pt_dump_init(void)

From a4b51ef6552c704764684cef7e753162dc87c5fa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Dec 2017 15:08:06 +0100
Subject: [PATCH 463/808] x86/mm/dump_pagetables: Allow dumping current
 pagetables

Add two debugfs files which allow to dump the pagetable of the current
task.

current_kernel dumps the regular page table. This is the page table which
is normally shared between kernel and user space. If kernel page table
isolation is enabled this is the kernel space mapping.

If kernel page table isolation is enabled the second file, current_user,
dumps the user space page table.

These files allow to verify the resulting page tables for page table
isolation, but even in the normal case its useful to be able to inspect
user space page tables of current for debugging purposes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h |  2 +-
 arch/x86/mm/debug_pagetables.c | 71 ++++++++++++++++++++++++++++++++--
 arch/x86/mm/dump_pagetables.c  |  6 ++-
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 03780d5c41c5..6b43d677f8ca 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,7 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 
 #ifdef CONFIG_DEBUG_WX
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 8e70c1599e51..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level_debugfs(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }
 
@@ -22,7 +22,57 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *dir, *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
 
 static int __init pt_dump_debug_init(void)
 {
@@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(void)
 	if (!dir)
 		return -ENOMEM;
 
-	pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
-	if (!pe)
+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
 		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
 err:
 	debugfs_remove_recursive(dir);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 17f5b417f95e..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -530,8 +530,12 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 	ptdump_walk_pgd_level_core(m, pgd, false, true);
 }
 
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd)
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
 	ptdump_walk_pgd_level_core(m, pgd, false, false);
 }
 EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

From 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Dec 2017 20:35:11 +0100
Subject: [PATCH 464/808] x86/ldt: Make the LDT mapping RO

Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is
enabled its a primary target for attacks, if a user space interface fails
to validate a write address correctly. That can never happen, right?

The SDM states:

    If the segment descriptors in the GDT or an LDT are placed in ROM, the
    processor can enter an indefinite loop if software or the processor
    attempts to update (write to) the ROM-based segment descriptors. To
    prevent this problem, set the accessed bits for all segment descriptors
    placed in a ROM. Also, remove operating-system or executive code that
    attempts to modify segment descriptors located in ROM.

So its a valid approach to set the ACCESS bit when setting up the LDT entry
and to map the table RO. Fixup the selftest so it can handle that new mode.

Remove the manual ACCESS bit setter in set_tls_desc() as this is now
pointless. Folded the patch from Peter Ziljstra.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h           |  2 ++
 arch/x86/kernel/ldt.c                 |  7 ++++++-
 arch/x86/kernel/tls.c                 | 11 ++---------
 tools/testing/selftests/x86/ldt_gdt.c |  3 +--
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index bc359dd2f7f6..85e23bb7b34e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type	       |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type	       |= 1;
 
 	desc->s			= 1;
 	desc->dpl		= 0x3;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 9629c5d8267a..579cc4a66fdf 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -158,7 +158,12 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 		ptep = get_locked_pte(mm, va, &ptl);
 		if (!ptep)
 			return -ENOMEM;
-		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+		/*
+		 * Map it RO so the easy to find address is not a primary
+		 * target via some kernel interface which misses a
+		 * permission check.
+		 */
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
 		set_pte_at(mm, va, ptep, pte);
 		pte_unmap_unlock(ptep, ptl);
 	}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9a9c9b076955..a5b802a12212 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
 	cpu = get_cpu();
 
 	while (n-- > 0) {
-		if (LDT_empty(info) || LDT_zero(info)) {
+		if (LDT_empty(info) || LDT_zero(info))
 			memset(desc, 0, sizeof(*desc));
-		} else {
+		else
 			fill_ldt(desc, info);
-
-			/*
-			 * Always set the accessed bit so that the CPU
-			 * doesn't try to write to the (read-only) GDT.
-			 */
-			desc->type |= 1;
-		}
 		++info;
 		++desc;
 	}
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 0304ffb714f2..1aef72df20a1 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
 	 * NB: Different Linux versions do different things with the
 	 * accessed bit in set_thread_area().
 	 */
-	if (ar != expected_ar &&
-	    (ldt || ar != (expected_ar | AR_ACCESSED))) {
+	if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
 		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
 		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
 		nerrs++;

From c0ee554906c3d6554fbddf95ae664cd9f817082b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Dec 2017 12:37:43 -0600
Subject: [PATCH 465/808] pid: Handle failure to allocate the first pid in a
 pid namespace

With the replacement of the pid bitmap and hashtable with an idr in
alloc_pid started occassionally failing when allocating the first pid
in a pid namespace.  Things were not completely reset resulting in
the first allocated pid getting the number 2 (not 1).  Which
further resulted in ns->proc_mnt not getting set and eventually
causing an oops in proc_flush_task.

Oops: 0000 [#1] SMP
CPU: 2 PID: 6743 Comm: trinity-c117 Not tainted 4.15.0-rc4-think+ #2
RIP: 0010:proc_flush_task+0x8e/0x1b0
RSP: 0018:ffffc9000bbffc40 EFLAGS: 00010286
RAX: 0000000000000001 RBX: 0000000000000001 RCX: 00000000fffffffb
RDX: 0000000000000000 RSI: ffffc9000bbffc50 RDI: 0000000000000000
RBP: ffffc9000bbffc63 R08: 0000000000000000 R09: 0000000000000002
R10: ffffc9000bbffb70 R11: ffffc9000bbffc64 R12: 0000000000000003
R13: 0000000000000000 R14: 0000000000000003 R15: ffff8804c10d7840
FS:  00007f7cb8965700(0000) GS:ffff88050a200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 00000003e21ae003 CR4: 00000000001606e0
DR0: 00007fb1d6c22000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
Call Trace:
 ? release_task+0xaf/0x680
 release_task+0xd2/0x680
 ? wait_consider_task+0xb82/0xce0
 wait_consider_task+0xbe9/0xce0
 ? do_wait+0xe1/0x330
 do_wait+0x151/0x330
 kernel_wait4+0x8d/0x150
 ? task_stopped_code+0x50/0x50
 SYSC_wait4+0x95/0xa0
 ? rcu_read_lock_sched_held+0x6c/0x80
 ? syscall_trace_enter+0x2d7/0x340
 ? do_syscall_64+0x60/0x210
 do_syscall_64+0x60/0x210
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x7f7cb82603aa
RSP: 002b:00007ffd60770bc8 EFLAGS: 00000246
 ORIG_RAX: 000000000000003d
RAX: ffffffffffffffda RBX: 00007f7cb6cd4000 RCX: 00007f7cb82603aa
RDX: 000000000000000b RSI: 00007ffd60770bd0 RDI: 0000000000007cca
RBP: 0000000000007cca R08: 00007f7cb8965700 R09: 00007ffd607c7080
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd60770bd0 R14: 00007f7cb6cd4058 R15: 00000000cccccccd
Code: c1 e2 04 44 8b 60 30 48 8b 40 38 44 8b 34 11 48 c7 c2 60 3a f5 81 44 89 e1 4c 8b 68 58 e8 4b b4 77 00 89 44 24 14 48 8d 74 24 10 <49> 8b 7d 00 e8 b9 6a f9 ff 48 85 c0 74 1a 48 89 c7 48 89 44 24
RIP: proc_flush_task+0x8e/0x1b0 RSP: ffffc9000bbffc40
CR2: 0000000000000000
---[ end trace 53d67a6481059862 ]---

Improve the quality of the implementation by resetting the place to
start allocating pids on failure to allocate the first pid.

As improving the quality of the implementation is the goal remove the now
unnecesarry disable_pid_allocations call when we fail to mount proc.

Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API")
Fixes: 8ef047aaaeb8 ("pid namespaces: make alloc_pid(), free_pid() and put_pid() work with struct upid")
Reported-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..1e8bb6550ec4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns)) {
-			disable_pid_allocation(ns);
+		if (pid_ns_prepare_proc(ns))
 			goto out_free;
-		}
 	}
 
 	get_pid_ns(ns);
@@ -226,6 +224,10 @@ out_free:
 	while (++i <= ns->level)
 		idr_remove(&ns->idr, (pid->numbers + i)->nr);
 
+	/* On failure to allocate the first pid, reset the state */
+	if (ns->pid_allocated == PIDNS_ADDING)
+		idr_set_cursor(&ns->idr, 0);
+
 	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);

From 464e1d5f23cca236b930ef068c328a64cab78fb1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 23 Dec 2017 20:47:16 -0800
Subject: [PATCH 466/808] Linux 4.15-rc5

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 7e02f951b284..ac8c441866b7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc4
+EXTRAVERSION = -rc5
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From 182088aa3c6c7f7c20a2c1dcc9ded4a3fc631f38 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 20 Dec 2017 23:21:28 +0000
Subject: [PATCH 467/808] phylink: ensure the PHY interface mode is
 appropriately set

When setting the ethtool settings, ensure that the validated PHY
interface mode is propagated to the current link settings, so that
2500BaseX can be selected.

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 5dc9668dde34..8d06a083ac4c 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -951,6 +951,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
 	mutex_lock(&pl->state_mutex);
 	/* Configure the MAC to match the new settings */
 	linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
+	pl->link_config.interface = config.interface;
 	pl->link_config.speed = our_kset.base.speed;
 	pl->link_config.duplex = our_kset.base.duplex;
 	pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;

From 74ee0e8c1bf9925c59cc8f1c65c29adf6e4cf603 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 20 Dec 2017 23:21:34 +0000
Subject: [PATCH 468/808] phylink: ensure AN is enabled

Ensure that we mark AN as enabled at boot time, rather than leaving
it disabled.  This is noticable if your SFP module is fiber, and
it supports faster speeds than 1G with 2.5G support in place.

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 8d06a083ac4c..827f3f92560e 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -526,6 +526,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np,
 	pl->link_config.pause = MLO_PAUSE_AN;
 	pl->link_config.speed = SPEED_UNKNOWN;
 	pl->link_config.duplex = DUPLEX_UNKNOWN;
+	pl->link_config.an_enabled = true;
 	pl->ops = ops;
 	__set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
 

From 8bea728dce8972e534e6b99fd550f7b5cc3864e8 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 25 Dec 2017 11:34:54 +0800
Subject: [PATCH 469/808] netfilter: nf_tables: fix potential NULL-ptr deref in
 nf_tables_dump_obj_done()

If there is no NFTA_OBJ_TABLE and NFTA_OBJ_TYPE, the c.data will be NULL in
nf_tables_getobj(). So before free filter->table in nf_tables_dump_obj_done(),
we need to check if filter is NULL first.

Fixes: e46abbcc05aa ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8d4526651661..07bd4138c84e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4665,8 +4665,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
 	struct nft_obj_filter *filter = cb->data;
 
-	kfree(filter->table);
-	kfree(filter);
+	if (filter) {
+		kfree(filter->table);
+		kfree(filter);
+	}
 
 	return 0;
 }

From e5a9336adb317db55eb3fe8200856096f3c71109 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Wed, 20 Dec 2017 19:36:03 +0300
Subject: [PATCH 470/808] ip6_gre: fix device features for ioctl setup

When ip6gre is created using ioctl, its features, such as
scatter-gather, GSO and tx-checksumming will be turned off:

  # ip -f inet6 tunnel add gre6 mode ip6gre remote fd00::1
  # ethtool -k gre6 (truncated output)
    tx-checksumming: off
    scatter-gather: off
    tcp-segmentation-offload: off
    generic-segmentation-offload: off [requested on]

But when netlink is used, they will be enabled:
  # ip link add gre6 type ip6gre remote fd00::1
  # ethtool -k gre6 (truncated output)
    tx-checksumming: on
    scatter-gather: on
    tcp-segmentation-offload: on
    generic-segmentation-offload: on

This results in a loss of performance when gre6 is created via ioctl.
The issue was found with LTP/gre tests.

Fix it by moving the setup of device features to a separate function
and invoke it with ndo_init callback because both netlink and ioctl
will eventually call it via register_netdevice():

   register_netdevice()
       - ndo_init() callback -> ip6gre_tunnel_init() or ip6gre_tap_init()
           - ip6gre_tunnel_init_common()
                - ip6gre_tnl_init_features()

The moved code also contains two minor style fixes:
  * removed needless tab from GRE6_FEATURES on NETIF_F_HIGHDMA line.
  * fixed the issue reported by checkpatch: "Unnecessary parentheses around
    'nt->encap.type == TUNNEL_ENCAP_NONE'"

Fixes: ac4eb009e477 ("ip6gre: Add support for basic offloads offloads excluding GSO")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 57 ++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 416c8913f132..772695960890 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1014,6 +1014,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 	eth_random_addr(dev->perm_addr);
 }
 
+#define GRE6_FEATURES (NETIF_F_SG |		\
+		       NETIF_F_FRAGLIST |	\
+		       NETIF_F_HIGHDMA |	\
+		       NETIF_F_HW_CSUM)
+
+static void ip6gre_tnl_init_features(struct net_device *dev)
+{
+	struct ip6_tnl *nt = netdev_priv(dev);
+
+	dev->features		|= GRE6_FEATURES;
+	dev->hw_features	|= GRE6_FEATURES;
+
+	if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
+		/* TCP offload with GRE SEQ is not supported, nor
+		 * can we support 2 levels of outer headers requiring
+		 * an update.
+		 */
+		if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+		    nt->encap.type == TUNNEL_ENCAP_NONE) {
+			dev->features    |= NETIF_F_GSO_SOFTWARE;
+			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+		}
+
+		/* Can use a lockless transmit, unless we generate
+		 * output sequences
+		 */
+		dev->features |= NETIF_F_LLTX;
+	}
+}
+
 static int ip6gre_tunnel_init_common(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
@@ -1048,6 +1078,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		dev->mtu -= 8;
 
+	ip6gre_tnl_init_features(dev);
+
 	return 0;
 }
 
@@ -1298,11 +1330,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
 	.ndo_get_iflink = ip6_tnl_get_iflink,
 };
 
-#define GRE6_FEATURES (NETIF_F_SG |		\
-		       NETIF_F_FRAGLIST |	\
-		       NETIF_F_HIGHDMA |		\
-		       NETIF_F_HW_CSUM)
-
 static void ip6gre_tap_setup(struct net_device *dev)
 {
 
@@ -1383,26 +1410,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	nt->net = dev_net(dev);
 	ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
 
-	dev->features		|= GRE6_FEATURES;
-	dev->hw_features	|= GRE6_FEATURES;
-
-	if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
-		/* TCP offload with GRE SEQ is not supported, nor
-		 * can we support 2 levels of outer headers requiring
-		 * an update.
-		 */
-		if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
-		    (nt->encap.type == TUNNEL_ENCAP_NONE)) {
-			dev->features    |= NETIF_F_GSO_SOFTWARE;
-			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-		}
-
-		/* Can use a lockless transmit, unless we generate
-		 * output sequences
-		 */
-		dev->features |= NETIF_F_LLTX;
-	}
-
 	err = register_netdevice(dev);
 	if (err)
 		goto out;

From c1a8d0a3accf64a014d605e6806ce05d1c17adf1 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Wed, 20 Dec 2017 18:45:10 -0600
Subject: [PATCH 471/808] net: phy: micrel: ksz9031: reconfigure autoneg after
 phy autoneg workaround

Under some circumstances driver will perform PHY reset in
ksz9031_read_status() to fix autoneg failure case (idle error count =
0xFF). When this happens ksz9031 will not detect link status change any
more when connecting to Netgear 1G switch (link can be recovered sometimes by
restarting netdevice "ifconfig down up"). Reproduced with TI am572x board
equipped with ksz9031 PHY while connecting to Netgear 1G switch.

Fix the issue by reconfiguring autonegotiation after PHY reset in
ksz9031_read_status().

Fixes: d2fd719bcb0e ("net/phy: micrel: Add workaround for bad autoneg")
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/micrel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index ab4614113403..422ff6333c52 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -624,6 +624,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
 		phydev->link = 0;
 		if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
 			phydev->drv->config_intr(phydev);
+		return genphy_config_aneg(phydev);
 	}
 
 	return 0;

From b2fb01f426883a794ed80be9110675a2d8356347 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 20 Dec 2017 23:26:24 -0800
Subject: [PATCH 472/808] net_sched: fix a missing rcu barrier in
 mini_qdisc_pair_swap()

The rcu_barrier_bh() in mini_qdisc_pair_swap() is to wait for
flying RCU callback installed by a previous mini_qdisc_pair_swap(),
however we miss it on the tp_head==NULL path, which leads to that
the RCU callback still uses miniq_old->rcu after it is freed together
with qdisc in qdisc_graft(). So just add it on that path too.

Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath ")
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Tested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cd1b200acae7..661c7144b53a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1040,6 +1040,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 
 	if (!tp_head) {
 		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
+		/* Wait for flying RCU callback before it is freed. */
+		rcu_barrier_bh();
 		return;
 	}
 
@@ -1055,7 +1057,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 	rcu_assign_pointer(*miniqp->p_miniq, miniq);
 
 	if (miniq_old)
-		/* This is counterpart of the rcu barrier above. We need to
+		/* This is counterpart of the rcu barriers above. We need to
 		 * block potential new user of miniq_old until all readers
 		 * are not seeing it.
 		 */

From 0a3d805c9c503e05d6e5d3868c53e92a06589dcf Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 21 Dec 2017 13:07:11 +0100
Subject: [PATCH 473/808] tipc: base group replicast ack counter on number of
 actual receivers

In commit 2f487712b893 ("tipc: guarantee that group broadcast doesn't
bypass group unicast") we introduced a mechanism that requires the first
(replicated) broadcast sent after a unicast to be acknowledged by all
receivers before permitting sending of the next (true) broadcast.

The counter for keeping track of the number of acknowledges to expect
is based on the tipc_group::member_cnt variable. But this misses that
some of the known members may not be ready for reception, and will never
acknowledge the message, either because they haven't fully joined the
group or because they are leaving the group. Such members are identified
by not fulfilling the condition tested for in the function
tipc_group_is_enabled().

We now set the counter for the actual number of acks to receive at the
moment the message is sent, by just counting the number of recipients
satisfying the tipc_group_is_enabled() test.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 7ebbdeb2a90e..e5b03f08f076 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -368,18 +368,20 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack)
 	u16 prev = grp->bc_snd_nxt - 1;
 	struct tipc_member *m;
 	struct rb_node *n;
+	u16 ackers = 0;
 
 	for (n = rb_first(&grp->members); n; n = rb_next(n)) {
 		m = container_of(n, struct tipc_member, tree_node);
 		if (tipc_group_is_enabled(m)) {
 			tipc_group_update_member(m, len);
 			m->bc_acked = prev;
+			ackers++;
 		}
 	}
 
 	/* Mark number of acknowledges to expect, if any */
 	if (ack)
-		grp->bc_ackers = grp->member_cnt;
+		grp->bc_ackers = ackers;
 	grp->bc_snd_nxt++;
 }
 

From 4853f128c13ed2731625dff2410b7fdbe540fb26 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 21 Dec 2017 13:13:59 +0100
Subject: [PATCH 474/808] net: sched: fix possible null pointer deref in
 tcf_block_put

We need to check block for being null in both tcf_block_put and
tcf_block_put_ext.

Fixes: 343723dd51ef ("net: sched: fix clsact init error path")
Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b91ea03e3afa..b9d63d2246e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -379,6 +379,8 @@ void tcf_block_put(struct tcf_block *block)
 {
 	struct tcf_block_ext_info ei = {0, };
 
+	if (!block)
+		return;
 	tcf_block_put_ext(block, block->q, &ei);
 }
 

From 3a33a19bf88cdfc6d982972bc6ffcf7a62c1015e Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 21 Dec 2017 14:36:34 +0100
Subject: [PATCH 475/808] tipc: fix memory leak of group member when peer node
 is lost

When a group member receives a member WITHDRAW event, this might have
two reasons: either the peer member is leaving the group, or the link
to the member's node has been lost.

In the latter case we need to issue a DOWN event to the user right away,
and let function tipc_group_filter_msg() perform delete of the member
item. However, in this case we miss to change the state of the member
item to MBR_LEAVING, so the member item is not deleted, and we have a
memory leak.

We now separate better between the four sub-cases of a WITHRAW event
and make sure that each case is handled correctly.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index e5b03f08f076..8e12ab55346b 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -850,17 +850,26 @@ void tipc_group_member_evt(struct tipc_group *grp,
 		*usr_wakeup = true;
 		m->usr_pending = false;
 		node_up = tipc_node_is_up(net, node);
+		m->event_msg = NULL;
 
-		/* Hold back event if more messages might be expected */
-		if (m->state != MBR_LEAVING && node_up) {
-			m->event_msg = skb;
-			tipc_group_decr_active(grp, m);
-			m->state = MBR_LEAVING;
-		} else {
-			if (node_up)
+		if (node_up) {
+			/* Hold back event if a LEAVE msg should be expected */
+			if (m->state != MBR_LEAVING) {
+				m->event_msg = skb;
+				tipc_group_decr_active(grp, m);
+				m->state = MBR_LEAVING;
+			} else {
 				msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
-			else
+				__skb_queue_tail(inputq, skb);
+			}
+		} else {
+			if (m->state != MBR_LEAVING) {
+				tipc_group_decr_active(grp, m);
+				m->state = MBR_LEAVING;
 				msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt);
+			} else {
+				msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
+			}
 			__skb_queue_tail(inputq, skb);
 		}
 		list_del_init(&m->list);

From 47c332deb8e89f6c59b0bb2615945c6e7fad1a60 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 5 Dec 2017 09:36:14 +0100
Subject: [PATCH 476/808] hwmon: Deal with errors from the thermal subsystem

If the thermal subsystem returne -EPROBE_DEFER or any other error
when hwmon calls devm_thermal_zone_of_sensor_register(), this is
silently ignored.

I ran into this with an incorrectly defined thermal zone, making
it non-existing and thus this call failed with -EPROBE_DEFER
assuming it would appear later. The sensor was still added
which is incorrect: sensors must strictly be added after the
thermal zones, so deferred probe must be respected.

Fixes: d560168b5d0f ("hwmon: (core) New hwmon registration API")
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/hwmon.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index c9790e2c3440..af5123042990 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -143,6 +143,7 @@ static int hwmon_thermal_add_sensor(struct device *dev,
 				    struct hwmon_device *hwdev, int index)
 {
 	struct hwmon_thermal_data *tdata;
+	struct thermal_zone_device *tzd;
 
 	tdata = devm_kzalloc(dev, sizeof(*tdata), GFP_KERNEL);
 	if (!tdata)
@@ -151,8 +152,14 @@ static int hwmon_thermal_add_sensor(struct device *dev,
 	tdata->hwdev = hwdev;
 	tdata->index = index;
 
-	devm_thermal_zone_of_sensor_register(&hwdev->dev, index, tdata,
-					     &hwmon_thermal_ops);
+	tzd = devm_thermal_zone_of_sensor_register(&hwdev->dev, index, tdata,
+						   &hwmon_thermal_ops);
+	/*
+	 * If CONFIG_THERMAL_OF is disabled, this returns -ENODEV,
+	 * so ignore that error but forward any other error.
+	 */
+	if (IS_ERR(tzd) && (PTR_ERR(tzd) != -ENODEV))
+		return PTR_ERR(tzd);
 
 	return 0;
 }
@@ -621,14 +628,20 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata,
 				if (!chip->ops->is_visible(drvdata, hwmon_temp,
 							   hwmon_temp_input, j))
 					continue;
-				if (info[i]->config[j] & HWMON_T_INPUT)
-					hwmon_thermal_add_sensor(dev, hwdev, j);
+				if (info[i]->config[j] & HWMON_T_INPUT) {
+					err = hwmon_thermal_add_sensor(dev,
+								hwdev, j);
+					if (err)
+						goto free_device;
+				}
 			}
 		}
 	}
 
 	return hdev;
 
+free_device:
+	device_unregister(hdev);
 free_hwmon:
 	kfree(hwdev);
 ida_remove:

From 6a6b0b9914e73a8a54253dd5f6f5e5dd5e4a756c Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
Date: Thu, 21 Dec 2017 10:29:09 -0800
Subject: [PATCH 477/808] tcp: Avoid preprocessor directives in tracepoint
 macro args

Using a preprocessor directive to check for CONFIG_IPV6 in the middle of
a DECLARE_EVENT_CLASS macro's arg list causes sparse to report a series
of errors:

./include/trace/events/tcp.h:68:1: error: directive in argument list
./include/trace/events/tcp.h:75:1: error: directive in argument list
./include/trace/events/tcp.h:144:1: error: directive in argument list
./include/trace/events/tcp.h:151:1: error: directive in argument list
./include/trace/events/tcp.h:216:1: error: directive in argument list
./include/trace/events/tcp.h:223:1: error: directive in argument list
./include/trace/events/tcp.h:274:1: error: directive in argument list
./include/trace/events/tcp.h:281:1: error: directive in argument list

Once sparse finds an error, it stops printing warnings for the file it
is checking. This masks any sparse warnings that would normally be
reported for the core TCP code.

Instead, handle the preprocessor conditionals in a couple of auxiliary
macros. This also has the benefit of reducing duplicate code.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 97 +++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 60 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..ab34c561f26b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -25,6 +25,35 @@
 		tcp_state_name(TCP_CLOSING),		\
 		tcp_state_name(TCP_NEW_SYN_RECV))
 
+#define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
+	do {							\
+		struct in6_addr *pin6;				\
+								\
+		pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+		ipv6_addr_set_v4mapped(saddr, pin6);		\
+		pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+		ipv6_addr_set_v4mapped(daddr, pin6);		\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)		\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct in6_addr *pin6;				\
+									\
+			pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+			*pin6 = saddr6;					\
+			pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+			*pin6 = daddr6;					\
+		} else {						\
+			TP_STORE_V4MAPPED(__entry, saddr, daddr);	\
+		}							\
+	} while (0)
+#else
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)	\
+	TP_STORE_V4MAPPED(__entry, saddr, daddr)
+#endif
+
 /*
  * tcp event with arguments sk and skb
  *
@@ -50,7 +79,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skbaddr = skb;
@@ -65,20 +93,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -127,7 +143,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -141,20 +156,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -197,7 +200,6 @@ TRACE_EVENT(tcp_set_state,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -213,20 +215,8 @@ TRACE_EVENT(tcp_set_state,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
@@ -256,7 +246,6 @@ TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_fast_assign(
 		struct inet_request_sock *ireq = inet_rsk(req);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -271,20 +260,8 @@ TRACE_EVENT(tcp_retransmit_synack,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 = ireq->ir_rmt_addr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = ireq->ir_v6_loc_addr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = ireq->ir_v6_rmt_addr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
+			      ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",

From 756efe131088b6e6e7f0124ff9c4e1f0165d3140 Mon Sep 17 00:00:00 2001
From: Dong Aisheng <aisheng.dong@nxp.com>
Date: Fri, 22 Dec 2017 17:46:04 +0800
Subject: [PATCH 478/808] clk: use atomic runtime pm api in clk_core_is_enabled

Current clk_pm_runtime_put is using pm_runtime_put_sync which
is not safe to be called in clk_core_is_enabled as it should
be able to run in atomic context.

Thus use pm_runtime_put instead which is atomic safe.

Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Fixes: 9a34b45397e5 ("clk: Add support for runtime PM")
Signed-off-by: Dong Aisheng <aisheng.dong@nxp.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 8a1860a36c77..b56c11f51baf 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -220,7 +220,8 @@ static bool clk_core_is_enabled(struct clk_core *core)
 
 	ret = core->ops->is_enabled(core->hw);
 done:
-	clk_pm_runtime_put(core);
+	if (core->dev)
+		pm_runtime_put(core->dev);
 
 	return ret;
 }

From 44be77c590f381bc629815ac789b8b15ecc4ddcf Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 27 Dec 2017 08:53:59 +0100
Subject: [PATCH 479/808] ALSA: hda - Fix missing COEF init for ALC225/295/299

There was a long-standing problem on HP Spectre X360 with Kabylake
where it lacks of the front speaker output in some situations.  Also
there are other products showing the similar behavior.  The culprit
seems to be the missing COEF setup on ALC codecs, ALC225/295/299,
which are all compatible.

This patch adds the proper COEF setup (to initialize idx 0x67 / bits
0x3000) for addressing the issue.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195457
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 1522ba31e16d..8fd2d9c62c96 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -324,8 +324,12 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
 	case 0x10ec0292:
 		alc_update_coef_idx(codec, 0x4, 1<<15, 0);
 		break;
-	case 0x10ec0215:
 	case 0x10ec0225:
+	case 0x10ec0295:
+	case 0x10ec0299:
+		alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000);
+		/* fallthrough */
+	case 0x10ec0215:
 	case 0x10ec0233:
 	case 0x10ec0236:
 	case 0x10ec0255:
@@ -336,10 +340,8 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
 	case 0x10ec0286:
 	case 0x10ec0288:
 	case 0x10ec0285:
-	case 0x10ec0295:
 	case 0x10ec0298:
 	case 0x10ec0289:
-	case 0x10ec0299:
 		alc_update_coef_idx(codec, 0x10, 1<<9, 0);
 		break;
 	case 0x10ec0275:

From c6a36ad383559a60a249aa6016cebf3cb8b6c485 Mon Sep 17 00:00:00 2001
From: Max Schulze <max.schulze@posteo.de>
Date: Wed, 20 Dec 2017 20:47:44 +0100
Subject: [PATCH 480/808] USB: serial: ftdi_sio: add id for Airbus DS P8GR

Add AIRBUS_DS_P8GR device IDs to ftdi_sio driver.

Signed-off-by: Max Schulze <max.schulze@posteo.de>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/ftdi_sio.c     | 1 +
 drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 1aba9105b369..fc68952c994a 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1013,6 +1013,7 @@ static const struct usb_device_id id_table_combined[] = {
 		.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
 	{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
 	{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
+	{ USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
 	{ }					/* Terminating entry */
 };
 
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 4faa09fe308c..8b4ecd2bd297 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -914,6 +914,12 @@
 #define ICPDAS_I7561U_PID		0x0104
 #define ICPDAS_I7563U_PID		0x0105
 
+/*
+ * Airbus Defence and Space
+ */
+#define AIRBUS_DS_VID			0x1e8e  /* Vendor ID */
+#define AIRBUS_DS_P8GR			0x6001  /* Tetra P8GR */
+
 /*
  * RT Systems programming cables for various ham radios
  */

From 052f71e25a7ecd80a9567b291df8ea333d9a8565 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Thu, 21 Dec 2017 15:06:13 +0200
Subject: [PATCH 481/808] xhci: Fix xhci debugfs NULL pointer dereference in
 resume from hibernate

Free the virt_device and its debugfs_private member together.

When resuming from hibernate the .free_dev callback unconditionally
freed the debugfs_private member, but could leave virt_device intact.

This triggered a NULL pointer dereference after resume when usbmuxd
sent a USBDEVFS_SETCONFIGURATION ioctl to a device, trying to add a
endpoint debugfs entry to a already freed debugfs_private pointer.

Fixes: 02b6fdc2a153 ("usb: xhci: Add debugfs interface for xHCI driver")
Reported-by: Alexander Kappner <agk@godking.net>
Tested-by: Alexander Kappner <agk@godking.net>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 2424d3020ca3..da6dbe3ebd8b 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -3525,8 +3525,6 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 	struct xhci_slot_ctx *slot_ctx;
 	int i, ret;
 
-	xhci_debugfs_remove_slot(xhci, udev->slot_id);
-
 #ifndef CONFIG_USB_DEFAULT_PERSIST
 	/*
 	 * We called pm_runtime_get_noresume when the device was attached.
@@ -3555,8 +3553,10 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 	}
 
 	ret = xhci_disable_slot(xhci, udev->slot_id);
-	if (ret)
+	if (ret) {
+		xhci_debugfs_remove_slot(xhci, udev->slot_id);
 		xhci_free_virt_device(xhci, udev->slot_id);
+	}
 }
 
 int xhci_disable_slot(struct xhci_hcd *xhci, u32 slot_id)

From dde634057da71a3505d7a6c0b77bb24ded6728c8 Mon Sep 17 00:00:00 2001
From: Alexander Kappner <agk@godking.net>
Date: Thu, 21 Dec 2017 15:06:14 +0200
Subject: [PATCH 482/808] xhci: Fix use-after-free in xhci debugfs

Trying to read from debugfs after the system has resumed from
hibernate causes a use-after-free and thus a protection fault.

Steps to reproduce:
Hibernate system, resume from hibernate, then run
$ cat /sys/kernel/debug/usb/xhci/*/command-ring/enqueue

[ 3902.765086] general protection fault: 0000 [#1] PREEMPT SMP
...
[ 3902.765136] RIP: 0010:xhci_trb_virt_to_dma.part.50+0x5/0x30
...
[ 3902.765178] Call Trace:
[ 3902.765188]  xhci_ring_enqueue_show+0x1e/0x40
[ 3902.765197]  seq_read+0xdb/0x3a0
[ 3902.765204]  ? __handle_mm_fault+0x5fb/0x1210
[ 3902.765211]  full_proxy_read+0x4a/0x70
[ 3902.765219]  __vfs_read+0x23/0x120
[ 3902.765228]  vfs_read+0x8e/0x130
[ 3902.765235]  SyS_read+0x42/0x90
[ 3902.765242]  do_syscall_64+0x6b/0x290
[ 3902.765251]  entry_SYSCALL64_slow_path+0x25/0x25

The issue is caused by the xhci ring structures being reallocated
when the system is resumed, but pointers to the old structures
being retained in the debugfs files "private" field:

The proposed patch fixes this issue by storing a pointer to the xhci_ring
field in the xhci device structure in debugfs rather than directly
storing a pointer to the xhci_ring.

Fixes: 02b6fdc2a153 ("usb: xhci: Add debugfs interface for xHCI driver")
Signed-off-by: Alexander Kappner <agk@godking.net>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-debugfs.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/usb/host/xhci-debugfs.c b/drivers/usb/host/xhci-debugfs.c
index 4f7895dbcf88..e26e685d8a57 100644
--- a/drivers/usb/host/xhci-debugfs.c
+++ b/drivers/usb/host/xhci-debugfs.c
@@ -162,7 +162,7 @@ static void xhci_debugfs_extcap_regset(struct xhci_hcd *xhci, int cap_id,
 static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
 {
 	dma_addr_t		dma;
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	dma = xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue);
 	seq_printf(s, "%pad\n", &dma);
@@ -173,7 +173,7 @@ static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
 static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
 {
 	dma_addr_t		dma;
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	dma = xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
 	seq_printf(s, "%pad\n", &dma);
@@ -183,7 +183,7 @@ static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
 
 static int xhci_ring_cycle_show(struct seq_file *s, void *unused)
 {
-	struct xhci_ring	*ring = s->private;
+	struct xhci_ring	*ring = *(struct xhci_ring **)s->private;
 
 	seq_printf(s, "%d\n", ring->cycle_state);
 
@@ -346,7 +346,7 @@ static void xhci_debugfs_create_files(struct xhci_hcd *xhci,
 }
 
 static struct dentry *xhci_debugfs_create_ring_dir(struct xhci_hcd *xhci,
-						   struct xhci_ring *ring,
+						   struct xhci_ring **ring,
 						   const char *name,
 						   struct dentry *parent)
 {
@@ -387,7 +387,7 @@ void xhci_debugfs_create_endpoint(struct xhci_hcd *xhci,
 
 	snprintf(epriv->name, sizeof(epriv->name), "ep%02d", ep_index);
 	epriv->root = xhci_debugfs_create_ring_dir(xhci,
-						   dev->eps[ep_index].new_ring,
+						   &dev->eps[ep_index].new_ring,
 						   epriv->name,
 						   spriv->root);
 	spriv->eps[ep_index] = epriv;
@@ -423,7 +423,7 @@ void xhci_debugfs_create_slot(struct xhci_hcd *xhci, int slot_id)
 	priv->dev = dev;
 	dev->debugfs_private = priv;
 
-	xhci_debugfs_create_ring_dir(xhci, dev->eps[0].ring,
+	xhci_debugfs_create_ring_dir(xhci, &dev->eps[0].ring,
 				     "ep00", priv->root);
 
 	xhci_debugfs_create_context_files(xhci, priv->root, slot_id);
@@ -488,11 +488,11 @@ void xhci_debugfs_init(struct xhci_hcd *xhci)
 				   ARRAY_SIZE(xhci_extcap_dbc),
 				   "reg-ext-dbc");
 
-	xhci_debugfs_create_ring_dir(xhci, xhci->cmd_ring,
+	xhci_debugfs_create_ring_dir(xhci, &xhci->cmd_ring,
 				     "command-ring",
 				     xhci->debugfs_root);
 
-	xhci_debugfs_create_ring_dir(xhci, xhci->event_ring,
+	xhci_debugfs_create_ring_dir(xhci, &xhci->event_ring,
 				     "event-ring",
 				     xhci->debugfs_root);
 

From da99706689481717998d1d48edd389f339eea979 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Thu, 21 Dec 2017 15:06:15 +0200
Subject: [PATCH 483/808] usb: xhci: Add XHCI_TRUST_TX_LENGTH for Renesas
 uPD720201

When plugging in a USB webcam I see the following message:
xhci_hcd 0000:04:00.0: WARN Successful completion on short TX: needs
XHCI_TRUST_TX_LENGTH quirk?
handle_tx_event: 913 callbacks suppressed

All is quiet again with this patch (and I've done a fair but of soak
testing with the camera since).

Cc: <stable@vger.kernel.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/xhci-pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 7ef1274ef7f7..1aad89b8aba0 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -177,6 +177,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
 		xhci->quirks |= XHCI_TRUST_TX_LENGTH;
 		xhci->quirks |= XHCI_BROKEN_STREAMS;
 	}
+	if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
+			pdev->device == 0x0014)
+		xhci->quirks |= XHCI_TRUST_TX_LENGTH;
 	if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
 			pdev->device == 0x0015)
 		xhci->quirks |= XHCI_RESET_ON_RESUME;

From 14e138a86f6347c6199f610576d2e11c03bec5f0 Mon Sep 17 00:00:00 2001
From: Avinash Repaka <avinash.repaka@oracle.com>
Date: Thu, 21 Dec 2017 20:17:04 -0800
Subject: [PATCH 484/808] RDS: Check cmsg_len before dereferencing CMSG_DATA

RDS currently doesn't check if the length of the control message is
large enough to hold the required data, before dereferencing the control
message data. This results in following crash:

BUG: KASAN: stack-out-of-bounds in rds_rdma_bytes net/rds/send.c:1013
[inline]
BUG: KASAN: stack-out-of-bounds in rds_sendmsg+0x1f02/0x1f90
net/rds/send.c:1066
Read of size 8 at addr ffff8801c928fb70 by task syzkaller455006/3157

CPU: 0 PID: 3157 Comm: syzkaller455006 Not tainted 4.15.0-rc3+ #161
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:53
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x25b/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 rds_rdma_bytes net/rds/send.c:1013 [inline]
 rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066
 sock_sendmsg_nosec net/socket.c:628 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:638
 ___sys_sendmsg+0x320/0x8b0 net/socket.c:2018
 __sys_sendmmsg+0x1ee/0x620 net/socket.c:2108
 SYSC_sendmmsg net/socket.c:2139 [inline]
 SyS_sendmmsg+0x35/0x60 net/socket.c:2134
 entry_SYSCALL_64_fastpath+0x1f/0x96
RIP: 0033:0x43fe49
RSP: 002b:00007fffbe244ad8 EFLAGS: 00000217 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fe49
RDX: 0000000000000001 RSI: 000000002020c000 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 00000000004017b0
R13: 0000000000401840 R14: 0000000000000000 R15: 0000000000000000

To fix this, we verify that the cmsg_len is large enough to hold the
data to be read, before proceeding further.

Reported-by: syzbot <syzkaller-bugs@googlegroups.com>
Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/rds/send.c b/net/rds/send.c
index b52cdc8ae428..f72466c63f0c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
 			continue;
 
 		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+			if (cmsg->cmsg_len <
+			    CMSG_LEN(sizeof(struct rds_rdma_args)))
+				return -EINVAL;
 			args = CMSG_DATA(cmsg);
 			*rdma_bytes += args->remote_vec.bytes;
 		}

From 19142551b2be4a9e13838099fde1351386e5e007 Mon Sep 17 00:00:00 2001
From: Tommi Rantala <tommi.t.rantala@nokia.com>
Date: Fri, 22 Dec 2017 09:35:16 +0200
Subject: [PATCH 485/808] tipc: error path leak fixes in tipc_enable_bearer()

Fix memory leak in tipc_enable_bearer() if enable_media() fails, and
cleanup with bearer_disable() if tipc_mon_create() fails.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 47ec121574ce..c8001471da6c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -324,6 +324,7 @@ restart:
 	if (res) {
 		pr_warn("Bearer <%s> rejected, enable failure (%d)\n",
 			name, -res);
+		kfree(b);
 		return -EINVAL;
 	}
 
@@ -347,8 +348,10 @@ restart:
 	if (skb)
 		tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
 
-	if (tipc_mon_create(net, bearer_id))
+	if (tipc_mon_create(net, bearer_id)) {
+		bearer_disable(net, b);
 		return -ENOMEM;
+	}
 
 	pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
 		name,

From 642a8439ddd8423b92f2e71960afe21ee1f66bb6 Mon Sep 17 00:00:00 2001
From: Tommi Rantala <tommi.t.rantala@nokia.com>
Date: Fri, 22 Dec 2017 09:35:17 +0200
Subject: [PATCH 486/808] tipc: fix tipc_mon_delete() oops in
 tipc_enable_bearer() error path

Calling tipc_mon_delete() before the monitor has been created will oops.
This can happen in tipc_enable_bearer() error path if tipc_disc_create()
fails.

[   48.589074] BUG: unable to handle kernel paging request at 0000000000001008
[   48.590266] IP: tipc_mon_delete+0xea/0x270 [tipc]
[   48.591223] PGD 1e60c5067 P4D 1e60c5067 PUD 1eb0cf067 PMD 0
[   48.592230] Oops: 0000 [#1] SMP KASAN
[   48.595610] CPU: 5 PID: 1199 Comm: tipc Tainted: G    B            4.15.0-rc4-pc64-dirty #5
[   48.597176] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
[   48.598489] RIP: 0010:tipc_mon_delete+0xea/0x270 [tipc]
[   48.599347] RSP: 0018:ffff8801d827f668 EFLAGS: 00010282
[   48.600705] RAX: ffff8801ee813f00 RBX: 0000000000000204 RCX: 0000000000000000
[   48.602183] RDX: 1ffffffff1de6a75 RSI: 0000000000000297 RDI: 0000000000000297
[   48.604373] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff1dd1533
[   48.605607] R10: ffffffff8eafbb05 R11: fffffbfff1dd1534 R12: 0000000000000050
[   48.607082] R13: dead000000000200 R14: ffffffff8e73f310 R15: 0000000000001020
[   48.608228] FS:  00007fc686484800(0000) GS:ffff8801f5540000(0000) knlGS:0000000000000000
[   48.610189] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   48.611459] CR2: 0000000000001008 CR3: 00000001dda70002 CR4: 00000000003606e0
[   48.612759] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   48.613831] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   48.615038] Call Trace:
[   48.615635]  tipc_enable_bearer+0x415/0x5e0 [tipc]
[   48.620623]  tipc_nl_bearer_enable+0x1ab/0x200 [tipc]
[   48.625118]  genl_family_rcv_msg+0x36b/0x570
[   48.631233]  genl_rcv_msg+0x5a/0xa0
[   48.631867]  netlink_rcv_skb+0x1cc/0x220
[   48.636373]  genl_rcv+0x24/0x40
[   48.637306]  netlink_unicast+0x29c/0x350
[   48.639664]  netlink_sendmsg+0x439/0x590
[   48.642014]  SYSC_sendto+0x199/0x250
[   48.649912]  do_syscall_64+0xfd/0x2c0
[   48.650651]  entry_SYSCALL64_slow_path+0x25/0x25
[   48.651843] RIP: 0033:0x7fc6859848e3
[   48.652539] RSP: 002b:00007ffd25dff938 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
[   48.654003] RAX: ffffffffffffffda RBX: 00007ffd25dff990 RCX: 00007fc6859848e3
[   48.655303] RDX: 0000000000000054 RSI: 00007ffd25dff990 RDI: 0000000000000003
[   48.656512] RBP: 00007ffd25dff980 R08: 00007fc685c35fc0 R09: 000000000000000c
[   48.657697] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000d13010
[   48.658840] R13: 00007ffd25e009c0 R14: 0000000000000000 R15: 0000000000000000
[   48.662972] RIP: tipc_mon_delete+0xea/0x270 [tipc] RSP: ffff8801d827f668
[   48.664073] CR2: 0000000000001008
[   48.664576] ---[ end trace e811818d54d5ce88 ]---

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/monitor.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 8e884ed06d4b..32dc33a94bc7 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -642,9 +642,13 @@ void tipc_mon_delete(struct net *net, int bearer_id)
 {
 	struct tipc_net *tn = tipc_net(net);
 	struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
-	struct tipc_peer *self = get_self(net, bearer_id);
+	struct tipc_peer *self;
 	struct tipc_peer *peer, *tmp;
 
+	if (!mon)
+		return;
+
+	self = get_self(net, bearer_id);
 	write_lock_bh(&mon->lock);
 	tn->monitors[bearer_id] = NULL;
 	list_for_each_entry_safe(peer, tmp, &self->list, list) {

From 178e5f57a8d8f8fc5799a624b96fc31ef9a29ffa Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Fri, 22 Dec 2017 17:12:09 +0800
Subject: [PATCH 487/808] net: fec: unmap the xmit buffer that are not
 transferred by DMA

The enet IP only support 32 bit, it will use swiotlb buffer to do dma
mapping when xmit buffer DMA memory address is bigger than 4G in i.MX
platform. After stress suspend/resume test, it will print out:

log:
[12826.352864] fec 5b040000.ethernet: swiotlb buffer is full (sz: 191 bytes)
[12826.359676] DMA: Out of SW-IOMMU space for 191 bytes at device 5b040000.ethernet
[12826.367110] fec 5b040000.ethernet eth0: Tx DMA memory map failed

The issue is that the ready xmit buffers that are dma mapped but DMA still
don't copy them into fifo, once MAC restart, these DMA buffers are not unmapped.
So it should check the dma mapping buffer and unmap them.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 610573855213..8184d2fca9be 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
 		for (i = 0; i < txq->bd.ring_size; i++) {
 			/* Initialize the BD for every fragment in the page. */
 			bdp->cbd_sc = cpu_to_fec16(0);
+			if (bdp->cbd_bufaddr &&
+			    !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
+				dma_unmap_single(&fep->pdev->dev,
+						 fec32_to_cpu(bdp->cbd_bufaddr),
+						 fec16_to_cpu(bdp->cbd_datlen),
+						 DMA_TO_DEVICE);
 			if (txq->tx_skbuff[i]) {
 				dev_kfree_skb_any(txq->tx_skbuff[i]);
 				txq->tx_skbuff[i] = NULL;

From 5a8bae9761dc5dd409ff5c3a529b2801bd0dac3a Mon Sep 17 00:00:00 2001
From: Siva Reddy Kallam <siva.kallam@broadcom.com>
Date: Fri, 22 Dec 2017 16:05:27 +0530
Subject: [PATCH 488/808] tg3: Update copyright

Signed-off-by: Siva Reddy Kallam <siva.kallam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 6 ++++--
 drivers/net/ethernet/broadcom/tg3.h | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d09c5a9c53b5..5fe8d9b05f31 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -4,11 +4,13 @@
  * Copyright (C) 2001, 2002, 2003, 2004 David S. Miller (davem@redhat.com)
  * Copyright (C) 2001, 2002, 2003 Jeff Garzik (jgarzik@pobox.com)
  * Copyright (C) 2004 Sun Microsystems Inc.
- * Copyright (C) 2005-2014 Broadcom Corporation.
+ * Copyright (C) 2005-2016 Broadcom Corporation.
+ * Copyright (C) 2016-2017 Broadcom Limited.
  *
  * Firmware is:
  *	Derived from proprietary unpublished source code,
- *	Copyright (C) 2000-2003 Broadcom Corporation.
+ *	Copyright (C) 2000-2016 Broadcom Corporation.
+ *	Copyright (C) 2016-2017 Broadcom Ltd.
  *
  *	Permission is hereby granted for the distribution of this firmware
  *	data in hexadecimal or equivalent format, provided this copyright
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index c2d02d02d1e6..3d60fc7a2da6 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -5,7 +5,8 @@
  * Copyright (C) 2001, 2002, 2003, 2004 David S. Miller (davem@redhat.com)
  * Copyright (C) 2001 Jeff Garzik (jgarzik@pobox.com)
  * Copyright (C) 2004 Sun Microsystems Inc.
- * Copyright (C) 2007-2014 Broadcom Corporation.
+ * Copyright (C) 2007-2016 Broadcom Corporation.
+ * Copyright (C) 2016-2017 Broadcom Limited.
  */
 
 #ifndef _T3_H

From 4419bb1cedcda0272e1dc410345c5a1d1da0e367 Mon Sep 17 00:00:00 2001
From: Siva Reddy Kallam <siva.kallam@broadcom.com>
Date: Fri, 22 Dec 2017 16:05:28 +0530
Subject: [PATCH 489/808] tg3: Add workaround to restrict 5762 MRRS to 2048

One of AMD based server with 5762 hangs with jumbo frame traffic.
This AMD platform has southbridge limitation which is restricting MRRS
to 4000. As a work around, driver to restricts the MRRS to 2048 for
this particular 5762 NX1 card.

Signed-off-by: Siva Reddy Kallam <siva.kallam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 10 ++++++++++
 drivers/net/ethernet/broadcom/tg3.h |  4 ++++
 2 files changed, 14 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 5fe8d9b05f31..a0caa71a8c3b 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -10054,6 +10054,16 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
 
 	tw32(GRC_MODE, tp->grc_mode | val);
 
+	/* On one of the AMD platform, MRRS is restricted to 4000 because of
+	 * south bridge limitation. As a workaround, Driver is setting MRRS
+	 * to 2048 instead of default 4096.
+	 */
+	if (tp->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL &&
+	    tp->pdev->subsystem_device == TG3PCI_SUBDEVICE_ID_DELL_5762) {
+		val = tr32(TG3PCI_DEV_STATUS_CTRL) & ~MAX_READ_REQ_MASK;
+		tw32(TG3PCI_DEV_STATUS_CTRL, val | MAX_READ_REQ_SIZE_2048);
+	}
+
 	/* Setup the timer prescalar register.  Clock is always 66Mhz. */
 	val = tr32(GRC_MISC_CFG);
 	val &= ~0xff;
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 3d60fc7a2da6..1f0271fa7c74 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -97,6 +97,7 @@
 #define TG3PCI_SUBDEVICE_ID_DELL_JAGUAR		0x0106
 #define TG3PCI_SUBDEVICE_ID_DELL_MERLOT		0x0109
 #define TG3PCI_SUBDEVICE_ID_DELL_SLIM_MERLOT	0x010a
+#define TG3PCI_SUBDEVICE_ID_DELL_5762		0x07f0
 #define TG3PCI_SUBVENDOR_ID_COMPAQ		PCI_VENDOR_ID_COMPAQ
 #define TG3PCI_SUBDEVICE_ID_COMPAQ_BANSHEE	0x007c
 #define TG3PCI_SUBDEVICE_ID_COMPAQ_BANSHEE_2	0x009a
@@ -282,6 +283,9 @@
 #define TG3PCI_STD_RING_PROD_IDX	0x00000098 /* 64-bit */
 #define TG3PCI_RCV_RET_RING_CON_IDX	0x000000a0 /* 64-bit */
 /* 0xa8 --> 0xb8 unused */
+#define TG3PCI_DEV_STATUS_CTRL		0x000000b4
+#define  MAX_READ_REQ_SIZE_2048		 0x00004000
+#define  MAX_READ_REQ_MASK		 0x00007000
 #define TG3PCI_DUAL_MAC_CTRL		0x000000b8
 #define  DUAL_MAC_CTRL_CH_MASK		 0x00000003
 #define  DUAL_MAC_CTRL_ID		 0x00000004

From e60ee41aaf898584205a6af5c996860d0fe6a836 Mon Sep 17 00:00:00 2001
From: Siva Reddy Kallam <siva.kallam@broadcom.com>
Date: Fri, 22 Dec 2017 16:05:29 +0530
Subject: [PATCH 490/808] tg3: Enable PHY reset in MTU change path for 5720

A customer noticed RX path hang when MTU is changed on the fly while
running heavy traffic with NCSI enabled for 5717 and 5719. Since 5720
belongs to same ASIC family, we observed same issue and same fix
could solve this problem for 5720.

Signed-off-by: Siva Reddy Kallam <siva.kallam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index a0caa71a8c3b..8995cfefbfcf 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14239,7 +14239,8 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
 	 */
 	if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
 	    tg3_asic_rev(tp) == ASIC_REV_5717 ||
-	    tg3_asic_rev(tp) == ASIC_REV_5719)
+	    tg3_asic_rev(tp) == ASIC_REV_5719 ||
+	    tg3_asic_rev(tp) == ASIC_REV_5720)
 		reset_phy = true;
 
 	err = tg3_restart_hw(tp, reset_phy);

From f7084059a9cb9e56a186e1677b1dcffd76c2cd24 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Fri, 22 Dec 2017 13:01:39 -0200
Subject: [PATCH 491/808] bnx2x: Improve reliability in case of nested PCI
 errors

While in recovery process of PCI error (called EEH on PowerPC arch),
another PCI transaction could be corrupted causing a situation of
nested PCI errors. Also, this scenario could be reproduced with
error injection mechanisms (for debug purposes).

We observe that in case of nested PCI errors, bnx2x might attempt to
initialize its shmem and cause a kernel crash due to bad addresses
read from MCP. Multiple different stack traces were observed depending
on the point the second PCI error happens.

This patch avoids the crashes by:

 * failing PCI recovery in case of nested errors (since multiple
 PCI errors in a row are not expected to lead to a functional
 adapter anyway), and by,

 * preventing access to adapter FW when MCP is failed (we mark it as
 failed when shmem cannot get initialized properly).

Reported-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Acked-by: Shahed Shaikh <Shahed.Shaikh@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 4c739d5355d2..8ae269ec17a1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3030,7 +3030,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 
 	del_timer_sync(&bp->timer);
 
-	if (IS_PF(bp)) {
+	if (IS_PF(bp) && !BP_NOMCP(bp)) {
 		/* Set ALWAYS_ALIVE bit in shmem */
 		bp->fw_drv_pulse_wr_seq |= DRV_PULSE_ALWAYS_ALIVE;
 		bnx2x_drv_pulse(bp);
@@ -3116,7 +3116,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
 	bp->cnic_loaded = false;
 
 	/* Clear driver version indication in shmem */
-	if (IS_PF(bp))
+	if (IS_PF(bp) && !BP_NOMCP(bp))
 		bnx2x_update_mng_version(bp);
 
 	/* Check if there are pending parity attentions. If there are - set
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 91e2a7560b48..ddd5d3ebd201 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9578,6 +9578,15 @@ static int bnx2x_init_shmem(struct bnx2x *bp)
 
 	do {
 		bp->common.shmem_base = REG_RD(bp, MISC_REG_SHARED_MEM_ADDR);
+
+		/* If we read all 0xFFs, means we are in PCI error state and
+		 * should bail out to avoid crashes on adapter's FW reads.
+		 */
+		if (bp->common.shmem_base == 0xFFFFFFFF) {
+			bp->flags |= NO_MCP_FLAG;
+			return -ENODEV;
+		}
+
 		if (bp->common.shmem_base) {
 			val = SHMEM_RD(bp, validity_map[BP_PORT(bp)]);
 			if (val & SHR_MEM_VALIDITY_MB)
@@ -14320,7 +14329,10 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
 		BNX2X_ERR("IO slot reset --> driver unload\n");
 
 		/* MCP should have been reset; Need to wait for validity */
-		bnx2x_init_shmem(bp);
+		if (bnx2x_init_shmem(bp)) {
+			rtnl_unlock();
+			return PCI_ERS_RESULT_DISCONNECT;
+		}
 
 		if (IS_PF(bp) && SHMEM2_HAS(bp, drv_capabilities_flag)) {
 			u32 v;

From 76dc6c097d581ad8eeedf8e1a000423a3d742445 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Tue, 26 Dec 2017 15:08:53 +0100
Subject: [PATCH 492/808] cpu/hotplug: Move inline keyword at the beginning of
 declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix non-fatal warnings such as:

kernel/cpu.c:95:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration]
 static void inline cpuhp_lock_release(bool bringup) { }
 ^~~~~~

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20171226140855.16583-1-malat@debian.org
---
 kernel/cpu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..3d002a6f216e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
 	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
 
 
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
 {
 	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
 
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
 {
 	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
 #else
 
-static void inline cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
 
 #endif
 

From 8cb38a602478e9f806571f6920b0a3298aabf042 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Fri, 22 Dec 2017 10:15:20 -0800
Subject: [PATCH 493/808] sctp: Replace use of sockets_allocated with specified
 macro.

The patch(180d8cd942ce) replaces all uses of struct sock fields'
memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem
to accessor macros. But the sockets_allocated field of sctp sock is
not replaced at all. Then replace it now for unifying the code.

Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.")
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3253f724a995..b4fb6e4886d2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4498,7 +4498,7 @@ static int sctp_init_sock(struct sock *sk)
 	SCTP_DBG_OBJCNT_INC(sock);
 
 	local_bh_disable();
-	percpu_counter_inc(&sctp_sockets_allocated);
+	sk_sockets_allocated_inc(sk);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
 
 	/* Nothing can fail after this block, otherwise
@@ -4542,7 +4542,7 @@ static void sctp_destroy_sock(struct sock *sk)
 	}
 	sctp_endpoint_free(sp->ep);
 	local_bh_disable();
-	percpu_counter_dec(&sctp_sockets_allocated);
+	sk_sockets_allocated_dec(sk);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 }

From 45d8b80c2ac5d21cd1e2954431fb676bc2b1e099 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Dec 2017 20:32:35 -0500
Subject: [PATCH 494/808] ring-buffer: Mask out the info bits when returning
 buffer page length

Two info bits were added to the "commit" part of the ring buffer data page
when returned to be consumed. This was to inform the user space readers that
events have been missed, and that the count may be stored at the end of the
page.

What wasn't handled, was the splice code that actually called a function to
return the length of the data in order to zero out the rest of the page
before sending it up to user space. These data bits were returned with the
length making the value negative, and that negative value was not checked.
It was compared to PAGE_SIZE, and only used if the size was less than
PAGE_SIZE. Luckily PAGE_SIZE is unsigned long which made the compare an
unsigned compare, meaning the negative size value did not end up causing a
large portion of memory to be randomly zeroed out.

Cc: stable@vger.kernel.org
Fixes: 66a8cb95ed040 ("ring-buffer: Add place holder recording of dropped events")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c87766c1c204..e06cde093f76 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 /* Missed count stored at end */
 #define RB_MISSED_STORED	(1 << 30)
 
+#define RB_MISSED_FLAGS		(RB_MISSED_EVENTS|RB_MISSED_STORED)
+
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
 	local_t		 commit;	/* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
  */
 size_t ring_buffer_page_len(void *page)
 {
-	return local_read(&((struct buffer_data_page *)page)->commit)
+	struct buffer_data_page *bpage = page;
+
+	return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
 		+ BUF_PAGE_HDR_SIZE;
 }
 

From 6b7e633fe9c24682df550e5311f47fb524701586 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Dec 2017 20:38:57 -0500
Subject: [PATCH 495/808] tracing: Remove extra zeroing out of the ring buffer
 page

The ring_buffer_read_page() takes care of zeroing out any extra data in the
page that it returns. There's no need to zero it out again from the
consumer. It was removed from one consumer of this function, but
read_buffers_splice_read() did not remove it, and worse, it contained a
nasty bug because of it.

Cc: stable@vger.kernel.org
Fixes: 2711ca237a084 ("ring-buffer: Move zeroing out excess in page to ring buffer code")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 59518b8126d0..73652d5318b2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6769,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
-	int entries, size, i;
+	int entries, i;
 	ssize_t ret = 0;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -6823,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 		}
 
-		/*
-		 * zero out any left over data, this is going to
-		 * user land.
-		 */
-		size = ring_buffer_page_len(ref->page);
-		if (size < PAGE_SIZE)
-			memset(ref->page + size, 0, PAGE_SIZE - size);
-
 		page = virt_to_page(ref->page);
 
 		spd.pages[i] = page;

From ae415fa4c5248a8cf4faabd5a3c20576cb1ad607 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Dec 2017 21:19:29 -0500
Subject: [PATCH 496/808] ring-buffer: Do no reuse reader page if still in use

To free the reader page that is allocated with ring_buffer_alloc_read_page(),
ring_buffer_free_read_page() must be called. For faster performance, this
page can be reused by the ring buffer to avoid having to free and allocate
new pages.

The issue arises when the page is used with a splice pipe into the
networking code. The networking code may up the page counter for the page,
and keep it active while sending it is queued to go to the network. The
incrementing of the page ref does not prevent it from being reused in the
ring buffer, and this can cause the page that is being sent out to the
network to be modified before it is sent by reading new data.

Add a check to the page ref counter, and only reuse the page if it is not
being used anywhere else.

Cc: stable@vger.kernel.org
Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e06cde093f76..9ab18995ff1e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4404,8 +4404,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct buffer_data_page *bpage = data;
+	struct page *page = virt_to_page(bpage);
 	unsigned long flags;
 
+	/* If the page is still in use someplace else, we can't reuse it */
+	if (page_ref_count(page) > 1)
+		goto out;
+
 	local_irq_save(flags);
 	arch_spin_lock(&cpu_buffer->lock);
 
@@ -4417,6 +4422,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
+ out:
 	free_page((unsigned long)bpage);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);

From 24f2aaf952ee0b59f31c3a18b8b36c9e3d3c2cf5 Mon Sep 17 00:00:00 2001
From: Jing Xia <jing.xia@spreadtrum.com>
Date: Tue, 26 Dec 2017 15:12:53 +0800
Subject: [PATCH 497/808] tracing: Fix crash when it fails to alloc ring buffer

Double free of the ring buffer happens when it fails to alloc new
ring buffer instance for max_buffer if TRACER_MAX_TRACE is configured.
The root cause is that the pointer is not set to NULL after the buffer
is freed in allocate_trace_buffers(), and the freeing of the ring
buffer is invoked again later if the pointer is not equal to Null,
as:

instance_mkdir()
    |-allocate_trace_buffers()
        |-allocate_trace_buffer(tr, &tr->trace_buffer...)
	|-allocate_trace_buffer(tr, &tr->max_buffer...)

          // allocate fail(-ENOMEM),first free
          // and the buffer pointer is not set to null
        |-ring_buffer_free(tr->trace_buffer.buffer)

       // out_free_tr
    |-free_trace_buffers()
        |-free_trace_buffer(&tr->trace_buffer);

	      //if trace_buffer is not null, free again
	    |-ring_buffer_free(buf->buffer)
                |-rb_free_cpu_buffer(buffer->buffers[cpu])
                    // ring_buffer_per_cpu is null, and
                    // crash in ring_buffer_per_cpu->pages

Link: http://lkml.kernel.org/r/20171226071253.8968-1-chunyan.zhang@spreadtrum.com

Cc: stable@vger.kernel.org
Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code")
Signed-off-by: Jing Xia <jing.xia@spreadtrum.com>
Signed-off-by: Chunyan Zhang <chunyan.zhang@spreadtrum.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73652d5318b2..0e53d46544b8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7603,7 +7603,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
 				    allocate_snapshot ? size : 1);
 	if (WARN_ON(ret)) {
 		ring_buffer_free(tr->trace_buffer.buffer);
+		tr->trace_buffer.buffer = NULL;
 		free_percpu(tr->trace_buffer.data);
+		tr->trace_buffer.data = NULL;
 		return -ENOMEM;
 	}
 	tr->allocated_snapshot = allocate_snapshot;

From 4397f04575c44e1440ec2e49b6302785c95fd2f8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 26 Dec 2017 20:07:34 -0500
Subject: [PATCH 498/808] tracing: Fix possible double free on failure of
 allocating trace buffer

Jing Xia and Chunyan Zhang reported that on failing to allocate part of the
tracing buffer, memory is freed, but the pointers that point to them are not
initialized back to NULL, and later paths may try to free the freed memory
again. Jing and Chunyan fixed one of the locations that does this, but
missed a spot.

Link: http://lkml.kernel.org/r/20171226071253.8968-1-chunyan.zhang@spreadtrum.com

Cc: stable@vger.kernel.org
Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code")
Reported-by: Jing Xia <jing.xia@spreadtrum.com>
Reported-by: Chunyan Zhang <chunyan.zhang@spreadtrum.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e53d46544b8..2a8d8a294345 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7580,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
 	buf->data = alloc_percpu(struct trace_array_cpu);
 	if (!buf->data) {
 		ring_buffer_free(buf->buffer);
+		buf->buffer = NULL;
 		return -ENOMEM;
 	}
 

From 7ad1437d6ace0e450a6c1167720608ad660b191d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Dec 2017 19:45:31 +0100
Subject: [PATCH 499/808] perf/x86/intel: Plug memory leak in intel_pmu_init()

A recent commit introduced an extra merge_attr() call in the skylake
branch, which causes a memory leak.

Store the pointer to the extra allocated memory and free it at the end of
the function.

Fixes: a5df70c354c2 ("perf/x86: Only show format attributes when supported")
Reported-by: Tommi Rantala <tommi.t.rantala@nokia.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/intel/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 09c26a4f139c..731153a4681e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {
 
 __init int intel_pmu_init(void)
 {
+	struct attribute **extra_attr = NULL;
+	struct attribute **to_free = NULL;
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
 	union cpuid10_ebx ebx;
@@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)
 	unsigned int unused;
 	struct extra_reg *er;
 	int version, i;
-	struct attribute **extra_attr = NULL;
 	char *name;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)
 		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
 			hsw_format_attr : nhm_format_attr;
 		extra_attr = merge_attr(extra_attr, skl_format_attr);
+		to_free = extra_attr;
 		x86_pmu.cpu_events = get_hsw_events_attrs();
 		intel_pmu_pebs_data_source_skl(
 			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
@@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)
 		pr_cont("full-width counters, ");
 	}
 
+	kfree(to_free);
 	return 0;
 }
 

From 7ac139eaa6bbdb07c547b6916a808eab3897e0e3 Mon Sep 17 00:00:00 2001
From: rodrigosiqueira <rodrigosiqueiramelo@gmail.com>
Date: Fri, 15 Dec 2017 11:15:33 -0200
Subject: [PATCH 500/808] x86: Remove unused parameter of prepare_switch_to

Commit e37e43a497d5 ("x86/mm/64: Enable vmapped stacks
(CONFIG_HAVE_ARCH_VMAP_STACK=y)") added prepare_switch_to with one extra
parameter which is not used by the function, remove it.

Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Link: https://lkml.kernel.org/r/20171215131533.hp6kqebw45o7uvsb@smtp.gmail.com
---
 arch/x86/include/asm/switch_to.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..1008d4622709 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss);
 
 /* This runs runs on the previous thread's stack. */
-static inline void prepare_switch_to(struct task_struct *prev,
-				     struct task_struct *next)
+static inline void prepare_switch_to(struct task_struct *next)
 {
 #ifdef CONFIG_VMAP_STACK
 	/*
@@ -70,7 +69,7 @@ struct fork_frame {
 
 #define switch_to(prev, next, last)					\
 do {									\
-	prepare_switch_to(prev, next);					\
+	prepare_switch_to(next);					\
 									\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)

From 2b83ff96f51d0b039c4561b9f95c824d7bddb85c Mon Sep 17 00:00:00 2001
From: Matthieu CASTET <matthieu.castet@parrot.com>
Date: Tue, 12 Dec 2017 11:10:44 +0100
Subject: [PATCH 501/808] led: core: Fix brightness setting when setting
 delay_off=0

With the current code, the following sequence won't work :
echo timer > trigger

echo 0 >  delay_off
* at this point we call
** led_delay_off_store
** led_blink_set
*** stop timer
** led_blink_setup
** led_set_software_blink
*** if !delay_on, led off
*** if !delay_off, set led_set_brightness_nosleep <--- LED_BLINK_SW is set but timer is stop
*** otherwise start timer/set LED_BLINK_SW flag

echo xxx > brightness
* led_set_brightness
** if LED_BLINK_SW
*** if brightness=0, led off
*** else apply brightness if next timer <--- timer is stop, and will never apply new setting
** otherwise set led_set_brightness_nosleep

To fix that, when we delete the timer, we should clear LED_BLINK_SW.

Cc: linux-leds@vger.kernel.org
Signed-off-by: Matthieu CASTET <matthieu.castet@parrot.com>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index fd83c7f77a95..f3654fd2eaf3 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -186,7 +186,7 @@ void led_blink_set(struct led_classdev *led_cdev,
 		   unsigned long *delay_on,
 		   unsigned long *delay_off)
 {
-	del_timer_sync(&led_cdev->blink_timer);
+	led_stop_software_blink(led_cdev);
 
 	clear_bit(LED_BLINK_ONESHOT, &led_cdev->work_flags);
 	clear_bit(LED_BLINK_ONESHOT_STOP, &led_cdev->work_flags);

From ac461122c88a10b7d775de2f56467f097c9e627a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 27 Dec 2017 11:48:50 -0800
Subject: [PATCH 502/808] x86-32: Fix kexec with stack canary
 (CONFIG_CC_STACKPROTECTOR)

Commit e802a51ede91 ("x86/idt: Consolidate IDT invalidation") cleaned up
and unified the IDT invalidation that existed in a couple of places.  It
changed no actual real code.

Despite not changing any actual real code, it _did_ change code generation:
by implementing the common idt_invalidate() function in
archx86/kernel/idt.c, it made the use of the function in
arch/x86/kernel/machine_kexec_32.c be a real function call rather than an
(accidental) inlining of the function.

That, in turn, exposed two issues:

 - in load_segments(), we had incorrectly reset all the segment
   registers, which then made the stack canary load (which gcc does
   using offset of %gs) cause a trap.  Instead of %gs pointing to the
   stack canary, it will be the normal zero-based kernel segment, and
   the stack canary load will take a page fault at address 0x14.

 - to make this even harder to debug, we had invalidated the GDT just
   before calling idt_invalidate(), which meant that the fault happened
   with an invalid GDT, which in turn causes a triple fault and
   immediate reboot.

Fix this by

 (a) not reloading the special segments in load_segments(). We currently
     don't do any percpu accesses (which would require %fs on x86-32) in
     this area, but there's no reason to think that we might not want to
     do them, and like %gs, it's pointless to break it.

 (b) doing idt_invalidate() before invalidating the GDT, to keep things
     at least _slightly_ more debuggable for a bit longer. Without a
     IDT, traps will not work. Without a GDT, traps also will not work,
     but neither will any segment loads etc. So in a very real sense,
     the GDT is even more core than the IDT.

Fixes: e802a51ede91 ("x86/idt: Consolidate IDT invalidation")
Reported-and-tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.LFD.2.21.1712271143180.8572@i7.lan
---
 arch/x86/kernel/machine_kexec_32.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 00bc751c861c..edfede768688 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -48,8 +48,6 @@ static void load_segments(void)
 		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
 		"\tmovl %%eax,%%ds\n"
 		"\tmovl %%eax,%%es\n"
-		"\tmovl %%eax,%%fs\n"
-		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
 		: : : "eax", "memory");
 #undef STR
@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
 	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0), 0);
 	idt_invalidate(phys_to_virt(0));
+	set_gdt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,

From ad9a3668a434faca1339789ed2f043d679199309 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Sun, 24 Dec 2017 13:54:56 +0200
Subject: [PATCH 503/808] IB/mlx5: Serialize access to the VMA list

User-space applications can do mmap and munmap directly at
any time.

Since the VMA list is not protected with a mutex, concurrent
accesses to the VMA list from the mmap and munmap can cause
data corruption. Add a mutex around the list.

Cc: <stable@vger.kernel.org> # v4.7
Fixes: 7c2344c3bbf9 ("IB/mlx5: Implements disassociate_ucontext API")
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c    | 8 ++++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b4ef4d9b6ce5..8ac50de2b242 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1463,6 +1463,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	}
 
 	INIT_LIST_HEAD(&context->vma_private_list);
+	mutex_init(&context->vma_private_list_mutex);
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
@@ -1624,7 +1625,9 @@ static void  mlx5_ib_vma_close(struct vm_area_struct *area)
 	 * mlx5_ib_disassociate_ucontext().
 	 */
 	mlx5_ib_vma_priv_data->vma = NULL;
+	mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
 	list_del(&mlx5_ib_vma_priv_data->list);
+	mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
 	kfree(mlx5_ib_vma_priv_data);
 }
 
@@ -1644,10 +1647,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
 		return -ENOMEM;
 
 	vma_prv->vma = vma;
+	vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
 	vma->vm_private_data = vma_prv;
 	vma->vm_ops =  &mlx5_ib_vm_ops;
 
+	mutex_lock(&ctx->vma_private_list_mutex);
 	list_add(&vma_prv->list, vma_head);
+	mutex_unlock(&ctx->vma_private_list_mutex);
 
 	return 0;
 }
@@ -1690,6 +1696,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 	 * mlx5_ib_vma_close.
 	 */
 	down_write(&owning_mm->mmap_sem);
+	mutex_lock(&context->vma_private_list_mutex);
 	list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
 				 list) {
 		vma = vma_private->vma;
@@ -1704,6 +1711,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 		list_del(&vma_private->list);
 		kfree(vma_private);
 	}
+	mutex_unlock(&context->vma_private_list_mutex);
 	up_write(&owning_mm->mmap_sem);
 	mmput(owning_mm);
 	put_task_struct(owning_process);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6dd8cac78de2..2c5f3533bbc9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -115,6 +115,8 @@ enum {
 struct mlx5_ib_vma_private_data {
 	struct list_head list;
 	struct vm_area_struct *vma;
+	/* protect vma_private_list add/del */
+	struct mutex *vma_private_list_mutex;
 };
 
 struct mlx5_ib_ucontext {
@@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
 	/* Transport Domain number */
 	u32			tdn;
 	struct list_head	vma_private_list;
+	/* protect vma_private_list add/del */
+	struct mutex		vma_private_list_mutex;
 
 	unsigned long		upd_xlt_page;
 	/* protect ODP/KSM */

From 05d14e7b0c138cb07ba30e464f47b39434f3fdef Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Sun, 24 Dec 2017 13:54:57 +0200
Subject: [PATCH 504/808] IB/uverbs: Fix command checking as part of
 ib_uverbs_ex_modify_qp()

If the input command length is larger than the kernel supports an error should
be returned in case the unsupported bytes are not cleared, instead of the
other way aroudn. This matches what all other callers of ib_is_udata_cleared
do and will avoid user ABI problems in the future.

Cc: <stable@vger.kernel.org> # v4.10
Fixes: 189aba99e700 ("IB/uverbs: Extend modify_qp and support packet pacing")
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/uverbs_cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index d0202bb176a4..840b24096690 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2074,8 +2074,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
 		return -EOPNOTSUPP;
 
 	if (ucore->inlen > sizeof(cmd)) {
-		if (ib_is_udata_cleared(ucore, sizeof(cmd),
-					ucore->inlen - sizeof(cmd)))
+		if (!ib_is_udata_cleared(ucore, sizeof(cmd),
+					 ucore->inlen - sizeof(cmd)))
 			return -EOPNOTSUPP;
 	}
 

From 4a50881bbac309e6f0684816a180bc3c14e1485d Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Sun, 24 Dec 2017 13:54:58 +0200
Subject: [PATCH 505/808] IB/core: Verify that QP is security enabled in create
 and destroy

The XRC target QP create flow sets up qp_sec only if there is an IB link with
LSM security enabled. However, several other related uAPI entry points blindly
follow the qp_sec NULL pointer, resulting in a possible oops.

Check for NULL before using qp_sec.

Cc: <stable@vger.kernel.org> # v4.12
Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs")
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/security.c | 3 +++
 drivers/infiniband/core/verbs.c    | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index feafdb961c48..59b2f96d986a 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
 	if (ret)
 		return ret;
 
+	if (!qp->qp_sec)
+		return 0;
+
 	mutex_lock(&real_qp->qp_sec->mutex);
 	ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
 					  qp->qp_sec);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3fb8fb6cc824..e36d27ed4daa 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1438,7 +1438,8 @@ int ib_close_qp(struct ib_qp *qp)
 	spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
 
 	atomic_dec(&real_qp->usecnt);
-	ib_close_shared_qp_security(qp->qp_sec);
+	if (qp->qp_sec)
+		ib_close_shared_qp_security(qp->qp_sec);
 	kfree(qp);
 
 	return 0;

From 45e6ae7ef21b907dacb18da62d5787d74a31d860 Mon Sep 17 00:00:00 2001
From: Nitzan Carmi <nitzanc@mellanox.com>
Date: Tue, 26 Dec 2017 11:20:20 +0200
Subject: [PATCH 506/808] IB/mlx5: Fix mlx5_ib_alloc_mr error flow

ibmr.device is being set only after ib_alloc_mr() is
(successfully) complete. Therefore, in case mlx5_core_create_mkey()
return with error, the error flow calls mlx5_free_priv_descs()
which uses ibmr.device (which doesn't exist yet), causing
a NULL dereference oops.

To fix this, the IB device should be set in the mr struct earlier
stage (e.g. prior to calling mlx5_core_create_mkey()).

Fixes: 8a187ee52b04 ("IB/mlx5: Support the new memory registration API")
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index ee0ee1f9994b..d109fe8290a7 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1637,6 +1637,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
 	MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
 	MLX5_SET(mkc, mkc, umr_en, 1);
 
+	mr->ibmr.device = pd->device;
 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
 	if (err)
 		goto err_destroy_psv;

From 59585b4be9ae4dc6506551709bdcd6f5210b8a01 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Mon, 25 Dec 2017 03:43:53 +0100
Subject: [PATCH 507/808] sparc64: repair calling incorrect hweight function
 from stubs

Commit v4.12-rc4-1-g9289ea7f952b introduced a mistake that made the
64-bit hweight stub call the 16-bit hweight function.

Fixes: 9289ea7f952b ("sparc64: Use indirect calls in hamming weight stubs")
Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/hweight.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
index e5547b22cd18..0ddbbb031822 100644
--- a/arch/sparc/lib/hweight.S
+++ b/arch/sparc/lib/hweight.S
@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
 	.previous
 
 ENTRY(__arch_hweight64)
-	sethi	%hi(__sw_hweight16), %g1
-	jmpl	%g1 + %lo(__sw_hweight16), %g0
+	sethi	%hi(__sw_hweight64), %g1
+	jmpl	%g1 + %lo(__sw_hweight64), %g0
 	 nop
 ENDPROC(__arch_hweight64)
 EXPORT_SYMBOL(__arch_hweight64)

From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 2 Dec 2017 18:11:04 +0100
Subject: [PATCH 508/808] kernel/irq: Extend lockdep class for request mutex

The IRQ code already has support for lockdep class for the lock mutex
in an interrupt descriptor. Extend this to add a second class for the
request mutex in the descriptor. Not having a class is resulting in
false positive splats in some code paths.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: linus.walleij@linaro.org
Cc: grygorii.strashko@ti.com
Cc: f.fainelli@gmail.com
Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch
---
 arch/powerpc/sysdev/fsl_msi.c             |  4 ++-
 drivers/gpio/gpio-bcm-kona.c              |  3 ++-
 drivers/gpio/gpio-brcmstb.c               |  4 ++-
 drivers/gpio/gpio-tegra.c                 |  4 ++-
 drivers/gpio/gpiolib.c                    | 27 ++++++++++++-------
 drivers/irqchip/irq-renesas-intc-irqpin.c |  6 ++++-
 drivers/mfd/arizona-irq.c                 |  4 ++-
 drivers/pinctrl/pinctrl-single.c          |  5 +++-
 include/linux/gpio/driver.h               | 33 ++++++++++++++---------
 include/linux/irqdesc.h                   |  9 ++++---
 kernel/irq/generic-chip.c                 | 11 +++++---
 11 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 44cbf4c12ea1..df95102e732c 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 }
 
 static struct lock_class_key fsl_msi_irq_class;
+static struct lock_class_key fsl_msi_irq_request_class;
 
 static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 			       int offset, int irq_index)
@@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 		dev_err(&dev->dev, "No memory for MSI cascade data\n");
 		return -ENOMEM;
 	}
-	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class);
+	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class,
+			      &fsl_msi_irq_request_class);
 	cascade_data->index = offset;
 	cascade_data->msi_data = msi;
 	cascade_data->virq = virt_msir;
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index dfcf56ee3c61..76861a00bb92 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -522,6 +522,7 @@ static struct of_device_id const bcm_kona_gpio_of_match[] = {
  * category than their parents, so it won't report false recursion.
  */
 static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
 
 static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 				 irq_hw_number_t hwirq)
@@ -531,7 +532,7 @@ static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	ret = irq_set_chip_data(irq, d->host_data);
 	if (ret < 0)
 		return ret;
-	irq_set_lockdep_class(irq, &gpio_lock_class);
+	irq_set_lockdep_class(irq, &gpio_lock_class, &gpio_request_class);
 	irq_set_chip_and_handler(irq, &bcm_gpio_irq_chip, handle_simple_irq);
 	irq_set_noprobe(irq);
 
diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 545d43a587b7..5b24801bffef 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -327,6 +327,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank(
  * category than their parents, so it won't report false recursion.
  */
 static struct lock_class_key brcmstb_gpio_irq_lock_class;
+static struct lock_class_key brcmstb_gpio_irq_request_class;
 
 
 static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
@@ -346,7 +347,8 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	ret = irq_set_chip_data(irq, &bank->gc);
 	if (ret < 0)
 		return ret;
-	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class);
+	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class,
+			      &brcmstb_gpio_irq_lock_class);
 	irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq);
 	irq_set_noprobe(irq);
 	return 0;
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 8db47f671708..02fa8fe2292a 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -565,6 +565,7 @@ static const struct dev_pm_ops tegra_gpio_pm_ops = {
  * than their parents, so it won't report false recursion.
  */
 static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
 
 static int tegra_gpio_probe(struct platform_device *pdev)
 {
@@ -670,7 +671,8 @@ static int tegra_gpio_probe(struct platform_device *pdev)
 
 		bank = &tgi->bank_info[GPIO_BANK(gpio)];
 
-		irq_set_lockdep_class(irq, &gpio_lock_class);
+		irq_set_lockdep_class(irq, &gpio_lock_class,
+				      &gpio_request_class);
 		irq_set_chip_data(irq, bank);
 		irq_set_chip_and_handler(irq, &tgi->ic, handle_simple_irq);
 	}
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index aad84a6306c4..44332b793718 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -73,7 +73,8 @@ LIST_HEAD(gpio_devices);
 
 static void gpiochip_free_hogs(struct gpio_chip *chip);
 static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				struct lock_class_key *key);
+				struct lock_class_key *lock_key,
+				struct lock_class_key *request_key);
 static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip);
 static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip);
 static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip);
@@ -1100,7 +1101,8 @@ static void gpiochip_setup_devs(void)
 }
 
 int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
-			       struct lock_class_key *key)
+			       struct lock_class_key *lock_key,
+			       struct lock_class_key *request_key)
 {
 	unsigned long	flags;
 	int		status = 0;
@@ -1246,7 +1248,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
 	if (status)
 		goto err_remove_from_list;
 
-	status = gpiochip_add_irqchip(chip, key);
+	status = gpiochip_add_irqchip(chip, lock_key, request_key);
 	if (status)
 		goto err_remove_chip;
 
@@ -1632,7 +1634,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq,
 	 * This lock class tells lockdep that GPIO irqs are in a different
 	 * category than their parents, so it won't report false recursion.
 	 */
-	irq_set_lockdep_class(irq, chip->irq.lock_key);
+	irq_set_lockdep_class(irq, chip->irq.lock_key, chip->irq.request_key);
 	irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler);
 	/* Chips that use nested thread handlers have them marked */
 	if (chip->irq.threaded)
@@ -1712,10 +1714,12 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset)
 /**
  * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip
  * @gpiochip: the GPIO chip to add the IRQ chip to
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
  */
 static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				struct lock_class_key *lock_key)
+				struct lock_class_key *lock_key,
+				struct lock_class_key *request_key)
 {
 	struct irq_chip *irqchip = gpiochip->irq.chip;
 	const struct irq_domain_ops *ops;
@@ -1753,6 +1757,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
 	gpiochip->to_irq = gpiochip_to_irq;
 	gpiochip->irq.default_type = type;
 	gpiochip->irq.lock_key = lock_key;
+	gpiochip->irq.request_key = request_key;
 
 	if (gpiochip->irq.domain_ops)
 		ops = gpiochip->irq.domain_ops;
@@ -1850,7 +1855,8 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
  * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE
  * to have the core avoid setting up any default type in the hardware.
  * @threaded: whether this irqchip uses a nested thread handler
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
  *
  * This function closely associates a certain irqchip with a certain
  * gpiochip, providing an irq domain to translate the local IRQs to
@@ -1872,7 +1878,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 			     irq_flow_handler_t handler,
 			     unsigned int type,
 			     bool threaded,
-			     struct lock_class_key *lock_key)
+			     struct lock_class_key *lock_key,
+			     struct lock_class_key *request_key)
 {
 	struct device_node *of_node;
 
@@ -1913,6 +1920,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 	gpiochip->irq.default_type = type;
 	gpiochip->to_irq = gpiochip_to_irq;
 	gpiochip->irq.lock_key = lock_key;
+	gpiochip->irq.request_key = request_key;
 	gpiochip->irq.domain = irq_domain_add_simple(of_node,
 					gpiochip->ngpio, first_irq,
 					&gpiochip_domain_ops, gpiochip);
@@ -1940,7 +1948,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key);
 #else /* CONFIG_GPIOLIB_IRQCHIP */
 
 static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
-				       struct lock_class_key *key)
+				       struct lock_class_key *lock_key,
+				       struct lock_class_key *request_key)
 {
 	return 0;
 }
diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c
index 06f29cf5018a..cee59fe1321c 100644
--- a/drivers/irqchip/irq-renesas-intc-irqpin.c
+++ b/drivers/irqchip/irq-renesas-intc-irqpin.c
@@ -342,6 +342,9 @@ static irqreturn_t intc_irqpin_shared_irq_handler(int irq, void *dev_id)
  */
 static struct lock_class_key intc_irqpin_irq_lock_class;
 
+/* And this is for the request mutex */
+static struct lock_class_key intc_irqpin_irq_request_class;
+
 static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq,
 				      irq_hw_number_t hw)
 {
@@ -352,7 +355,8 @@ static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq,
 
 	intc_irqpin_dbg(&p->irq[hw], "map");
 	irq_set_chip_data(virq, h->host_data);
-	irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class);
+	irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class,
+			      &intc_irqpin_irq_request_class);
 	irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq);
 	return 0;
 }
diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c
index 09cf3699e354..a307832d7e45 100644
--- a/drivers/mfd/arizona-irq.c
+++ b/drivers/mfd/arizona-irq.c
@@ -184,6 +184,7 @@ static struct irq_chip arizona_irq_chip = {
 };
 
 static struct lock_class_key arizona_irq_lock_class;
+static struct lock_class_key arizona_irq_request_class;
 
 static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
 			      irq_hw_number_t hw)
@@ -191,7 +192,8 @@ static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
 	struct arizona *data = h->host_data;
 
 	irq_set_chip_data(virq, data);
-	irq_set_lockdep_class(virq, &arizona_irq_lock_class);
+	irq_set_lockdep_class(virq, &arizona_irq_lock_class,
+		&arizona_irq_request_class);
 	irq_set_chip_and_handler(virq, &arizona_irq_chip, handle_simple_irq);
 	irq_set_nested_thread(virq, 1);
 	irq_set_noprobe(virq);
diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c
index e6cd8de793e2..3501491e5bfc 100644
--- a/drivers/pinctrl/pinctrl-single.c
+++ b/drivers/pinctrl/pinctrl-single.c
@@ -222,6 +222,9 @@ static enum pin_config_param pcs_bias[] = {
  */
 static struct lock_class_key pcs_lock_class;
 
+/* Class for the IRQ request mutex */
+static struct lock_class_key pcs_request_class;
+
 /*
  * REVISIT: Reads and writes could eventually use regmap or something
  * generic. But at least on omaps, some mux registers are performance
@@ -1486,7 +1489,7 @@ static int pcs_irqdomain_map(struct irq_domain *d, unsigned int irq,
 	irq_set_chip_data(irq, pcs_soc);
 	irq_set_chip_and_handler(irq, &pcs->chip,
 				 handle_level_irq);
-	irq_set_lockdep_class(irq, &pcs_lock_class);
+	irq_set_lockdep_class(irq, &pcs_lock_class, &pcs_request_class);
 	irq_set_noprobe(irq);
 
 	return 0;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 55e672592fa9..7258cd676df4 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -66,9 +66,10 @@ struct gpio_irq_chip {
 	/**
 	 * @lock_key:
 	 *
-	 * Per GPIO IRQ chip lockdep class.
+	 * Per GPIO IRQ chip lockdep classes.
 	 */
 	struct lock_class_key *lock_key;
+	struct lock_class_key *request_key;
 
 	/**
 	 * @parent_handler:
@@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip,
 
 /* add/remove chips */
 extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
-				      struct lock_class_key *lock_key);
+				      struct lock_class_key *lock_key,
+				      struct lock_class_key *request_key);
 
 /**
  * gpiochip_add_data() - register a gpio_chip
@@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
  */
 #ifdef CONFIG_LOCKDEP
 #define gpiochip_add_data(chip, data) ({		\
-		static struct lock_class_key key;	\
-		gpiochip_add_data_with_key(chip, data, &key);	\
+		static struct lock_class_key lock_key;	\
+		static struct lock_class_key request_key;	  \
+		gpiochip_add_data_with_key(chip, data, &lock_key, \
+					   &request_key);	  \
 	})
 #else
-#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL)
+#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL)
 #endif
 
 static inline int gpiochip_add(struct gpio_chip *chip)
@@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
 			     irq_flow_handler_t handler,
 			     unsigned int type,
 			     bool threaded,
-			     struct lock_class_key *lock_key);
+			     struct lock_class_key *lock_key,
+			     struct lock_class_key *request_key);
 
 #ifdef CONFIG_LOCKDEP
 
@@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 				       irq_flow_handler_t handler,
 				       unsigned int type)
 {
-	static struct lock_class_key key;
+	static struct lock_class_key lock_key;
+	static struct lock_class_key request_key;
 
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, false, &key);
+					handler, type, false,
+					&lock_key, &request_key);
 }
 
 static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
@@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  unsigned int type)
 {
 
-	static struct lock_class_key key;
+	static struct lock_class_key lock_key;
+	static struct lock_class_key request_key;
 
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, true, &key);
+					handler, type, true,
+					&lock_key, &request_key);
 }
 #else
 static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
@@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
 				       unsigned int type)
 {
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, false, NULL);
+					handler, type, false, NULL, NULL);
 }
 
 static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
@@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  unsigned int type)
 {
 	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
-					handler, type, true, NULL);
+					handler, type, true, NULL, NULL);
 }
 #endif /* CONFIG_LOCKDEP */
 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 39fb3700f7a9..25b33b664537 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq)
 }
 
 static inline void
-irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class)
+irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+		      struct lock_class_key *request_class)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (desc)
-		lockdep_set_class(&desc->lock, class);
+	if (desc) {
+		lockdep_set_class(&desc->lock, lock_class);
+		lockdep_set_class(&desc->request_mutex, request_class);
+	}
 }
 
 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
 EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
 
 /*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
  */
 static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
 
 /*
  * irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 	set_bit(idx, &gc->installed);
 
 	if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
-		irq_set_lockdep_class(virq, &irq_nested_lock_class);
+		irq_set_lockdep_class(virq, &irq_nested_lock_class,
+				      &irq_nested_request_class);
 
 	if (chip->irq_calc_mask)
 		chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			continue;
 
 		if (flags & IRQ_GC_INIT_NESTED_LOCK)
-			irq_set_lockdep_class(i, &irq_nested_lock_class);
+			irq_set_lockdep_class(i, &irq_nested_lock_class,
+					      &irq_nested_request_class);
 
 		if (!(flags & IRQ_GC_NO_MASK)) {
 			struct irq_data *d = irq_get_irq_data(i);

From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 21 Dec 2017 02:22:45 +0100
Subject: [PATCH 509/808] cpufreq: schedutil: Use idle_calls counter of the
 remote CPU

Since the recent remote cpufreq callback work, its possible that a cpufreq
update is triggered from a remote CPU. For single policies however, the current
code uses the local CPU when trying to determine if the remote sg_cpu entered
idle or is busy. This is incorrect. To remedy this, compare with the nohz tick
idle_calls counter of the remote CPU.

Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks)
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Cc: 4.14+ <stable@vger.kernel.org> # 4.14+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/tick.h             |  1 +
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/time/tick-sched.c         | 13 +++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f442d1a42025..7cc35921218e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern unsigned long tick_nohz_get_idle_calls(void);
+extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 #ifdef CONFIG_NO_HZ_COMMON
 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 {
-	unsigned long idle_calls = tick_nohz_get_idle_calls();
+	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
 	bool ret = idle_calls == sg_cpu->saved_idle_calls;
 
 	sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..77555faf6fbc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -985,6 +985,19 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
+/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+	struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+	return ts->idle_calls;
+}
+
 /**
  * tick_nohz_get_idle_calls - return the current idle calls counter value
  *

From 11bca0a83f83f6093d816295668e74ef24595944 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sat, 2 Dec 2017 09:13:04 -0800
Subject: [PATCH 510/808] genirq: Guard handle_bad_irq log messages

An interrupt storm on a bad interrupt will cause the kernel
log to be clogged.

[   60.089234] ->handle_irq():  ffffffffbe2f803f,
[   60.090455] 0xffffffffbf2af380
[   60.090510] handle_bad_irq+0x0/0x2e5
[   60.090522] ->irq_data.chip(): ffffffffbf2af380,
[   60.090553]    IRQ_NOPROBE set
[   60.090584] ->handle_irq():  ffffffffbe2f803f,
[   60.090590] handle_bad_irq+0x0/0x2e5
[   60.090596] ->irq_data.chip(): ffffffffbf2af380,
[   60.090602] 0xffffffffbf2af380
[   60.090608] ->action():           (null)
[   60.090779] handle_bad_irq+0x0/0x2e5

This was seen when running an upstream kernel on Acer Chromebook R11.  The
system was unstable as result.

Guard the log message with __printk_ratelimit to reduce the impact.  This
won't prevent the interrupt storm from happening, but at least the system
remains stable.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dmitry Torokhov <dtor@chromium.org>
Cc: Joe Perches <joe@perches.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=197953
Link: https://lkml.kernel.org/r/1512234784-21038-1-git-send-email-linux@roeck-us.net
---
 kernel/irq/debug.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
 
 static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+	if (!__ratelimit(&ratelimit))
+		return;
+
 	printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
 		irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
 	printk("->handle_irq():  %p, ", desc->handle_irq);

From 4fcab6693445cfb84f2b65868c58043535090e52 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Mon, 4 Dec 2017 12:03:12 +0800
Subject: [PATCH 511/808] x86/apic: Avoid wrong warning when parsing 'apic=' in
 X86-32 case

There are two consumers of apic=:
  apic_set_verbosity() for setting the APIC debug level;
  parse_apic() for registering APIC driver by hand.

X86-32 supports both of them, but sometimes, kernel issues a weird warning.
eg: when kernel was booted up with 'apic=bigsmp' in command line,
early_param would warn like that:

...
[    0.000000] APIC Verbosity level bigsmp not recognised use apic=verbose or apic=debug
[    0.000000] Malformed early option 'apic'
...

Wrap the warning code in CONFIG_X86_64 case to avoid this.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: rdunlap@infradead.org
Cc: corbet@lwn.net
Link: https://lkml.kernel.org/r/20171204040313.24824-1-douly.fnst@cn.fujitsu.com
---
 arch/x86/kernel/apic/apic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6e272f3ea984..880441f24146 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg)
 		apic_verbosity = APIC_DEBUG;
 	else if (strcmp("verbose", arg) == 0)
 		apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
 	else {
 		pr_warning("APIC Verbosity level %s not recognised"
 			" use apic=verbose or apic=debug\n", arg);
 		return -EINVAL;
 	}
+#endif
 
 	return 0;
 }

From 64e05d118e357bb52a084b609436acf292ce7944 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Mon, 4 Dec 2017 12:03:13 +0800
Subject: [PATCH 512/808] x86/apic: Update the 'apic=' description of setting
 APIC driver

There are two consumers of apic=: the APIC debug level and the low
level generic architecture code, but Linux just documented the first
one.

Append the second description.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: rdunlap@infradead.org
Cc: corbet@lwn.net
Link: https://lkml.kernel.org/r/20171204040313.24824-2-douly.fnst@cn.fujitsu.com
---
 Documentation/admin-guide/kernel-parameters.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b74e13312fdc..852fb11dd2c9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -328,11 +328,15 @@
 			not play well with APC CPU idle - disable it if you have
 			APC and your system crashes randomly.
 
-	apic=		[APIC,X86-32] Advanced Programmable Interrupt Controller
+	apic=		[APIC,X86] Advanced Programmable Interrupt Controller
 			Change the output verbosity whilst booting
 			Format: { quiet (default) | verbose | debug }
 			Change the amount of debugging information output
 			when initialising the APIC and IO-APIC components.
+			For X86-32, this can also be used to specify an APIC
+			driver name.
+			Format: apic=driver_name
+			Examples: apic=bigsmp
 
 	apic_extnmi=	[APIC,X86] External NMI delivery setting
 			Format: { bsp (default) | all | none }

From e7e83dd3ff1dd2f9e60213f6eedc7e5b08192062 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Tue, 26 Dec 2017 15:27:20 -0600
Subject: [PATCH 513/808] objtool: Fix Clang enum conversion warning

Fix the following Clang enum conversion warning:

  arch/x86/decode.c:141:20: error: implicit conversion from enumeration
  type 'enum op_src_type' to different enumeration
  type 'enum op_dest_type' [-Werror,-Wenum-conversion]

    op->dest.type = OP_SRC_REG;
		  ~ ^~~~~~~~~~

It just happened to work before because OP_SRC_REG and OP_DEST_REG have
the same value.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Nicholas Mc Guire <der.herr@hofr.at>
Reviewed-by: Nick Desaulniers <nick.desaulniers@gmail.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: baa41469a7b9 ("objtool: Implement stack validation 2.0")
Link: http://lkml.kernel.org/r/b4156c5738bae781c392e7a3691aed4514ebbdf2.1514323568.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/arch/x86/decode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 8acfc47af70e..540a209b78ab 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -138,7 +138,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			*type = INSN_STACK;
 			op->src.type = OP_SRC_ADD;
 			op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
-			op->dest.type = OP_SRC_REG;
+			op->dest.type = OP_DEST_REG;
 			op->dest.reg = CFI_SP;
 		}
 		break;

From 517d7c79bdb39864e617960504bdc1aa560c75c6 Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
Date: Thu, 28 Dec 2017 12:03:06 +0100
Subject: [PATCH 514/808] tipc: fix hanging poll() for stream sockets

In commit 42b531de17d2f6 ("tipc: Fix missing connection request
handling"), we replaced unconditional wakeup() with condtional
wakeup for clients with flags POLLIN | POLLRDNORM | POLLRDBAND.

This breaks the applications which do a connect followed by poll
with POLLOUT flag. These applications are not woken when the
connection is ESTABLISHED and hence sleep forever.

In this commit, we fix it by including the POLLOUT event for
sockets in TIPC_CONNECTING state.

Fixes: 42b531de17d2f6 ("tipc: Fix missing connection request handling")
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41127d0b925e..3b4084480377 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -727,11 +727,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
 
 	switch (sk->sk_state) {
 	case TIPC_ESTABLISHED:
+	case TIPC_CONNECTING:
 		if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
 			revents |= POLLOUT;
 		/* fall thru' */
 	case TIPC_LISTEN:
-	case TIPC_CONNECTING:
 		if (!skb_queue_empty(&sk->sk_receive_queue))
 			revents |= POLLIN | POLLRDNORM;
 		break;

From f72c4ac695573699dde5b71da1c3b9ef80440616 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 28 Dec 2017 12:38:13 -0500
Subject: [PATCH 515/808] skbuff: in skb_copy_ubufs unclone before releasing
 zerocopy

skb_copy_ubufs must unclone before it is safe to modify its
skb_shared_info with skb_zcopy_clear.

Commit b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even
without user frags") ensures that all skbs release their zerocopy
state, even those without frags.

But I forgot an edge case where such an skb arrives that is cloned.

The stack does not build such packets. Vhost/tun skbs have their
frags orphaned before cloning. TCP skbs only attach zerocopy state
when a frag is added.

But if TCP packets can be trimmed or linearized, this might occur.
Tracing the code I found no instance so far (e.g., skb_linearize
ends up calling skb_zcopy_clear if !skb->data_len).

Still, it is non-obvious that no path exists. And it is fragile to
rely on this.

Fixes: b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a3cb0be4c6f3..08f574081315 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1177,12 +1177,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	int i, new_frags;
 	u32 d_off;
 
-	if (!num_frags)
-		goto release;
-
 	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
 		return -EINVAL;
 
+	if (!num_frags)
+		goto release;
+
 	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	for (i = 0; i < new_frags; i++) {
 		page = alloc_page(gfp_mask);

From 602f7a2714a3b3aa4bec82ab0a86a9f5a2c4aa61 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Thu, 28 Dec 2017 11:00:43 -0800
Subject: [PATCH 516/808] sock: Add sock_owned_by_user_nocheck

This allows checking socket lock ownership with producing lockdep
warnings.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 9155da422692..7a7b14e9628a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1514,6 +1514,11 @@ static inline bool sock_owned_by_user(const struct sock *sk)
 	return sk->sk_lock.owned;
 }
 
+static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
+{
+	return sk->sk_lock.owned;
+}
+
 /* no reclassification while locks are held */
 static inline bool sock_allow_reclassification(const struct sock *csk)
 {

From d66fa9ec53c43bba9fa973c16419f6061b7cc3ea Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Thu, 28 Dec 2017 11:00:44 -0800
Subject: [PATCH 517/808] strparser: Call sock_owned_by_user_nocheck

strparser wants to check socket ownership without producing any
warnings. As indicated by the comment in the code, it is permissible
for owned_by_user to return true.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Reported-by: syzbot <syzkaller@googlegroups.com>
Reported-and-tested-by: <syzbot+c91c53af67f9ebe599a337d2e70950366153b295@syzkaller.appspotmail.com>
Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/strparser/strparser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index c5fda15ba319..1fdab5c4eda8 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp)
 	 * allows a thread in BH context to safely check if the process
 	 * lock is held. In this case, if the lock is held, queue work.
 	 */
-	if (sock_owned_by_user(strp->sk)) {
+	if (sock_owned_by_user_nocheck(strp->sk)) {
 		queue_work(strp_wq, &strp->work);
 		return;
 	}

From 955b1b5a00ba694159a7d3763412597f707c294d Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Wed, 20 Dec 2017 16:30:50 +0900
Subject: [PATCH 518/808] nvme-pci: move use_sgl initialization to
 nvme_init_iod()

A flag "use_sgl" of "struct nvme_iod" has been used in nvme_init_iod()
without being set to any value. It seems like "use_sgl" has been set
in either nvme_pci_setup_prps() or nvme_pci_setup_sgls() which occur
later than nvme_init_iod().

Make "iod->use_sgl" being set in a proper place, nvme_init_iod().
Also move nvme_pci_use_sgls() up above nvme_init_iod() to make it
possible to be called by nvme_init_iod().

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 42 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3c9082..d53550e612bc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -448,12 +448,31 @@ static void **nvme_pci_iod_list(struct request *req)
 	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	unsigned int avg_seg_size;
+
+	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
+			blk_rq_nr_phys_segments(req));
+
+	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+		return false;
+	if (!iod->nvmeq->qid)
+		return false;
+	if (!sgl_threshold || avg_seg_size < sgl_threshold)
+		return false;
+	return true;
+}
+
 static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
 	int nseg = blk_rq_nr_phys_segments(rq);
 	unsigned int size = blk_rq_payload_bytes(rq);
 
+	iod->use_sgl = nvme_pci_use_sgls(dev, rq);
+
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
 		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
 				iod->use_sgl);
@@ -604,8 +623,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 	dma_addr_t prp_dma;
 	int nprps, i;
 
-	iod->use_sgl = false;
-
 	length -= (page_size - offset);
 	if (length <= 0) {
 		iod->first_dma = 0;
@@ -715,8 +732,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 	int entries = iod->nents, i = 0;
 	dma_addr_t sgl_dma;
 
-	iod->use_sgl = true;
-
 	/* setting the transfer type as SGL */
 	cmd->flags = NVME_CMD_SGL_METABUF;
 
@@ -770,23 +785,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 	return BLK_STS_OK;
 }
 
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	unsigned int avg_seg_size;
-
-	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
-			blk_rq_nr_phys_segments(req));
-
-	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
-		return false;
-	if (!iod->nvmeq->qid)
-		return false;
-	if (!sgl_threshold || avg_seg_size < sgl_threshold)
-		return false;
-	return true;
-}
-
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct nvme_command *cmnd)
 {
@@ -806,7 +804,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 				DMA_ATTR_NO_WARN))
 		goto out;
 
-	if (nvme_pci_use_sgls(dev, req))
+	if (iod->use_sgl)
 		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
 	else
 		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);

From cee160fd34b459ace029653436319557a643795a Mon Sep 17 00:00:00 2001
From: Jeff Lien <jeff.lien@wdc.com>
Date: Tue, 19 Dec 2017 13:24:15 -0600
Subject: [PATCH 519/808] nvme: fix sector units when going between formats

If you format a device with a 4k sector size back to 512 bytes, the queue
limit values for physical block size and minimum IO size were not getting
updated; only the logical block size was being updated.  This patch adds
code to update the physical block and IO minimum sizes.

Signed-off-by: Jeff Lien <jeff.lien@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1e46e60b8f10..961d6a4af19c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1335,6 +1335,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
 		struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+	unsigned short bs = 1 << ns->lba_shift;
 	unsigned stream_alignment = 0;
 
 	if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
@@ -1343,7 +1344,10 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	blk_mq_freeze_queue(disk->queue);
 	blk_integrity_unregister(disk);
 
-	blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+	blk_queue_logical_block_size(disk->queue, bs);
+	blk_queue_physical_block_size(disk->queue, bs);
+	blk_queue_io_min(disk->queue, bs);
+
 	if (ns->ms && !ns->ext &&
 	    (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
 		nvme_init_integrity(disk, ns->ms, ns->pi_type);

From d5bf4b7f437c250821d40c3e32158729e6b484ce Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 21 Dec 2017 14:54:15 +0200
Subject: [PATCH 520/808] nvme-rdma: fix concurrent reset and reconnect

Now ctrl state machine allows to transition from RESETTING to
RECONNECTING.  In nvme-rdma when we receive a rdma cm DISONNECTED event,
we trigger nvme_rdma_error_recovery. This happens also when we execute a
controller reset, issue a cm diconnect request and receive a cm
disconnect reply, as a result, the reset work and the error recovery work
can run concurrently.

Until now the state machine prevented from the error recovery work from
running as a result of a controller reset (RESETTING -> RECONNECTING was
not allowed).

To fix this, we adopt the FC state machine approach, we always transition
from LIVE to RESETTING and only then to RECONNECTING.  We do this both
for the error recovery work and the controller reset work:

 1. transition to RESETTING
 2. teardown the controller association
 3. transition to RECONNECTING

This will restore the protection against reset work and error recovery work
from concurrently running together.

Fixes: 3cec7f9de448 ("nvme: allow controller RESETTING to RECONNECTING transition")
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 37af56596be6..2a0bba7f50cf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -974,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_start_queues(&ctrl->ctrl);
 
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+		/* state change failure should never happen */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	nvme_rdma_reconnect_or_remove(ctrl);
 }
 
 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
 {
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
 		return;
 
 	queue_work(nvme_wq, &ctrl->err_work);
@@ -1753,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	nvme_stop_ctrl(&ctrl->ctrl);
 	nvme_rdma_shutdown_ctrl(ctrl, false);
 
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+		/* state change failure should never happen */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	ret = nvme_rdma_configure_admin_queue(ctrl, false);
 	if (ret)
 		goto out_fail;

From 479a322fb729d657d34706ccf8dd12916f36628f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 21 Dec 2017 15:07:27 +0200
Subject: [PATCH 521/808] nvme-mpath: fix last path removal during traffic

In case our last path is removed during traffic, we can end up requeueing
the bio(s) but never schedule the actual requeue work as upper layers
still have open handles on the mpath device node.

Fix this by scheduling requeue work if the namespace being removed is
the last path in the ns_head path list.

Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme subsystems")
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c |  1 +
 drivers/nvme/host/nvme.h | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 961d6a4af19c..839650e0926a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2991,6 +2991,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	mutex_unlock(&ns->ctrl->namespaces_mutex);
 
 	synchronize_srcu(&ns->head->srcu);
+	nvme_mpath_check_last_path(ns);
 	nvme_put_ns(ns);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ea1aa5283e8e..a00eabd06427 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -417,6 +417,15 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 		rcu_assign_pointer(head->current_path, NULL);
 }
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+	struct nvme_ns_head *head = ns->head;
+
+	if (head->disk && list_empty(&head->list))
+		kblockd_schedule_work(&head->requeue_work);
+}
+
 #else
 static inline void nvme_failover_req(struct request *req)
 {
@@ -448,6 +457,9 @@ static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
 static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 }
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 #ifdef CONFIG_NVM

From 254beb84faccbe2f4eda0b51924857bdfb679969 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Thu, 21 Dec 2017 14:15:47 -0800
Subject: [PATCH 522/808] nvme-fcloop: avoid possible uninitialized variable
 warning

The kbuild test robot send mail of a potential use of an uninitialized
variable - "tport" in fcloop_delete_targetport() which then calls
__targetport_unreg() which uses the variable. It will never be the
case it is uninitialized as the call to __targetport_unreg() only
occurs if there is a valid nport pointer. And at the time the nport
pointer is assigned, the tport variable is set.

Remove the warning by assigning a NULL value initially.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 7b75d9de55ab..6a018a0bd6ce 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1085,7 +1085,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
 		const char *buf, size_t count)
 {
 	struct fcloop_nport *nport = NULL, *tmpport;
-	struct fcloop_tport *tport;
+	struct fcloop_tport *tport = NULL;
 	u64 nodename, portname;
 	unsigned long flags;
 	int ret;

From a31e58e129f73ab5b04016330b13ed51fde7a961 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 28 Dec 2017 11:33:33 +0100
Subject: [PATCH 523/808] x86/apic: Switch all APICs to Fixed delivery mode

Some of the APIC incarnations are operating in lowest priority delivery
mode. This worked as long as the vector management code allocated the same
vector on all possible CPUs for each interrupt.

Lowest priority delivery mode does not necessarily respect the affinity
setting and may redirect to some other online CPU. This was documented
somewhere in the old code and the conversion to single target delivery
missed to update the delivery mode of the affected APIC drivers which
results in spurious interrupts on some of the affected CPU/Chipset
combinations.

Switch the APIC drivers over to Fixed delivery mode and remove all
leftovers of lowest priority delivery mode.

Switching to Fixed delivery mode is not a problem on these CPUs because the
kernel already uses Fixed delivery mode for IPIs. The reason for this is
that th SDM explicitely forbids lowest prio mode for IPIs. The reason is
obvious: If the irq routing does not honor destination targets in lowest
prio mode then an IPI targeted at CPU1 might end up on CPU0, which would be
a fatal problem in many cases.

As a consequence of this change, the apic::irq_delivery_mode field is now
pointless, but this needs to be cleaned up in a separate patch.

Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity")
Reported-by: vcaputo@pengaru.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: vcaputo@pengaru.com
Cc: Pavel Machek <pavel@ucw.cz>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos
---
 arch/x86/kernel/apic/apic_flat_64.c   | 2 +-
 arch/x86/kernel/apic/apic_noop.c      | 2 +-
 arch/x86/kernel/apic/msi.c            | 8 ++------
 arch/x86/kernel/apic/probe_32.c       | 2 +-
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 drivers/pci/host/pci-hyperv.c         | 8 ++------
 6 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index aa85690e9b64..25a87028cb3f 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 1, /* logical */
 
 	.disable_esr			= 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 7b659c4480c9..5078b5ce63a7 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= noop_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	/* logical delivery broadcast to all CPUs: */
 	.irq_dest_mode			= 1,
 
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9b18be764422..ce503c99f5c4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 		((apic->irq_dest_mode == 0) ?
 			MSI_ADDR_DEST_MODE_PHYSICAL :
 			MSI_ADDR_DEST_MODE_LOGICAL) |
-		((apic->irq_delivery_mode != dest_LowestPrio) ?
-			MSI_ADDR_REDIRECTION_CPU :
-			MSI_ADDR_REDIRECTION_LOWPRI) |
+		MSI_ADDR_REDIRECTION_CPU |
 		MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
 	msg->data =
 		MSI_DATA_TRIGGER_EDGE |
 		MSI_DATA_LEVEL_ASSERT |
-		((apic->irq_delivery_mode != dest_LowestPrio) ?
-			MSI_DATA_DELIVERY_FIXED :
-			MSI_DATA_DELIVERY_LOWPRI) |
+		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
 }
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fa22017de806..02e8acb134f8 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
 	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= default_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	/* logical delivery broadcast to all CPUs: */
 	.irq_dest_mode			= 1,
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 622f13ca8a94..8b04234e010b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
 	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
-	.irq_delivery_mode		= dest_LowestPrio,
+	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 1, /* logical */
 
 	.disable_esr			= 0,
diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 0fe3ea164ee5..e7d94473aedd 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -985,9 +985,7 @@ static u32 hv_compose_msi_req_v1(
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
 	int_pkt->int_desc.vector_count = 1;
-	int_pkt->int_desc.delivery_mode =
-		(apic->irq_delivery_mode == dest_LowestPrio) ?
-			dest_LowestPrio : dest_Fixed;
+	int_pkt->int_desc.delivery_mode = dest_Fixed;
 
 	/*
 	 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
@@ -1008,9 +1006,7 @@ static u32 hv_compose_msi_req_v2(
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
 	int_pkt->int_desc.vector_count = 1;
-	int_pkt->int_desc.delivery_mode =
-		(apic->irq_delivery_mode == dest_LowestPrio) ?
-			dest_LowestPrio : dest_Fixed;
+	int_pkt->int_desc.delivery_mode = dest_Fixed;
 
 	/*
 	 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten

From 8880c13734af33635118a1e9567dadc7f9ddb7a8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:29:15 +0100
Subject: [PATCH 524/808] gpio: brcmstb: Make really use of the new lockdep
 class

The recent extension of irq_set_lockdep_class() with a second argument
added the new lockdep class to the mrcmstb driver, but used the already
existing lockdep class as second argument, which leaves the new lockdep
class defined but unused.

Use the new lockdep class as that's what the change intended to do.

Fixes: 39c3fd58952d ("kernel/irq: Extend lockdep class for request mutex")
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: linus.walleij@linaro.org
---
 drivers/gpio/gpio-brcmstb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 5b24801bffef..bb4f8cf18bd9 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -348,7 +348,7 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
 	if (ret < 0)
 		return ret;
 	irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class,
-			      &brcmstb_gpio_irq_lock_class);
+			      &brcmstb_gpio_irq_request_class);
 	irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq);
 	irq_set_noprobe(irq);
 	return 0;

From da5dd9e854d2edd6b02ebfe28583052f922104da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 10:42:10 +0100
Subject: [PATCH 525/808] genirq/msi: Handle reactivation only on success

When analyzing the fallout of the x86 vector allocation rework it turned
out that the error handling in msi_domain_alloc_irqs() is broken.

If MSI_FLAG_MUST_REACTIVATE is set for a MSI domain then it clears the
activation flag for a successfully initialized msi descriptor. If a
subsequent initialization fails then the error handling code path does not
deactivate the interrupt because the activation flag got cleared.

Move the clearing of the activation flag outside of the initialization loop
so that an eventual failure can be cleaned up correctly.

Fixes: 22d0b12f3560 ("genirq/irqdomain: Add force reactivation flag to irq domains")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 kernel/irq/msi.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..9ba954331171 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 	return ret;
 }
 
+static bool msi_check_reservation_mode(struct msi_domain_info *info)
+{
+	if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+		return false;
+	return true;
+}
+
 /**
  * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
  * @domain:	The domain to allocate from
@@ -353,9 +360,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 {
 	struct msi_domain_info *info = domain->host_data;
 	struct msi_domain_ops *ops = info->ops;
-	msi_alloc_info_t arg;
+	struct irq_data *irq_data;
 	struct msi_desc *desc;
+	msi_alloc_info_t arg;
 	int i, ret, virq;
+	bool can_reserve;
 
 	ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
 	if (ret)
@@ -385,6 +394,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 	if (ops->msi_finish)
 		ops->msi_finish(&arg, 0);
 
+	can_reserve = msi_check_reservation_mode(info);
+
 	for_each_msi_entry(desc, dev) {
 		virq = desc->irq;
 		if (desc->nvec_used == 1)
@@ -397,15 +408,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		 * the MSI entries before the PCI layer enables MSI in the
 		 * card. Otherwise the card latches a random msi message.
 		 */
-		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
-			struct irq_data *irq_data;
+		if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
+			continue;
 
+		irq_data = irq_domain_get_irq_data(domain, desc->irq);
+		ret = irq_domain_activate_irq(irq_data, true);
+		if (ret)
+			goto cleanup;
+	}
+
+	/*
+	 * If these interrupts use reservation mode, clear the activated bit
+	 * so request_irq() will assign the final vector.
+	 */
+	if (can_reserve) {
+		for_each_msi_entry(desc, dev) {
 			irq_data = irq_domain_get_irq_data(domain, desc->irq);
-			ret = irq_domain_activate_irq(irq_data, true);
-			if (ret)
-				goto cleanup;
-			if (info->flags & MSI_FLAG_MUST_REACTIVATE)
-				irqd_clr_activated(irq_data);
+			irqd_clr_activated(irq_data);
 		}
 	}
 	return 0;

From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:44:34 +0100
Subject: [PATCH 526/808] genirq: Introduce IRQD_CAN_RESERVE flag

Add a new flag to mark interrupts which can use reservation mode. This is
going to be used in subsequent patches to disable reservation mode for a
certain class of MSI devices.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 include/linux/irq.h  | 17 +++++++++++++++++
 kernel/irq/debugfs.c |  1 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e140f69163b6..a0231e96a578 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -212,6 +212,7 @@ struct irq_data {
  *				  mask. Applies only to affinity managed irqs.
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
+ * IRQD_CAN_RESERVE		- Can use reservation mode
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -233,6 +234,7 @@ enum {
 	IRQD_MANAGED_SHUTDOWN		= (1 << 23),
 	IRQD_SINGLE_TARGET		= (1 << 24),
 	IRQD_DEFAULT_TRIGGER_SET	= (1 << 25),
+	IRQD_CAN_RESERVE		= (1 << 26),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_shutdown(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN;
 }
 
+static inline void irqd_set_can_reserve(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_CAN_RESERVE;
+}
+
+static inline void irqd_clr_can_reserve(struct irq_data *d)
+{
+	__irqd_to_state(d) &= ~IRQD_CAN_RESERVE;
+}
+
+static inline bool irqd_can_reserve(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
 	BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
 
 	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
 

From 945f50a591783ac6e9bd59694f34d1ba03b778a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:57:00 +0100
Subject: [PATCH 527/808] x86/vector: Use IRQD_CAN_RESERVE flag

Set the new CAN_RESERVE flag when the initial reservation for an interrupt
happens. The flag is used in a subsequent patch to disable reservation mode
for a certain class of MSI devices.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 arch/x86/kernel/apic/vector.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 750449152b04..1e969dba0476 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
 	irq_matrix_reserve(vector_matrix);
 	apicd->can_reserve = true;
 	apicd->has_reserved = true;
+	irqd_set_can_reserve(irqd);
 	trace_vector_reserve(irqd->irq, 0);
 	vector_assign_managed_shutdown(irqd);
 }
@@ -478,6 +479,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
 	} else {
 		/* Release the vector */
 		apicd->can_reserve = true;
+		irqd_set_can_reserve(irqd);
 		clear_irq_vector(irqd);
 		realloc = true;
 	}

From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 16:59:06 +0100
Subject: [PATCH 528/808] genirq/irqdomain: Rename early argument of
 irq_domain_activate_irq()

The 'early' argument of irq_domain_activate_irq() is actually used to
denote reservation mode. To avoid confusion, rename it before abuse
happens.

No functional change.

Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexandru Chirvasitu <achirvasub@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
---
 arch/x86/include/asm/irqdomain.h         |  2 +-
 arch/x86/include/asm/trace/irq_vectors.h | 16 ++++++++--------
 arch/x86/kernel/apic/io_apic.c           |  2 +-
 arch/x86/kernel/apic/vector.c            |  6 +++---
 arch/x86/platform/uv/uv_irq.c            |  2 +-
 drivers/gpio/gpio-xgene-sb.c             |  2 +-
 drivers/iommu/amd_iommu.c                |  2 +-
 drivers/iommu/intel_irq_remapping.c      |  2 +-
 drivers/irqchip/irq-gic-v3-its.c         |  4 ++--
 drivers/pinctrl/stm32/pinctrl-stm32.c    |  2 +-
 include/linux/irqdomain.h                |  2 +-
 kernel/irq/internals.h                   |  2 +-
 kernel/irq/irqdomain.c                   | 13 +++++++------
 13 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index 139feef467f7..c066ffae222b 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
 extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nr_irqs);
 extern int mp_irqdomain_activate(struct irq_domain *domain,
-				 struct irq_data *irq_data, bool early);
+				 struct irq_data *irq_data, bool reserve);
 extern void mp_irqdomain_deactivate(struct irq_domain *domain,
 				    struct irq_data *irq_data);
 extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 84b9ec0c1bc0..22647a642e98 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed,
 DECLARE_EVENT_CLASS(vector_activate,
 
 	TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
-		 bool early),
+		 bool reserve),
 
-	TP_ARGS(irq, is_managed, can_reserve, early),
+	TP_ARGS(irq, is_managed, can_reserve, reserve),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	irq		)
 		__field(	bool,		is_managed	)
 		__field(	bool,		can_reserve	)
-		__field(	bool,		early		)
+		__field(	bool,		reserve		)
 	),
 
 	TP_fast_assign(
 		__entry->irq		= irq;
 		__entry->is_managed	= is_managed;
 		__entry->can_reserve	= can_reserve;
-		__entry->early		= early;
+		__entry->reserve	= reserve;
 	),
 
-	TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d",
+	TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
 		  __entry->irq, __entry->is_managed, __entry->can_reserve,
-		  __entry->early)
+		  __entry->reserve)
 );
 
 #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name)				\
 DEFINE_EVENT_FN(vector_activate, name,					\
 	TP_PROTO(unsigned int irq, bool is_managed,			\
-		 bool can_reserve, bool early),				\
-	TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL);	\
+		 bool can_reserve, bool reserve),			\
+	TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL);	\
 
 DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
 DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 201579dc5242..8a7963421460 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 }
 
 int mp_irqdomain_activate(struct irq_domain *domain,
-			  struct irq_data *irq_data, bool early)
+			  struct irq_data *irq_data, bool reserve)
 {
 	unsigned long flags;
 
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 1e969dba0476..52c85c8147e9 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -399,21 +399,21 @@ static int activate_managed(struct irq_data *irqd)
 }
 
 static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
-			       bool early)
+			       bool reserve)
 {
 	struct apic_chip_data *apicd = apic_chip_data(irqd);
 	unsigned long flags;
 	int ret = 0;
 
 	trace_vector_activate(irqd->irq, apicd->is_managed,
-			      apicd->can_reserve, early);
+			      apicd->can_reserve, reserve);
 
 	/* Nothing to do for fixed assigned vectors */
 	if (!apicd->can_reserve && !apicd->is_managed)
 		return 0;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	if (early || irqd_is_managed_and_shutdown(irqd))
+	if (reserve || irqd_is_managed_and_shutdown(irqd))
 		vector_assign_managed_shutdown(irqd);
 	else if (apicd->is_managed)
 		ret = activate_managed(irqd);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 5f6fd860820a..e4cb9f4cde8a 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
 static int uv_domain_activate(struct irq_domain *domain,
-			      struct irq_data *irq_data, bool early)
+			      struct irq_data *irq_data, bool reserve)
 {
 	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 	return 0;
diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c
index 2313af82fad3..acd59113e08b 100644
--- a/drivers/gpio/gpio-xgene-sb.c
+++ b/drivers/gpio/gpio-xgene-sb.c
@@ -139,7 +139,7 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio)
 
 static int xgene_gpio_sb_domain_activate(struct irq_domain *d,
 					 struct irq_data *irq_data,
-					 bool early)
+					 bool reserve)
 {
 	struct xgene_gpio_sb *priv = d->host_data;
 	u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq);
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 7d5eb004091d..97baf88d9505 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4184,7 +4184,7 @@ static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
 			       struct irq_cfg *cfg);
 
 static int irq_remapping_activate(struct irq_domain *domain,
-				  struct irq_data *irq_data, bool early)
+				  struct irq_data *irq_data, bool reserve)
 {
 	struct amd_ir_data *data = irq_data->chip_data;
 	struct irq_2_irte *irte_info = &data->irq_2_irte;
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 76a193c7fcfc..66f69af2c219 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1397,7 +1397,7 @@ static void intel_irq_remapping_free(struct irq_domain *domain,
 }
 
 static int intel_irq_remapping_activate(struct irq_domain *domain,
-					struct irq_data *irq_data, bool early)
+					struct irq_data *irq_data, bool reserve)
 {
 	intel_ir_reconfigure_irte(irq_data, true);
 	return 0;
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4039e64cd342..06f025fd5726 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2303,7 +2303,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 }
 
 static int its_irq_domain_activate(struct irq_domain *domain,
-				   struct irq_data *d, bool early)
+				   struct irq_data *d, bool reserve)
 {
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
@@ -2818,7 +2818,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 }
 
 static int its_vpe_irq_domain_activate(struct irq_domain *domain,
-				       struct irq_data *d, bool early)
+				       struct irq_data *d, bool reserve)
 {
 	struct its_vpe *vpe = irq_data_get_irq_chip_data(d);
 	struct its_node *its;
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index a276c61be217..e62ab087bfd8 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -290,7 +290,7 @@ static int stm32_gpio_domain_translate(struct irq_domain *d,
 }
 
 static int stm32_gpio_domain_activate(struct irq_domain *d,
-				      struct irq_data *irq_data, bool early)
+				      struct irq_data *irq_data, bool reserve)
 {
 	struct stm32_gpio_bank *bank = d->host_data;
 	struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent);
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index a34355d19546..48c7e86bb556 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -113,7 +113,7 @@ struct irq_domain_ops {
 		     unsigned int nr_irqs, void *arg);
 	void (*free)(struct irq_domain *d, unsigned int virq,
 		     unsigned int nr_irqs);
-	int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early);
+	int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve);
 	void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data);
 	int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
 			 unsigned long *out_hwirq, unsigned int *out_type);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
 {
 	irqd_set_activated(data);
 	return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
 	}
 }
 
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
 {
 	int ret = 0;
 
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 
 		if (irqd->parent_data)
 			ret = __irq_domain_activate_irq(irqd->parent_data,
-							early);
+							reserve);
 		if (!ret && domain->ops->activate) {
-			ret = domain->ops->activate(domain, irqd, early);
+			ret = domain->ops->activate(domain, irqd, reserve);
 			/* Rollback in case of error */
 			if (ret && irqd->parent_data)
 				__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 /**
  * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
  *			     interrupt
- * @irq_data:	outermost irq_data associated with interrupt
+ * @irq_data:	Outermost irq_data associated with interrupt
+ * @reserve:	If set only reserve an interrupt vector instead of assigning one
  *
  * This is the second step to call domain_ops->activate to program interrupt
  * controllers, so the interrupt could actually get delivered.
  */
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
 {
 	int ret = 0;
 
 	if (!irqd_is_activated(irq_data))
-		ret = __irq_domain_activate_irq(irq_data, early);
+		ret = __irq_domain_activate_irq(irq_data, reserve);
 	if (!ret)
 		irqd_set_activated(irq_data);
 	return ret;

From bc976233a872c0f20f018fb1e89264a541584e25 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Dec 2017 10:47:22 +0100
Subject: [PATCH 529/808] genirq/msi, x86/vector: Prevent reservation mode for
 non maskable MSI

The new reservation mode for interrupts assigns a dummy vector when the
interrupt is allocated and assigns a real vector when the interrupt is
requested. The reservation mode prevents vector pressure when devices with
a large amount of queues/interrupts are initialized, but only a minimal
subset of those queues/interrupts is actually used.

This mode has an issue with MSI interrupts which cannot be masked. If the
driver is not careful or the hardware emits an interrupt before the device
irq is requestd by the driver then the interrupt ends up on the dummy
vector as a spurious interrupt which can cause malfunction of the device or
in the worst case a lockup of the machine.

Change the logic for the reservation mode so that the early activation of
MSI interrupts checks whether:

 - the device is a PCI/MSI device
 - the reservation mode of the underlying irqdomain is activated
 - PCI/MSI masking is globally enabled
 - the PCI/MSI device uses either MSI-X, which supports masking, or
   MSI with the maskbit supported.

If one of those conditions is false, then clear the reservation mode flag
in the irq data of the interrupt and invoke irq_domain_activate_irq() with
the reserve argument cleared. In the x86 vector code, clear the can_reserve
flag in the vector allocation data so a subsequent free_irq() won't create
the same situation again. The interrupt stays assigned to a real vector
until pci_disable_msi() is invoked and all allocations are undone.

Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode")
Reported-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Reported-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Mikael Pettersson <mikpelinux@gmail.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Sakari Ailus <sakari.ailus@intel.com>,
Cc: linux-media@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos
---
 arch/x86/kernel/apic/vector.c | 12 +++++++++++-
 kernel/irq/msi.c              | 37 +++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 52c85c8147e9..f8b03bb8e725 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -369,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
 	int ret;
 
 	ret = assign_irq_vector_any_locked(irqd);
-	if (!ret)
+	if (!ret) {
 		apicd->has_reserved = false;
+		/*
+		 * Core might have disabled reservation mode after
+		 * allocating the irq descriptor. Ideally this should
+		 * happen before allocation time, but that would require
+		 * completely convoluted ways of transporting that
+		 * information.
+		 */
+		if (!irqd_can_reserve(irqd))
+			apicd->can_reserve = false;
+	}
 	return ret;
 }
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9ba954331171..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 	return ret;
 }
 
-static bool msi_check_reservation_mode(struct msi_domain_info *info)
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+				       struct msi_domain_info *info,
+				       struct device *dev)
 {
+	struct msi_desc *desc;
+
+	if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+		return false;
+
 	if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
 		return false;
-	return true;
+
+	if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+		return false;
+
+	/*
+	 * Checking the first MSI descriptor is sufficient. MSIX supports
+	 * masking and MSI does so when the maskbit is set.
+	 */
+	desc = first_msi_entry(dev);
+	return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
 }
 
 /**
@@ -394,7 +421,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 	if (ops->msi_finish)
 		ops->msi_finish(&arg, 0);
 
-	can_reserve = msi_check_reservation_mode(info);
+	can_reserve = msi_check_reservation_mode(domain, info, dev);
 
 	for_each_msi_entry(desc, dev) {
 		virq = desc->irq;
@@ -412,7 +439,9 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			continue;
 
 		irq_data = irq_domain_get_irq_data(domain, desc->irq);
-		ret = irq_domain_activate_irq(irq_data, true);
+		if (!can_reserve)
+			irqd_clr_can_reserve(irq_data);
+		ret = irq_domain_activate_irq(irq_data, can_reserve);
 		if (ret)
 			goto cleanup;
 	}

From ced6d5c11d3e7b342f1a80f908e6756ebd4b8ddd Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:12 +0100
Subject: [PATCH 530/808] timers: Use deferrable base independent of
 base::nohz_active

During boot and before base::nohz_active is set in the timer bases, deferrable
timers are enqueued into the standard timer base. This works correctly as
long as base::nohz_active is false.

Once it base::nohz_active is set and a timer which was enqueued before that
is accessed the lock selector code choses the lock of the deferred
base. This causes unlocked access to the standard base and in case the
timer is removed it does not clear the pending flag in the standard base
bitmap which causes get_next_timer_interrupt() to return bogus values.

To prevent that, the deferrable timers must be enqueued in the deferrable
base, even when base::nohz_active is not set. Those deferrable timers also
need to be expired unconditional.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20171222145337.633328378@linutronix.de
---
 kernel/time/timer.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..19a9c3da7698 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
 	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
 
 	/*
-	 * If the timer is deferrable and nohz is active then we need to use
-	 * the deferrable base.
+	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
+	 * to use the deferrable base.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
-	    (tflags & TIMER_DEFERRABLE))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
 		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
 	return base;
 }
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
 	/*
-	 * If the timer is deferrable and nohz is active then we need to use
-	 * the deferrable base.
+	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
+	 * to use the deferrable base.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
-	    (tflags & TIMER_DEFERRABLE))
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
 		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
 	return base;
 }
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 	base->must_forward_clk = false;
 
 	__run_timers(base);
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
 

From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Dec 2017 21:37:25 +0100
Subject: [PATCH 531/808] timers: Reinitialize per cpu bases on hotplug

The timer wheel bases are not (re)initialized on CPU hotplug. That leaves
them with a potentially stale clk and next_expiry valuem, which can cause
trouble then the CPU is plugged.

Add a prepare callback which forwards the clock, sets next_expiry to far in
the future and reset the control flags to a known state.

Set base->must_forward_clk so the first timer which is queued will try to
forward the clock to current jiffies.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos
---
 include/linux/cpuhotplug.h |  2 +-
 include/linux/timer.h      |  4 +++-
 kernel/cpu.c               |  4 ++--
 kernel/time/timer.c        | 15 +++++++++++++++
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 201ab7267986..1a32e558eb11 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -86,7 +86,7 @@ enum cpuhp_state {
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
-	CPUHP_TIMERS_DEAD,
+	CPUHP_TIMERS_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 04af640ea95b..2448f9cc48a3 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j);
 unsigned long round_jiffies_up_relative(unsigned long j);
 
 #ifdef CONFIG_HOTPLUG_CPU
+int timers_prepare_cpu(unsigned int cpu);
 int timers_dead_cpu(unsigned int cpu);
 #else
-#define timers_dead_cpu NULL
+#define timers_prepare_cpu	NULL
+#define timers_dead_cpu		NULL
 #endif
 
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..97858477e586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 	 * before blk_mq_queue_reinit_notify() from notify_dead(),
 	 * otherwise a RCU stall occurs.
 	 */
-	[CPUHP_TIMERS_DEAD] = {
+	[CPUHP_TIMERS_PREPARE] = {
 		.name			= "timers:dead",
-		.startup.single		= NULL,
+		.startup.single		= timers_prepare_cpu,
 		.teardown.single	= timers_dead_cpu,
 	},
 	/* Kicks the plugged cpu into life */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 19a9c3da7698..6be576e02209 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
 	}
 }
 
+int timers_prepare_cpu(unsigned int cpu)
+{
+	struct timer_base *base;
+	int b;
+
+	for (b = 0; b < NR_BASES; b++) {
+		base = per_cpu_ptr(&timer_bases[b], cpu);
+		base->clk = jiffies;
+		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+		base->is_idle = false;
+		base->must_forward_clk = true;
+	}
+	return 0;
+}
+
 int timers_dead_cpu(unsigned int cpu)
 {
 	struct timer_base *old_base;

From 5d62c183f9e9df1deeea0906d099a94e8a43047a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:13 +0100
Subject: [PATCH 532/808] nohz: Prevent a timer interrupt storm in
 tick_nohz_stop_sched_tick()

The conditions in irq_exit() to invoke tick_nohz_irq_exit() which
subsequently invokes tick_nohz_stop_sched_tick() are:

  if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))

If need_resched() is not set, but a timer softirq is pending then this is
an indication that the softirq code punted and delegated the execution to
softirqd. need_resched() is not true because the current interrupted task
takes precedence over softirqd.

Invoking tick_nohz_irq_exit() in this case can cause an endless loop of
timer interrupts because the timer wheel contains an expired timer, but
softirqs are not yet executed. So it returns an immediate expiry request,
which causes the timer to fire immediately again. Lather, rinse and
repeat....

Prevent that by adding a check for a pending timer soft interrupt to the
conditions in tick_nohz_stop_sched_tick() which avoid calling
get_next_timer_interrupt(). That keeps the tick sched timer on the tick and
prevents a repetitive programming of an already expired timer.

Reported-by: Sebastian Siewior <bigeasy@linutronix.d>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos
---
 kernel/time/tick-sched.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 77555faf6fbc..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 	ts->next_tick = 0;
 }
 
+static inline bool local_timer_softirq_pending(void)
+{
+	return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	} while (read_seqretry(&jiffies_lock, seq));
 	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(basemono, &next_rcu) ||
-	    arch_needs_cpu() || irq_work_needs_cpu()) {
+	/*
+	 * Keep the periodic tick, when RCU, architecture or irq_work
+	 * requests it.
+	 * Aside of that check whether the local timer softirq is
+	 * pending. If so its a bad idea to call get_next_timer_interrupt()
+	 * because there is an already expired timer, so it will request
+	 * immeditate expiry, which rearms the hardware timer with a
+	 * minimal delta which brings us back to this place
+	 * immediately. Lather, rinse and repeat...
+	 */
+	if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
 		next_tick = basemono + TICK_NSEC;
 	} else {
 		/*

From fd45bb77ad682be728d1002431d77b8c73342836 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:14 +0100
Subject: [PATCH 533/808] timers: Invoke timer_start_debug() where it makes
 sense

The timer start debug function is called before the proper timer base is
set. As a consequence the trace data contains the stale CPU and flags
values.

Call the debug function after setting the new base and flags.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: https://lkml.kernel.org/r/20171222145337.792907137@linutronix.de
---
 kernel/time/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 6be576e02209..89a9e1b4264a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
 		goto out_unlock;
 
-	debug_activate(timer, expires);
-
 	new_base = get_target_base(base, timer->flags);
 
 	if (base != new_base) {
@@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 		}
 	}
 
+	debug_activate(timer, expires);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance

From 9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 22 Dec 2017 15:51:15 +0100
Subject: [PATCH 534/808] timerqueue: Document return values of
 timerqueue_add/del()

The return values of timerqueue_add/del() are not documented in the kernel doc
comment. Add proper documentation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: rt@linutronix.de
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: https://lkml.kernel.org/r/20171222145337.872681338@linutronix.de
---
 lib/timerqueue.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 4a720ed4fdaf..0d54bcbc8170 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -33,8 +33,9 @@
  * @head: head of timerqueue
  * @node: timer node to be added
  *
- * Adds the timer node to the timerqueue, sorted by the
- * node's expires value.
+ * Adds the timer node to the timerqueue, sorted by the node's expires
+ * value. Returns true if the newly added timer is the first expiring timer in
+ * the queue.
  */
 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
@@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
  * @head: head of timerqueue
  * @node: timer node to be removed
  *
- * Removes the timer node from the timerqueue.
+ * Removes the timer node from the timerqueue. Returns true if the queue is
+ * not empty after the remove.
  */
 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {

From 3ce120b16cc548472f80cf8644f90eda958cf1b6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 29 Dec 2017 17:34:43 -0800
Subject: [PATCH 535/808] kbuild: add '-fno-stack-check' to kernel build
 options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It appears that hardened gentoo enables "-fstack-check" by default for
gcc.

That doesn't work _at_all_ for the kernel, because the kernel stack
doesn't act like a user stack at all: it's much smaller, and it doesn't
auto-expand on use.  So the extra "probe one page below the stack" code
generated by -fstack-check just breaks the kernel in horrible ways,
causing infinite double faults etc.

[ I have to say, that the particular code gcc generates looks very
  stupid even for user space where it works, but that's a separate
  issue.  ]

Reported-and-tested-by: Alexander Tsoy <alexander@tsoy.me>
Reported-and-tested-by: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile b/Makefile
index ac8c441866b7..92b74bcd3c2a 100644
--- a/Makefile
+++ b/Makefile
@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS	+= $(call cc-option,-fno-strict-overflow)
 
+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS  += $(call cc-option,-fno-stack-check,)
+
 # conserve stack if available
 KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)
 

From d89e426499cf36b96161bd32970d6783f1fbcb0e Mon Sep 17 00:00:00 2001
From: Simon Ser <contact@emersion.fr>
Date: Sat, 30 Dec 2017 14:43:31 -0600
Subject: [PATCH 536/808] objtool: Fix seg fault caused by missing parameter

Fix a seg fault when no parameter is provided to 'objtool orc'.

Signed-off-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/9172803ec7ebb72535bcd0b7f966ae96d515968e.1514666459.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/builtin-orc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c
index 4c6b5c9ef073..91e8e19ff5e0 100644
--- a/tools/objtool/builtin-orc.c
+++ b/tools/objtool/builtin-orc.c
@@ -44,6 +44,9 @@ int cmd_orc(int argc, const char **argv)
 	const char *objname;
 
 	argc--; argv++;
+	if (argc <= 0)
+		usage_with_options(orc_usage, check_options);
+
 	if (!strncmp(argv[0], "gen", 3)) {
 		argc = parse_options(argc, argv, check_options, orc_usage, 0);
 		if (argc != 1)
@@ -52,7 +55,6 @@ int cmd_orc(int argc, const char **argv)
 		objname = argv[0];
 
 		return check(objname, no_fp, no_unreachable, true);
-
 	}
 
 	if (!strcmp(argv[0], "dump")) {

From ce90aaf5cde4ce057b297bb6c955caf16ef00ee6 Mon Sep 17 00:00:00 2001
From: Simon Ser <contact@emersion.fr>
Date: Sat, 30 Dec 2017 14:43:32 -0600
Subject: [PATCH 537/808] objtool: Fix seg fault with clang-compiled objects

Fix a seg fault which happens when an input file provided to 'objtool
orc generate' doesn't have a '.shstrtab' section (for instance, object
files produced by clang don't have this section).

Signed-off-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c0f2231683e9bed40fac1f13ce2c33b8389854bc.1514666459.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/orc_gen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index e5ca31429c9b..e61fe703197b 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -165,6 +165,8 @@ int create_orc_sections(struct objtool_file *file)
 
 	/* create .orc_unwind_ip and .rela.orc_unwind_ip sections */
 	sec = elf_create_section(file->elf, ".orc_unwind_ip", sizeof(int), idx);
+	if (!sec)
+		return -1;
 
 	ip_relasec = elf_create_rela_section(file->elf, sec);
 	if (!ip_relasec)

From 322f8b8b340c824aef891342b0f5795d15e11562 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 30 Dec 2017 22:13:53 +0100
Subject: [PATCH 538/808] x86/smpboot: Remove stale TLB flush invocations

smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector()
invoke local_flush_tlb() for no obvious reason.

Digging in history revealed that the original code in the 2.1 era added
those because the code manipulated a swapper_pg_dir pagetable entry. The
pagetable manipulation was removed long ago in the 2.3 timeframe, but the
TLB flush invocations stayed around forever.

Remove them along with the pointless pr_debug()s which come from the same 2.1
change.

Reported-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33d6000265aa..c3402fc30865 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 	spin_lock_irqsave(&rtc_lock, flags);
 	CMOS_WRITE(0xa, 0xf);
 	spin_unlock_irqrestore(&rtc_lock, flags);
-	local_flush_tlb();
-	pr_debug("1.\n");
 	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
 							start_eip >> 4;
-	pr_debug("2.\n");
 	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
 							start_eip & 0xf;
-	pr_debug("3.\n");
 }
 
 static inline void smpboot_restore_warm_reset_vector(void)
 {
 	unsigned long flags;
 
-	/*
-	 * Install writable page 0 entry to set BIOS data area.
-	 */
-	local_flush_tlb();
-
 	/*
 	 * Paranoid:  Set warm reset code and vector here back
 	 * to default values.

From decab0888e6e14e11d53cefa85f8b3d3b45ce73c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 30 Dec 2017 22:13:54 +0100
Subject: [PATCH 539/808] x86/mm: Remove preempt_disable/enable() from
 __native_flush_tlb()

The preempt_disable/enable() pair in __native_flush_tlb() was added in
commit:

  5cf0791da5c1 ("x86/mm: Disable preemption during CR3 read+write")

... to protect the UP variant of flush_tlb_mm_range().

That preempt_disable/enable() pair should have been added to the UP variant
of flush_tlb_mm_range() instead.

The UP variant was removed with commit:

  ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code")

... but the preempt_disable/enable() pair stayed around.

The latest change to __native_flush_tlb() in commit:

  6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")

... added an access to a per CPU variable outside the preempt disabled
regions, which makes no sense at all. __native_flush_tlb() must always
be called with at least preemption disabled.

Remove the preempt_disable/enable() pair and add a WARN_ON_ONCE() to catch
bad callers independent of the smp_processor_id() debugging.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171230211829.679325424@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b519da4fc03c..f9b48ce152eb 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -345,15 +345,17 @@ static inline void invalidate_user_asid(u16 asid)
  */
 static inline void __native_flush_tlb(void)
 {
-	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change
-	 * during a task switch and therefore we must not be preempted
-	 * while we write CR3 back:
+	 * Preemption or interrupts must be disabled to protect the access
+	 * to the per CPU variable and to prevent being preempted between
+	 * read_cr3() and write_cr3().
 	 */
-	preempt_disable();
+	WARN_ON_ONCE(preemptible());
+
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
 	native_write_cr3(__native_read_cr3());
-	preempt_enable();
 }
 
 /*

From a62d69857aab4caa43049e72fe0ed5c4a60518dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Dec 2017 11:24:34 +0100
Subject: [PATCH 540/808] x86/ldt: Plug memory leak in error path

The error path in write_ldt() tries to free 'old_ldt' instead of the newly
allocated 'new_ldt', resulting in a memory leak. It also misses to clean up a
half populated LDT pagetable, which is not a leak as it gets cleaned up
when the process exits.

Free both the potentially half populated LDT pagetable and the newly
allocated LDT struct. This can be done unconditionally because once an LDT
is mapped subsequent maps will succeed, because the PTE page is already
populated and the two LDTs fit into that single page.

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on")
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1712311121340.1899@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ldt.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 579cc4a66fdf..500e90e44f86 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -421,7 +421,13 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	 */
 	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
 	if (error) {
-		free_ldt_struct(old_ldt);
+		/*
+		 * This only can fail for the first LDT setup. If an LDT is
+		 * already installed then the PTE page is already
+		 * populated. Mop up a half populated page table.
+		 */
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
 		goto out_unlock;
 	}
 

From 7f414195b0c3612acd12b4611a5fe75995cf10c7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Dec 2017 16:52:15 +0100
Subject: [PATCH 541/808] x86/ldt: Make LDT pgtable free conditional

Andy prefers to be paranoid about the pagetable free in the error path of
write_ldt(). Make it conditional and warn whenever the installment of a
secondary LDT fails.

Requested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ldt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 500e90e44f86..26d713ecad34 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -426,7 +426,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		 * already installed then the PTE page is already
 		 * populated. Mop up a half populated page table.
 		 */
-		free_ldt_pgtables(mm);
+		if (!WARN_ON_ONCE(old_ldt))
+			free_ldt_pgtables(mm);
 		free_ldt_struct(new_ldt);
 		goto out_unlock;
 	}

From c0b23903f5b077effec90769d365646a8c2faae0 Mon Sep 17 00:00:00 2001
From: Adam Borowski <kilobyte@angband.pl>
Date: Mon, 25 Dec 2017 16:38:58 +0100
Subject: [PATCH 542/808] MAINTAINERS: mark arch/blackfin/ and its gubbins as
 orphaned

The blackfin architecture has seen no maintainer action of any kind since
April 2015.  No new code, no pull requests, no acks to patches, no response
to mails, nothing.

The web site has an expired certificate (expiration Sep 2017, issued in
2013), the mailing list sees no answers either, with one exception:

  https://sourceforge.net/p/adi-buildroot/mailman/adi-buildroot-devel/
  >
  > Steven is no longer working on this for ADI. Acked by me if this works. Thanks.
  >
  > Best regards,
  > Aaron Wu
  > Analog Devices Inc.

But, Aaron doesn't seem to respond to queries either.

Signed-off-by: Adam Borowski <kilobyte@angband.pl>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6e86e20761e..2d0773007c89 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2621,24 +2621,22 @@ F:	fs/bfs/
 F:	include/uapi/linux/bfs_fs.h
 
 BLACKFIN ARCHITECTURE
-M:	Steven Miao <realmz6@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 T:	git git://git.code.sf.net/p/adi-linux/code
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	arch/blackfin/
 
 BLACKFIN EMAC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/net/ethernet/adi/
 
 BLACKFIN MEDIA DRIVER
-M:	Scott Jiang <scott.jiang.linux@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org/
-S:	Supported
+S:	Orphan
 F:	drivers/media/platform/blackfin/
 F:	drivers/media/i2c/adv7183*
 F:	drivers/media/i2c/vs6624*
@@ -2646,25 +2644,25 @@ F:	drivers/media/i2c/vs6624*
 BLACKFIN RTC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/rtc/rtc-bfin.c
 
 BLACKFIN SDH DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/mmc/host/bfin_sdh.c
 
 BLACKFIN SERIAL DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/tty/serial/bfin_uart.c
 
 BLACKFIN WATCHDOG DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/watchdog/bfin_wdt.c
 
 BLINKM RGB LED DRIVER

From 30a7acd573899fd8b8ac39236eff6468b195ac7d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 31 Dec 2017 14:47:43 -0800
Subject: [PATCH 543/808] Linux 4.15-rc6

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 92b74bcd3c2a..eb1f5973813e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From 4307413256ac1e09b8f53e8715af3df9e49beec3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= <flameeyes@flameeyes.eu>
Date: Fri, 29 Dec 2017 09:54:25 +0000
Subject: [PATCH 544/808] USB: serial: cp210x: add IDs for LifeScan OneTouch
 Verio IQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add IDs for the OneTouch Verio IQ that comes with an embedded
USB-to-serial converter.

Signed-off-by: Diego Elio Pettenò <flameeyes@flameeyes.eu>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/cp210x.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 7c6273bf5beb..38814225a816 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -124,6 +124,7 @@ static const struct usb_device_id id_table[] = {
 	{ USB_DEVICE(0x10C4, 0x8470) }, /* Juniper Networks BX Series System Console */
 	{ USB_DEVICE(0x10C4, 0x8477) }, /* Balluff RFID */
 	{ USB_DEVICE(0x10C4, 0x84B6) }, /* Starizona Hyperion */
+	{ USB_DEVICE(0x10C4, 0x85A7) }, /* LifeScan OneTouch Verio IQ */
 	{ USB_DEVICE(0x10C4, 0x85EA) }, /* AC-Services IBUS-IF */
 	{ USB_DEVICE(0x10C4, 0x85EB) }, /* AC-Services CIS-IBUS */
 	{ USB_DEVICE(0x10C4, 0x85F8) }, /* Virtenio Preon32 */

From dc32b5c3e6e2ef29cef76d9ce1b92d394446150e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jan 2018 09:28:31 -0600
Subject: [PATCH 545/808] capabilities: fix buffer overread on very short xattr

If userspace attempted to set a "security.capability" xattr shorter than
4 bytes (e.g. 'setfattr -n security.capability -v x file'), then
cap_convert_nscap() read past the end of the buffer containing the xattr
value because it accessed the ->magic_etc field without verifying that
the xattr value is long enough to contain that field.

Fix it by validating the xattr value size first.

This bug was found using syzkaller with KASAN.  The KASAN report was as
follows (cleaned up slightly):

    BUG: KASAN: slab-out-of-bounds in cap_convert_nscap+0x514/0x630 security/commoncap.c:498
    Read of size 4 at addr ffff88002d8741c0 by task syz-executor1/2852

    CPU: 0 PID: 2852 Comm: syz-executor1 Not tainted 4.15.0-rc6-00200-gcc0aac99d977 #253
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
    Call Trace:
     __dump_stack lib/dump_stack.c:17 [inline]
     dump_stack+0xe3/0x195 lib/dump_stack.c:53
     print_address_description+0x73/0x260 mm/kasan/report.c:252
     kasan_report_error mm/kasan/report.c:351 [inline]
     kasan_report+0x235/0x350 mm/kasan/report.c:409
     cap_convert_nscap+0x514/0x630 security/commoncap.c:498
     setxattr+0x2bd/0x350 fs/xattr.c:446
     path_setxattr+0x168/0x1b0 fs/xattr.c:472
     SYSC_setxattr fs/xattr.c:487 [inline]
     SyS_setxattr+0x36/0x50 fs/xattr.c:483
     entry_SYSCALL_64_fastpath+0x18/0x85

Fixes: 8db6c34f1dbc ("Introduce v3 namespaced file capabilities")
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 security/commoncap.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/security/commoncap.c b/security/commoncap.c
index 4f8e09340956..48620c93d697 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -348,21 +348,18 @@ static __u32 sansflags(__u32 m)
 	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
 }
 
-static bool is_v2header(size_t size, __le32 magic)
+static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
 {
-	__u32 m = le32_to_cpu(magic);
 	if (size != XATTR_CAPS_SZ_2)
 		return false;
-	return sansflags(m) == VFS_CAP_REVISION_2;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
 }
 
-static bool is_v3header(size_t size, __le32 magic)
+static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
 {
-	__u32 m = le32_to_cpu(magic);
-
 	if (size != XATTR_CAPS_SZ_3)
 		return false;
-	return sansflags(m) == VFS_CAP_REVISION_3;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
 }
 
 /*
@@ -405,7 +402,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 
 	fs_ns = inode->i_sb->s_user_ns;
 	cap = (struct vfs_cap_data *) tmpbuf;
-	if (is_v2header((size_t) ret, cap->magic_etc)) {
+	if (is_v2header((size_t) ret, cap)) {
 		/* If this is sizeof(vfs_cap_data) then we're ok with the
 		 * on-disk value, so return that.  */
 		if (alloc)
@@ -413,7 +410,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		else
 			kfree(tmpbuf);
 		return ret;
-	} else if (!is_v3header((size_t) ret, cap->magic_etc)) {
+	} else if (!is_v3header((size_t) ret, cap)) {
 		kfree(tmpbuf);
 		return -EINVAL;
 	}
@@ -470,9 +467,9 @@ static kuid_t rootid_from_xattr(const void *value, size_t size,
 	return make_kuid(task_ns, rootid);
 }
 
-static bool validheader(size_t size, __le32 magic)
+static bool validheader(size_t size, const struct vfs_cap_data *cap)
 {
-	return is_v2header(size, magic) || is_v3header(size, magic);
+	return is_v2header(size, cap) || is_v3header(size, cap);
 }
 
 /*
@@ -495,7 +492,7 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
 
 	if (!*ivalue)
 		return -EINVAL;
-	if (!validheader(size, cap->magic_etc))
+	if (!validheader(size, cap))
 		return -EINVAL;
 	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
 		return -EPERM;

From 98801506552593c9b8ac11021b0cdad12cab4f6b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 546/808] fscache: Fix the default for
 fscache_maybe_release_page()

Fix the default for fscache_maybe_release_page() for when the cookie isn't
valid or the page isn't cached.  It mustn't return false as that indicates
the page cannot yet be freed.

The problem with the default is that if, say, there's no cache, but a
network filesystem's pages are using up almost all the available memory, a
system can OOM because the filesystem ->releasepage() op will not allow
them to be released as fscache_maybe_release_page() incorrectly prevents
it.

This can be tested by writing a sequence of 512MiB files to an AFS mount.
It does not affect NFS or CIFS because both of those wrap the call in a
check of PG_fscache and it shouldn't bother Ceph as that only has
PG_private set whilst writeback is in progress.  This might be an issue for
9P, however.

Note that the pages aren't entirely stuck.  Removing a file or unmounting
will clear things because that uses ->invalidatepage() instead.

Fixes: 201a15428bd5 ("FS-Cache: Handle pages pending storage that get evicted under OOM conditions")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Marc Dionne <marc.dionne@auristor.com>
cc: stable@vger.kernel.org # 2.6.32+
---
 include/linux/fscache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index f4ff47d4a893..fe0c349684fa 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -755,7 +755,7 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
 {
 	if (fscache_cookie_valid(cookie) && PageFsCache(page))
 		return __fscache_maybe_release_page(cookie, page, gfp);
-	return false;
+	return true;
 }
 
 /**

From 7888da95832d50a87bbfdb9f40620ddc66f94b3c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 547/808] afs: Potential uninitialized variable in
 afs_extract_data()

Smatch warns that:

    fs/afs/rxrpc.c:922 afs_extract_data()
    error: uninitialized symbol 'remote_abort'.

Smatch is right that "remote_abort" might be uninitialized when we pass
it to afs_set_call_complete().  I don't know if that function uses the
uninitialized variable.  Anyway, the comment for rxrpc_kernel_recv_data(),
says that "*_abort should also be initialised to 0." and this patch does
that.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/rxrpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ea1460b9b71a..e1126659f043 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -885,7 +885,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 {
 	struct afs_net *net = call->net;
 	enum afs_call_state state;
-	u32 remote_abort;
+	u32 remote_abort = 0;
 	int ret;
 
 	_enter("{%s,%zu},,%zu,%d",

From 440fbc3a8a694467ba641234cedb96c28ab2d5fb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 548/808] afs: Fix unlink

Repeating creation and deletion of a file on an afs mount will run the box
out of memory, e.g.:

	dd if=/dev/zero of=/afs/scratch/m0 bs=$((1024*1024)) count=512
	rm /afs/scratch/m0

The problem seems to be that it's not properly decrementing the nlink count
so that the inode can be scrapped.

Note that this doesn't fix local creation followed by remote deletion.
That's harder to handle and will require a separate patch as we're not told
that the file has been deleted - only that the directory has changed.

Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c   | 37 +++++++++++++++++++++++++++++--------
 fs/afs/inode.c |  4 ++++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ff8d5bf4354f..23c7f395d718 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -895,20 +895,38 @@ error:
  * However, if we didn't have a callback promise outstanding, or it was
  * outstanding on a different server, then it won't break it either...
  */
-static int afs_dir_remove_link(struct dentry *dentry, struct key *key)
+static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
+			       unsigned long d_version_before,
+			       unsigned long d_version_after)
 {
+	bool dir_valid;
 	int ret = 0;
 
+	/* There were no intervening changes on the server if the version
+	 * number we got back was incremented by exactly 1.
+	 */
+	dir_valid = (d_version_after == d_version_before + 1);
+
 	if (d_really_is_positive(dentry)) {
 		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 
-		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-			kdebug("AFS_VNODE_DELETED");
-		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-
-		ret = afs_validate(vnode, key);
-		if (ret == -ESTALE)
+		if (dir_valid) {
+			drop_nlink(&vnode->vfs_inode);
+			if (vnode->vfs_inode.i_nlink == 0) {
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+			}
 			ret = 0;
+		} else {
+			clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				kdebug("AFS_VNODE_DELETED");
+
+			ret = afs_validate(vnode, key);
+			if (ret == -ESTALE)
+				ret = 0;
+		}
 		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
 	}
 
@@ -923,6 +941,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct key *key;
+	unsigned long d_version = (unsigned long)dentry->d_fsdata;
 	int ret;
 
 	_enter("{%x:%u},{%pd}",
@@ -955,7 +974,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
 		ret = afs_end_vnode_operation(&fc);
 		if (ret == 0)
-			ret = afs_dir_remove_link(dentry, key);
+			ret = afs_dir_remove_link(
+				dentry, key, d_version,
+				(unsigned long)dvnode->status.data_version);
 	}
 
 error_key:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 3415eb7484f6..1e81864ef0b2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -377,6 +377,10 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	}
 
 	read_sequnlock_excl(&vnode->cb_lock);
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		clear_nlink(&vnode->vfs_inode);
+
 	if (valid)
 		goto valid;
 

From afae457d874860a7e299d334f59eede5f3ad4b47 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jan 2018 10:02:19 +0000
Subject: [PATCH 549/808] afs: Fix missing error handling in afs_write_end()

afs_write_end() is missing page unlock and put if afs_fill_page() fails.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/write.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/afs/write.c b/fs/afs/write.c
index cb5f8a3df577..9370e2feb999 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -198,7 +198,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 			ret = afs_fill_page(vnode, key, pos + copied,
 					    len - copied, page);
 			if (ret < 0)
-				return ret;
+				goto out;
 		}
 		SetPageUptodate(page);
 	}
@@ -206,10 +206,12 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 	set_page_dirty(page);
 	if (PageDirty(page))
 		_debug("dirtied");
+	ret = copied;
+
+out:
 	unlock_page(page);
 	put_page(page);
-
-	return copied;
+	return ret;
 }
 
 /*

From ecb101aed86156ec7cd71e5dca668e09146e6994 Mon Sep 17 00:00:00 2001
From: John Sperbeck <jsperbeck@google.com>
Date: Sun, 31 Dec 2017 21:24:58 -0800
Subject: [PATCH 550/808] powerpc/mm: Fix SEGV on mapped region to return
 SEGV_ACCERR

The recent refactoring of the powerpc page fault handler in commit
c3350602e876 ("powerpc/mm: Make bad_area* helper functions") caused
access to protected memory regions to indicate SEGV_MAPERR instead of
the traditional SEGV_ACCERR in the si_code field of a user-space
signal handler. This can confuse debug libraries that temporarily
change the protection of memory regions, and expect to use SEGV_ACCERR
as an indication to restore access to a region.

This commit restores the previous behavior. The following program
exhibits the issue:

    $ ./repro read  || echo "FAILED"
    $ ./repro write || echo "FAILED"
    $ ./repro exec  || echo "FAILED"

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <signal.h>
    #include <sys/mman.h>
    #include <assert.h>

    static void segv_handler(int n, siginfo_t *info, void *arg) {
            _exit(info->si_code == SEGV_ACCERR ? 0 : 1);
    }

    int main(int argc, char **argv)
    {
            void *p = NULL;
            struct sigaction act = {
                    .sa_sigaction = segv_handler,
                    .sa_flags = SA_SIGINFO,
            };

            assert(argc == 2);
            p = mmap(NULL, getpagesize(),
                    (strcmp(argv[1], "write") == 0) ? PROT_READ : 0,
                    MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
            assert(p != MAP_FAILED);

            assert(sigaction(SIGSEGV, &act, NULL) == 0);
            if (strcmp(argv[1], "read") == 0)
                    printf("%c", *(unsigned char *)p);
            else if (strcmp(argv[1], "write") == 0)
                    *(unsigned char *)p = 0;
            else if (strcmp(argv[1], "exec") == 0)
                    ((void (*)(void))p)();
            return 1;  /* failed to generate SEGV */
    }

Fixes: c3350602e876 ("powerpc/mm: Make bad_area* helper functions")
Cc: stable@vger.kernel.org # v4.14+
Signed-off-by: John Sperbeck <jsperbeck@google.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Add commit references in change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..6e1e39035380 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)
 	return __bad_area(regs, address, SEGV_MAPERR);
 }
 
+static noinline int bad_access(struct pt_regs *regs, unsigned long address)
+{
+	return __bad_area(regs, address, SEGV_ACCERR);
+}
+
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
 		     unsigned int fault)
 {
@@ -490,7 +495,7 @@ retry:
 
 good_area:
 	if (unlikely(access_error(is_write, is_exec, vma)))
-		return bad_area(regs, address);
+		return bad_access(regs, address);
 
 	/*
 	 * If for any reason at all we couldn't handle the fault,

From e0093a89f2386f12cc87047b43e93c3c6e15e94e Mon Sep 17 00:00:00 2001
From: Dhinakaran Pandiyan <dhinakaran.pandiyan@gmail.com>
Date: Tue, 19 Dec 2017 20:35:20 -0800
Subject: [PATCH 551/808] drm/i915/psr: Fix register name mess up.

Commit 77affa31722b ("drm/i915/psr: Fix compiler warnings for
hsw_psr_disable()") swapped status and control registers while fixing
indentation. The _ctl at the end of the status register name must have to
led to this.

Fixes: 77affa31722b ("drm/i915/psr: Fix compiler warnings for hsw_psr_disable()")
References: https://www.mrc-cbu.cam.ac.uk/people/matt.davis/cmabridge/
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171220043520.2599-1-dhinakaran.pandiyan@intel.com
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
(cherry picked from commit 14c6547d6df641d3e41fa4f4164f6e267ebfab89)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_psr.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c
index 6e3b430fccdc..55ea5eb3b7df 100644
--- a/drivers/gpu/drm/i915/intel_psr.c
+++ b/drivers/gpu/drm/i915/intel_psr.c
@@ -590,7 +590,7 @@ static void hsw_psr_disable(struct intel_dp *intel_dp,
 	struct drm_i915_private *dev_priv = to_i915(dev);
 
 	if (dev_priv->psr.active) {
-		i915_reg_t psr_ctl;
+		i915_reg_t psr_status;
 		u32 psr_status_mask;
 
 		if (dev_priv->psr.aux_frame_sync)
@@ -599,24 +599,24 @@ static void hsw_psr_disable(struct intel_dp *intel_dp,
 					0);
 
 		if (dev_priv->psr.psr2_support) {
-			psr_ctl = EDP_PSR2_CTL;
+			psr_status = EDP_PSR2_STATUS_CTL;
 			psr_status_mask = EDP_PSR2_STATUS_STATE_MASK;
 
-			I915_WRITE(psr_ctl,
-				   I915_READ(psr_ctl) &
+			I915_WRITE(EDP_PSR2_CTL,
+				   I915_READ(EDP_PSR2_CTL) &
 				   ~(EDP_PSR2_ENABLE | EDP_SU_TRACK_ENABLE));
 
 		} else {
-			psr_ctl = EDP_PSR_STATUS_CTL;
+			psr_status = EDP_PSR_STATUS_CTL;
 			psr_status_mask = EDP_PSR_STATUS_STATE_MASK;
 
-			I915_WRITE(psr_ctl,
-				   I915_READ(psr_ctl) & ~EDP_PSR_ENABLE);
+			I915_WRITE(EDP_PSR_CTL,
+				   I915_READ(EDP_PSR_CTL) & ~EDP_PSR_ENABLE);
 		}
 
 		/* Wait till PSR is idle */
 		if (intel_wait_for_register(dev_priv,
-					    psr_ctl, psr_status_mask, 0,
+					    psr_status, psr_status_mask, 0,
 					    2000))
 			DRM_ERROR("Timed out waiting for PSR Idle State\n");
 

From 3488d0237f6364614f0c59d6d784bb79b11eeb92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Fri, 8 Dec 2017 23:37:36 +0200
Subject: [PATCH 552/808] drm/i915: Disable DC states around GMBUS on GLK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prevent the DMC from destroying GMBUS transfers on GLK. GMBUS
lives in PG1 so DC off is all we need.

Cc: stable@vger.kernel.org
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171208213739.16388-1-ville.syrjala@linux.intel.com
Reviewed-by: Dhinakaran Pandiyan <dhinakaran.pandiyan@intel.com>
(cherry picked from commit 156961ae7bdf6feb72778e8da83d321b273343fd)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_runtime_pm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 8af286c63d3b..9bf46ab211cb 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -1786,6 +1786,7 @@ void intel_display_power_put(struct drm_i915_private *dev_priv,
 	GLK_DISPLAY_POWERWELL_2_POWER_DOMAINS |		\
 	BIT_ULL(POWER_DOMAIN_MODESET) |			\
 	BIT_ULL(POWER_DOMAIN_AUX_A) |			\
+	BIT_ULL(POWER_DOMAIN_GMBUS) |			\
 	BIT_ULL(POWER_DOMAIN_INIT))
 
 #define CNL_DISPLAY_POWERWELL_2_POWER_DOMAINS (		\

From eda41bdc571e5c51d817c2e8b4578d34a9e383f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
Date: Mon, 13 Nov 2017 15:36:22 +0200
Subject: [PATCH 553/808] drm/i915: Put all non-blocking modesets onto an
 ordered wq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have plenty of global registers and whatnot programmed without
any further locking by the modeset code. Currently non-bocking
modesets are allowed to execute in parallel which could corrupt
said registers.

To avoid the problem let's run all non-blocking modesets on an
ordered workqueue. We still put page flips etc. to system_unbound_wq
allowing page flips on one pipe to execute in parallel with page flips
or a modeset on a another pipe (assuming no known state is shared
between them, at which point they would have been added to the same
atomic commit and serialized that way).

Blocking modesets are already serialized with each other by
connection_mutex, and thus are safe. To serialize them with
non-blocking modesets we just flush the workqueue before executing
blocking modesets.

Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Fixes: 94f050246b42 ("drm/i915: nonblocking commit")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171113133622.8593-1-ville.syrjala@linux.intel.com
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
(cherry picked from commit 757fffcfdffb6c0dd46c1b264091c36b4e5a86ae)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  3 +++
 drivers/gpu/drm/i915/intel_display.c | 14 +++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 54b5d4c582b6..e143004e66d5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2368,6 +2368,9 @@ struct drm_i915_private {
 	 */
 	struct workqueue_struct *wq;
 
+	/* ordered wq for modesets */
+	struct workqueue_struct *modeset_wq;
+
 	/* Display functions */
 	struct drm_i915_display_funcs display;
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 30cf273d57aa..123585eeb87d 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -12544,11 +12544,15 @@ static int intel_atomic_commit(struct drm_device *dev,
 	INIT_WORK(&state->commit_work, intel_atomic_commit_work);
 
 	i915_sw_fence_commit(&intel_state->commit_ready);
-	if (nonblock)
+	if (nonblock && intel_state->modeset) {
+		queue_work(dev_priv->modeset_wq, &state->commit_work);
+	} else if (nonblock) {
 		queue_work(system_unbound_wq, &state->commit_work);
-	else
+	} else {
+		if (intel_state->modeset)
+			flush_workqueue(dev_priv->modeset_wq);
 		intel_atomic_commit_tail(state);
-
+	}
 
 	return 0;
 }
@@ -14462,6 +14466,8 @@ int intel_modeset_init(struct drm_device *dev)
 	enum pipe pipe;
 	struct intel_crtc *crtc;
 
+	dev_priv->modeset_wq = alloc_ordered_workqueue("i915_modeset", 0);
+
 	drm_mode_config_init(dev);
 
 	dev->mode_config.min_width = 0;
@@ -15270,6 +15276,8 @@ void intel_modeset_cleanup(struct drm_device *dev)
 	intel_cleanup_gt_powersave(dev_priv);
 
 	intel_teardown_gmbus(dev_priv);
+
+	destroy_workqueue(dev_priv->modeset_wq);
 }
 
 void intel_connector_attach_encoder(struct intel_connector *connector,

From c1f08c419764439bfa2d3f33d2fdef9d7013fc47 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 3 Dec 2017 15:36:20 -0800
Subject: [PATCH 554/808] documentation/gpu/i915: fix docs build error after
 file rename

Fix documentation build errors after intel_guc_loader.c was
renamed to intel_guc_fw.c.

Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
WARNING: kernel-doc '../scripts/kernel-doc -rst -enable-lineno -function GuC-specific firmware loader ../drivers/gpu/drm/i915/intel_guc_loader.c' failed with return code 1
Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
Error: Cannot open file ../drivers/gpu/drm/i915/intel_guc_loader.c
WARNING: kernel-doc '../scripts/kernel-doc -rst -enable-lineno -internal ../drivers/gpu/drm/i915/intel_guc_loader.c' failed with return code 2

Fixes: e8668bbcb0f9 ("drm/i915/guc: Rename intel_guc_loader.c to intel_guc_fw.c")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1b214f53-47f5-bef3-f58e-8136de5678ed@infradead.org
(cherry picked from commit 006c23327f8de8575508c458131b304188d426f7)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 Documentation/gpu/i915.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index 2e7ee0313c1c..e21698e16534 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -341,10 +341,10 @@ GuC
 GuC-specific firmware loader
 ----------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
+.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :doc: GuC-specific firmware loader
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
+.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :internal:
 
 GuC-based command submission

From df29c9db8ace4497a61f3b3d33c2b8a7fd4b7b8e Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Mon, 4 Dec 2017 14:32:46 +0100
Subject: [PATCH 555/808] omapdrm/dss/hdmi4_cec: fix interrupt handling

The omap4 CEC hardware cannot tell a Nack from a Low Drive from an
Arbitration Lost error, so just report a Nack, which is almost
certainly the reason for the error anyway.

This also simplifies the implementation. The only three interrupts
that need to be enabled are:

Transmit Buffer Full/Empty Change event: triggered when the
transmit finished successfully and cleared the buffer.

Receiver FIFO Not Empty event: triggered when a message was received.

Frame Retransmit Count Exceeded event: triggered when a transmit
failed repeatedly, usually due to the message being Nacked. Other
reasons are possible (Low Drive, Arbitration Lost) but there is no
way to know. If this happens the TX buffer needs to be cleared
manually.

While testing various error conditions I noticed that the hardware
can receive messages up to 18 bytes in total, which exceeds the legal
maximum of 16. This could cause a buffer overflow, so we check for
this and constrain the size to 16 bytes.

The old incorrect interrupt handler could cause the CEC framework to
enter into a bad state because it mis-detected the "Start Bit Irregularity
event" as an ARB_LOST transmit error when it actually is a receive error
which should be ignored.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reported-by: Henrik Austad <haustad@cisco.com>
Tested-by: Henrik Austad <haustad@cisco.com>
Tested-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
---
 drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c | 46 +++++--------------------
 1 file changed, 9 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c b/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
index e626eddf24d5..23db74ae1826 100644
--- a/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
+++ b/drivers/gpu/drm/omapdrm/dss/hdmi4_cec.c
@@ -78,6 +78,8 @@ static void hdmi_cec_received_msg(struct hdmi_core_data *core)
 
 			/* then read the message */
 			msg.len = cnt & 0xf;
+			if (msg.len > CEC_MAX_MSG_SIZE - 2)
+				msg.len = CEC_MAX_MSG_SIZE - 2;
 			msg.msg[0] = hdmi_read_reg(core->base,
 						   HDMI_CEC_RX_CMD_HEADER);
 			msg.msg[1] = hdmi_read_reg(core->base,
@@ -104,26 +106,6 @@ static void hdmi_cec_received_msg(struct hdmi_core_data *core)
 	}
 }
 
-static void hdmi_cec_transmit_fifo_empty(struct hdmi_core_data *core, u32 stat1)
-{
-	if (stat1 & 2) {
-		u32 dbg3 = hdmi_read_reg(core->base, HDMI_CEC_DBG_3);
-
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_NACK |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, (dbg3 >> 4) & 7, 0, 0);
-	} else if (stat1 & 1) {
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_ARB_LOST |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, 0, 0, 0);
-	} else if (stat1 == 0) {
-		cec_transmit_done(core->adap, CEC_TX_STATUS_OK,
-				  0, 0, 0, 0);
-	}
-}
-
 void hdmi4_cec_irq(struct hdmi_core_data *core)
 {
 	u32 stat0 = hdmi_read_reg(core->base, HDMI_CEC_INT_STATUS_0);
@@ -132,27 +114,21 @@ void hdmi4_cec_irq(struct hdmi_core_data *core)
 	hdmi_write_reg(core->base, HDMI_CEC_INT_STATUS_0, stat0);
 	hdmi_write_reg(core->base, HDMI_CEC_INT_STATUS_1, stat1);
 
-	if (stat0 & 0x40)
+	if (stat0 & 0x20) {
+		cec_transmit_done(core->adap, CEC_TX_STATUS_OK,
+				  0, 0, 0, 0);
 		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
-	else if (stat0 & 0x24)
-		hdmi_cec_transmit_fifo_empty(core, stat1);
-	if (stat1 & 2) {
+	} else if (stat1 & 0x02) {
 		u32 dbg3 = hdmi_read_reg(core->base, HDMI_CEC_DBG_3);
 
 		cec_transmit_done(core->adap,
 				  CEC_TX_STATUS_NACK |
 				  CEC_TX_STATUS_MAX_RETRIES,
 				  0, (dbg3 >> 4) & 7, 0, 0);
-	} else if (stat1 & 1) {
-		cec_transmit_done(core->adap,
-				  CEC_TX_STATUS_ARB_LOST |
-				  CEC_TX_STATUS_MAX_RETRIES,
-				  0, 0, 0, 0);
+		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
 	}
 	if (stat0 & 0x02)
 		hdmi_cec_received_msg(core);
-	if (stat1 & 0x3)
-		REG_FLD_MOD(core->base, HDMI_CEC_DBG_3, 0x1, 7, 7);
 }
 
 static bool hdmi_cec_clear_tx_fifo(struct cec_adapter *adap)
@@ -231,18 +207,14 @@ static int hdmi_cec_adap_enable(struct cec_adapter *adap, bool enable)
 	/*
 	 * Enable CEC interrupts:
 	 * Transmit Buffer Full/Empty Change event
-	 * Transmitter FIFO Empty event
 	 * Receiver FIFO Not Empty event
 	 */
-	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_0, 0x26);
+	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_0, 0x22);
 	/*
 	 * Enable CEC interrupts:
-	 * RX FIFO Overrun Error event
-	 * Short Pulse Detected event
 	 * Frame Retransmit Count Exceeded event
-	 * Start Bit Irregularity event
 	 */
-	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_1, 0x0f);
+	hdmi_write_reg(core->base, HDMI_CEC_INT_ENABLE_1, 0x02);
 
 	/* cec calibration enable (self clearing) */
 	hdmi_write_reg(core->base, HDMI_CEC_SETUP, 0x03);

From 8a9bd4f8ebc6800bfc0596e28631ff6809a2f615 Mon Sep 17 00:00:00 2001
From: Stefan Haberland <sth@linux.vnet.ibm.com>
Date: Wed, 6 Dec 2017 10:30:39 +0100
Subject: [PATCH 556/808] s390/dasd: fix wrongly assigned configuration data

We store per path and per device configuration data to identify the
path or device correctly. The per path configuration data might get
mixed up if the original request gets into error recovery and is
started with a random path mask.

This would lead to a wrong identification of a path in case of a CUIR
event for example.

Fix by copying the path mask from the original request to the error
recovery request in case it is a path verification request.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd_3990_erp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index c94b606e0df8..ee14d8e45c97 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -2803,6 +2803,16 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
 		erp = dasd_3990_erp_handle_match_erp(cqr, erp);
 	}
 
+
+	/*
+	 * For path verification work we need to stick with the path that was
+	 * originally chosen so that the per path configuration data is
+	 * assigned correctly.
+	 */
+	if (test_bit(DASD_CQR_VERIFY_PATH, &erp->flags) && cqr->lpm) {
+		erp->lpm = cqr->lpm;
+	}
+
 	if (device->features & DASD_FEATURE_ERPLOG) {
 		/* print current erp_chain */
 		dev_err(&device->cdev->dev,

From fe08f34d066f4404934a509b6806db1a4f700c86 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 1 Jan 2018 09:50:50 +0100
Subject: [PATCH 557/808] ALSA: pcm: Remove incorrect snd_BUG_ON() usages

syzkaller triggered kernel warnings through PCM OSS emulation at
closing a stream:
  WARNING: CPU: 0 PID: 3502 at sound/core/pcm_lib.c:1635
  snd_pcm_hw_param_first+0x289/0x690 sound/core/pcm_lib.c:1635
  Call Trace:
  ....
   snd_pcm_hw_param_near.constprop.27+0x78d/0x9a0 sound/core/oss/pcm_oss.c:457
   snd_pcm_oss_change_params+0x17d3/0x3720 sound/core/oss/pcm_oss.c:969
   snd_pcm_oss_make_ready+0xaa/0x130 sound/core/oss/pcm_oss.c:1128
   snd_pcm_oss_sync+0x257/0x830 sound/core/oss/pcm_oss.c:1638
   snd_pcm_oss_release+0x20b/0x280 sound/core/oss/pcm_oss.c:2431
   __fput+0x327/0x7e0 fs/file_table.c:210
   ....

This happens while it tries to open and set up the aloop device
concurrently.  The warning above (invoked from snd_BUG_ON() macro) is
to detect the unexpected logical error where snd_pcm_hw_refine() call
shouldn't fail.  The theory is true for the case where the hw_params
config rules are static.  But for an aloop device, the hw_params rule
condition does vary dynamically depending on the connected target;
when another device is opened and changes the parameters, the device
connected in another side is also affected, and it caused the error
from snd_pcm_hw_refine().

That is, the simplest "solution" for this is to remove the incorrect
assumption of static rules, and treat such an error as a normal error
path.  As there are a couple of other places using snd_BUG_ON()
incorrectly, this patch removes these spurious snd_BUG_ON() calls.

Reported-by: syzbot+6f11c7e2a1b91d466432@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/oss/pcm_oss.c | 1 -
 sound/core/pcm_lib.c     | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index e49f448ee04f..ceaa51f76591 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -455,7 +455,6 @@ static int snd_pcm_hw_param_near(struct snd_pcm_substream *pcm,
 		v = snd_pcm_hw_param_last(pcm, params, var, dir);
 	else
 		v = snd_pcm_hw_param_first(pcm, params, var, dir);
-	snd_BUG_ON(v < 0);
 	return v;
 }
 
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 10e7ef7a8804..db7894bb028c 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1632,7 +1632,7 @@ int snd_pcm_hw_param_first(struct snd_pcm_substream *pcm,
 		return changed;
 	if (params->rmask) {
 		int err = snd_pcm_hw_refine(pcm, params);
-		if (snd_BUG_ON(err < 0))
+		if (err < 0)
 			return err;
 	}
 	return snd_pcm_hw_param_value(params, var, dir);
@@ -1678,7 +1678,7 @@ int snd_pcm_hw_param_last(struct snd_pcm_substream *pcm,
 		return changed;
 	if (params->rmask) {
 		int err = snd_pcm_hw_refine(pcm, params);
-		if (snd_BUG_ON(err < 0))
+		if (err < 0)
 			return err;
 	}
 	return snd_pcm_hw_param_value(params, var, dir);

From 4aac2caff30fdef1db8403af81e79807811d22ea Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 28 Dec 2017 03:46:48 +0000
Subject: [PATCH 558/808] xen/pvcalls: use GFP_ATOMIC under spin lock

A spin lock is taken here so we should use GFP_ATOMIC.

Fixes: 9774c6cca266 ("xen/pvcalls: implement accept command")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/pvcalls-front.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index d1e1d8d2b9d5..4c789e61554b 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -805,7 +805,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		pvcalls_exit();
 		return ret;
 	}
-	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
+	map2 = kzalloc(sizeof(*map2), GFP_ATOMIC);
 	if (map2 == NULL) {
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);

From af2e01da344e9f90e38d039c39385882d7364c0f Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Tue, 12 Dec 2017 12:38:37 +0100
Subject: [PATCH 559/808] docs: fix, intel_guc_loader.c has been moved to
 intel_guc_fw.c

With commit d9e2e0143c the 'GuC-specific firmware loader' doc
section was removed from intel_guc_loader.c without a
replacement.  So lets remove it from the Kernel-doc::

  .. kernel-doc:: drivers/gpu/drm/i915/intel_guc_loader.c
     :doc: GuC-specific firmware loader

With commit e8668bbcb0 intel_guc_loader.c was renamed to to
intel_guc_fw.c and to name just one, intel_guc_init_hw() was
renamed to intel_guc_fw_upload(). Since we get errors in the
Sphinx build like:

- Error: Cannot open file ./drivers/gpu/drm/i915/intel_guc_loader.c

Change the kernel-doc directive from intel_guc_loader.c to
intel_guc_fw.c

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
[danvet: Rebase onto the partial fix 006c23327f8d
("documentation/gpu/i915: fix docs build error after file rename")]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/1513078717-12373-1-git-send-email-markus.heiser@darmarit.de
(cherry picked from commit 0132a1a5d44d2cd32a249dbe999a88c2134a6bd1)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 Documentation/gpu/i915.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index e21698e16534..e94d3ac2bdd0 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -341,9 +341,6 @@ GuC
 GuC-specific firmware loader
 ----------------------------
 
-.. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
-   :doc: GuC-specific firmware loader
-
 .. kernel-doc:: drivers/gpu/drm/i915/intel_guc_fw.c
    :internal:
 

From 57d72e159b60456c8bb281736c02ddd3164037aa Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Date: Thu, 14 Dec 2017 11:03:01 +0000
Subject: [PATCH 560/808] iommu/arm-smmu-v3: Don't free page table ops twice

Kasan reports a double free when finalise_stage_fn fails: the io_pgtable
ops are freed by arm_smmu_domain_finalise and then again by
arm_smmu_domain_free. Prevent this by leaving pgtbl_ops empty on failure.

Cc: <stable@vger.kernel.org>
Fixes: 48ec83bcbcf5 ("iommu/arm-smmu: Add initial driver support for ARM SMMUv3 devices")
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/arm-smmu-v3.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f122071688fd..db4281d0e269 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1698,13 +1698,15 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
 	domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
 	domain->geometry.aperture_end = (1UL << ias) - 1;
 	domain->geometry.force_aperture = true;
-	smmu_domain->pgtbl_ops = pgtbl_ops;
 
 	ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
-	if (ret < 0)
+	if (ret < 0) {
 		free_io_pgtable_ops(pgtbl_ops);
+		return ret;
+	}
 
-	return ret;
+	smmu_domain->pgtbl_ops = pgtbl_ops;
+	return 0;
 }
 
 static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)

From 563b5cbe334e9503ab2b234e279d500fc4f76018 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 2 Jan 2018 12:33:14 +0000
Subject: [PATCH 561/808] iommu/arm-smmu-v3: Cope with duplicated Stream IDs

For PCI devices behind an aliasing PCIe-to-PCI/X bridge, the bridge
alias to DevFn 0.0 on the subordinate bus may match the original RID of
the device, resulting in the same SID being present in the device's
fwspec twice. This causes trouble later in arm_smmu_write_strtab_ent()
when we wind up visiting the STE a second time and find it already live.

Avoid the issue by giving arm_smmu_install_ste_for_dev() the cleverness
to skip over duplicates. It seems mildly counterintuitive compared to
preventing the duplicates from existing in the first place, but since
the DT and ACPI probe paths build their fwspecs differently, this is
actually the cleanest and most self-contained way to deal with it.

Cc: <stable@vger.kernel.org>
Fixes: 8f78515425da ("iommu/arm-smmu: Implement of_xlate() for SMMUv3")
Reported-by: Tomasz Nowicki <tomasz.nowicki@caviumnetworks.com>
Tested-by: Tomasz Nowicki <Tomasz.Nowicki@cavium.com>
Tested-by: Jayachandran C. <jnair@caviumnetworks.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/arm-smmu-v3.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index db4281d0e269..744592d330ca 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1733,7 +1733,7 @@ static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
 
 static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
 {
-	int i;
+	int i, j;
 	struct arm_smmu_master_data *master = fwspec->iommu_priv;
 	struct arm_smmu_device *smmu = master->smmu;
 
@@ -1741,6 +1741,13 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
 		u32 sid = fwspec->ids[i];
 		__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
 
+		/* Bridged PCI devices may end up with duplicated IDs */
+		for (j = 0; j < i; j++)
+			if (fwspec->ids[j] == sid)
+				break;
+		if (j < i)
+			continue;
+
 		arm_smmu_write_strtab_ent(smmu, sid, step, &master->ste);
 	}
 }

From 55a5ec9b77106ffc05e8c40d7568432bf4696d7b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 2 Jan 2018 11:45:07 -0500
Subject: [PATCH 562/808] Revert "net: core: dev_get_valid_name is now the same
 as dev_alloc_name_ns"

This reverts commit 87c320e51519a83c496ab7bfb4e96c8f9c001e89.

Changing the error return code in some situations turns out to
be harmful in practice.  In particular Michael Ellerman reports
that DHCP fails on his powerpc machines, and this revert gets
things working again.

Johannes Berg agrees that this revert is the best course of
action for now.

Fixes: 029b6d140550 ("Revert "net: core: maybe return -EEXIST in __dev_alloc_name"")
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 01ee854454a8..0e0ba36eeac9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1146,7 +1146,19 @@ EXPORT_SYMBOL(dev_alloc_name);
 int dev_get_valid_name(struct net *net, struct net_device *dev,
 		       const char *name)
 {
-	return dev_alloc_name_ns(net, dev, name);
+	BUG_ON(!net);
+
+	if (!dev_valid_name(name))
+		return -EINVAL;
+
+	if (strchr(name, '%'))
+		return dev_alloc_name_ns(net, dev, name);
+	else if (__dev_get_by_name(net, name))
+		return -EEXIST;
+	else if (dev->name != name)
+		strlcpy(dev->name, name, IFNAMSIZ);
+
+	return 0;
 }
 EXPORT_SYMBOL(dev_get_valid_name);
 

From beed9263f4000c48a5c48912f26576f6fa091181 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 13:50:07 +0200
Subject: [PATCH 563/808] btrfs: Fix flush bio leak

Commit e0ae99941423 ("btrfs: preallocate device flush bio") reworked
the way the flush bio is allocated and used. Concretely it allocates
the bio in __alloc_device and then re-uses it multiple times with a
very simple endio routine that just calls complete() without consuming
a reference. Allocated bios by default come with a ref count of 1,
which is then consumed by the endio routine (or not, in which case they
should be bio_put by the caller). The way the impleementation works now
is that the flush bio has a refcount of 2 and we only ever bio_put it
once, leaving it to hang indefinitely. Fix this by removing the extra
bio_get in __alloc_device.

Fixes: e0ae99941423 ("btrfs: preallocate device flush bio")
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d48b24e54366..94d28f549837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -237,7 +237,6 @@ static struct btrfs_device *__alloc_device(void)
 		kfree(dev);
 		return ERR_PTR(-ENOMEM);
 	}
-	bio_get(dev->flush_bio);
 
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);

From ec35e48b286959991cdbb886f1bdeda4575c80b4 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Fri, 15 Dec 2017 11:58:27 -0800
Subject: [PATCH 564/808] btrfs: fix refcount_t usage when deleting
 btrfs_delayed_nodes

refcounts have a generic implementation and an asm optimized one.  The
generic version has extra debugging to make sure that once a refcount
goes to zero, refcount_inc won't increase it.

The btrfs delayed inode code wasn't expecting this, and we're tripping
over the warnings when the generic refcounts are used.  We ended up with
this race:

Process A                                         Process B
                                                  btrfs_get_delayed_node()
						  spin_lock(root->inode_lock)
						  radix_tree_lookup()
__btrfs_release_delayed_node()
refcount_dec_and_test(&delayed_node->refs)
our refcount is now zero
						  refcount_add(2) <---
						  warning here, refcount
                                                  unchanged

spin_lock(root->inode_lock)
radix_tree_delete()

With the generic refcounts, we actually warn again when process B above
tries to release his refcount because refcount_add() turned into a
no-op.

We saw this in production on older kernels without the asm optimized
refcounts.

The fix used here is to use refcount_inc_not_zero() to detect when the
object is in the middle of being freed and return NULL.  This is almost
always the right answer anyway, since we usually end up pitching the
delayed_node if it didn't have fresh data in it.

This also changes __btrfs_release_delayed_node() to remove the extra
check for zero refcounts before radix tree deletion.
btrfs_get_delayed_node() was the only path that was allowing refcounts
to go from zero to one.

Fixes: 6de5f18e7b0da ("btrfs: fix refcount_t usage when deleting btrfs_delayed_node")
CC: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 45 ++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d73f79ded8b..056276101c63 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
 	spin_lock(&root->inode_lock);
 	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
 	if (node) {
 		if (btrfs_inode->delayed_node) {
 			refcount_inc(&node->refs);	/* can be accessed */
@@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 			spin_unlock(&root->inode_lock);
 			return node;
 		}
-		btrfs_inode->delayed_node = node;
-		/* can be accessed and cached in the inode */
-		refcount_add(2, &node->refs);
+
+		/*
+		 * It's possible that we're racing into the middle of removing
+		 * this node from the radix tree.  In this case, the refcount
+		 * was zero and it should never go back to one.  Just return
+		 * NULL like it was never in the radix at all; our release
+		 * function is in the process of removing it.
+		 *
+		 * Some implementations of refcount_inc refuse to bump the
+		 * refcount once it has hit zero.  If we don't do this dance
+		 * here, refcount_inc() may decide to just WARN_ONCE() instead
+		 * of actually bumping the refcount.
+		 *
+		 * If this node is properly in the radix, we want to bump the
+		 * refcount twice, once for the inode and once for this get
+		 * operation.
+		 */
+		if (refcount_inc_not_zero(&node->refs)) {
+			refcount_inc(&node->refs);
+			btrfs_inode->delayed_node = node;
+		} else {
+			node = NULL;
+		}
+
 		spin_unlock(&root->inode_lock);
 		return node;
 	}
@@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node(
 	mutex_unlock(&delayed_node->mutex);
 
 	if (refcount_dec_and_test(&delayed_node->refs)) {
-		bool free = false;
 		struct btrfs_root *root = delayed_node->root;
+
 		spin_lock(&root->inode_lock);
-		if (refcount_read(&delayed_node->refs) == 0) {
-			radix_tree_delete(&root->delayed_nodes_tree,
-					  delayed_node->inode_id);
-			free = true;
-		}
+		/*
+		 * Once our refcount goes to zero, nobody is allowed to bump it
+		 * back up.  We can delete it now.
+		 */
+		ASSERT(refcount_read(&delayed_node->refs) == 0);
+		radix_tree_delete(&root->delayed_nodes_tree,
+				  delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
-		if (free)
-			kmem_cache_free(delayed_node_cache, delayed_node);
+		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }
 

From 23263ec86a5f44312d2899323872468752324107 Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Mon, 25 Dec 2017 10:43:49 +0800
Subject: [PATCH 565/808] ip6_tunnel: disable dst caching if tunnel is
 dual-stack

When an ip6_tunnel is in mode 'any', where the transport layer
protocol can be either 4 or 41, dst_cache must be disabled.

This is because xfrm policies might apply to only one of the two
protocols. Caching dst would cause xfrm policies for one protocol
incorrectly used for the other.

Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 931c38f6ff4a..b263c809d8d4 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1074,10 +1074,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 			memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
 			neigh_release(neigh);
 		}
-	} else if (!(t->parms.flags &
-		     (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
-		/* enable the cache only only if the routing decision does
-		 * not depend on the current inner header value
+	} else if (t->parms.proto != 0 && !(t->parms.flags &
+					    (IP6_TNL_F_USE_ORIG_TCLASS |
+					     IP6_TNL_F_USE_ORIG_FWMARK))) {
+		/* enable the cache only if neither the outer protocol nor the
+		 * routing decision depends on the current inner header value
 		 */
 		use_cache = true;
 	}

From 52a589d51f1008f62569bf89e95b26221ee76690 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 25 Dec 2017 14:43:58 +0800
Subject: [PATCH 566/808] geneve: update skb dst pmtu on tx path

Commit a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") has fixed
a performance issue caused by the change of lower dev's mtu for vxlan.

The same thing needs to be done for geneve as well.

Note that geneve cannot adjust it's mtu according to lower dev's mtu
when creating it. The performance is very low later when netperfing
over it without fixing the mtu manually. This patch could also avoid
this issue.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index b718a02a6bb6..0a48b3073d3d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -825,6 +825,13 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
+	if (skb_dst(skb)) {
+		int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) -
+			  GENEVE_BASE_HLEN - info->options_len - 14;
+
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+	}
+
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->collect_md) {
 		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
@@ -864,6 +871,13 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
 
+	if (skb_dst(skb)) {
+		int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) -
+			  GENEVE_BASE_HLEN - info->options_len - 14;
+
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+	}
+
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	if (geneve->collect_md) {
 		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);

From 2fa771be953a17f8e0a9c39103464c2574444c62 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 25 Dec 2017 14:45:12 +0800
Subject: [PATCH 567/808] ip6_tunnel: allow ip6gre dev mtu to be set below 1280

Commit 582442d6d5bc ("ipv6: Allow the MTU of ipip6 tunnel to be set
below 1280") fixed a mtu setting issue. It works for ipip6 tunnel.

But ip6gre dev updates the mtu also with ip6_tnl_change_mtu. Since
the inner packet over ip6gre can be ipv4 and it's mtu should also
be allowed to set below 1280, the same issue also exists on ip6gre.

This patch is to fix it by simply changing to check if parms.proto
is IPPROTO_IPV6 in ip6_tnl_change_mtu instead, to make ip6gre to
go to 'else' branch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b263c809d8d4..9a7cf355bc8c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1677,11 +1677,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct ip6_tnl *tnl = netdev_priv(dev);
 
-	if (tnl->parms.proto == IPPROTO_IPIP) {
-		if (new_mtu < ETH_MIN_MTU)
+	if (tnl->parms.proto == IPPROTO_IPV6) {
+		if (new_mtu < IPV6_MIN_MTU)
 			return -EINVAL;
 	} else {
-		if (new_mtu < IPV6_MIN_MTU)
+		if (new_mtu < ETH_MIN_MTU)
 			return -EINVAL;
 	}
 	if (new_mtu > 0xFFF8 - dev->hard_header_len)

From 8764a8267b128405cf383157d5e9a4a3735d2409 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 25 Dec 2017 08:57:35 +0100
Subject: [PATCH 568/808] mlxsw: spectrum_router: Fix NULL pointer deref

When we remove the neighbour associated with a nexthop we should always
refuse to write the nexthop to the adjacency table. Regardless if it is
already present in the table or not.

Otherwise, we risk dereferencing the NULL pointer that was set instead
of the neighbour.

Fixes: a7ff87acd995 ("mlxsw: spectrum_router: Implement next-hop routing")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index be657b8533f0..434b3922b34f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3228,7 +3228,7 @@ static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh,
 {
 	if (!removing)
 		nh->should_offload = 1;
-	else if (nh->offloaded)
+	else
 		nh->should_offload = 0;
 	nh->update = 1;
 }

From 90045fc9c78855bdc625a0ab185d97b72a937613 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 25 Dec 2017 09:05:33 +0100
Subject: [PATCH 569/808] mlxsw: spectrum: Relax sanity checks during
 enslavement

Since commit 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that
have uppers") the driver forbids enslavement to netdevs that already
have uppers of their own, as this can result in various ordering
problems.

This requirement proved to be too strict for some users who need to be
able to enslave ports to a bridge that already has uppers. In this case,
we can allow the enslavement if the bridge is already known to us, as
any configuration performed on top of the bridge was already reflected
to the device.

Fixes: 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
Tested-by: Alexander Petrovskiy <alexpe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c        | 11 +++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h        |  2 ++
 .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c  |  6 ++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 9bd8d28de152..c3837ca7a705 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4376,7 +4376,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
 		}
 		if (!info->linking)
 			break;
-		if (netdev_has_any_upper_dev(upper_dev)) {
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
 			NL_SET_ERR_MSG(extack,
 				       "spectrum: Enslaving a port to a device that already has an upper device is not supported");
 			return -EINVAL;
@@ -4504,6 +4507,7 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 					      u16 vid)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct netdev_notifier_changeupper_info *info = ptr;
 	struct netlink_ext_ack *extack;
 	struct net_device *upper_dev;
@@ -4520,7 +4524,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 		}
 		if (!info->linking)
 			break;
-		if (netdev_has_any_upper_dev(upper_dev)) {
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
 			NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported");
 			return -EINVAL;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 432ab9b12b7f..05ce1befd9b3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -365,6 +365,8 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
 void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
 				struct net_device *brport_dev,
 				struct net_device *br_dev);
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev);
 
 /* spectrum.c */
 int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 7b8548e25ae7..593ad31be749 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -152,6 +152,12 @@ mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
 	return NULL;
 }
 
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev)
+{
+	return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
+}
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
 			      struct net_device *br_dev)

From 02a0d9216d4daf6a58d88642bd2da2c78c327552 Mon Sep 17 00:00:00 2001
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Date: Tue, 2 Jan 2018 09:39:25 -0800
Subject: [PATCH 570/808] Input: xen-kbdfront - do not advertise multi-touch
 pressure support

Some user-space applications expect multi-touch pressure
on contact to be reported if it is advertised in device
properties. Otherwise, such applications may treat reports
not as actual touches, but hovering. Currently this is
only advertised, but not reported.
Fix this by not advertising that ABS_MT_PRESSURE is supported.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Signed-off-by: Andrii Chepurnyi <andrii_chepurnyi@epam.com>
Patchwork-Id: 10140017
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/misc/xen-kbdfront.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c
index 6bf56bb5f8d9..d91f3b1c5375 100644
--- a/drivers/input/misc/xen-kbdfront.c
+++ b/drivers/input/misc/xen-kbdfront.c
@@ -326,8 +326,6 @@ static int xenkbd_probe(struct xenbus_device *dev,
 				     0, width, 0, 0);
 		input_set_abs_params(mtouch, ABS_MT_POSITION_Y,
 				     0, height, 0, 0);
-		input_set_abs_params(mtouch, ABS_MT_PRESSURE,
-				     0, 255, 0, 0);
 
 		ret = input_mt_init_slots(mtouch, num_cont, INPUT_MT_DIRECT);
 		if (ret) {

From 5a371cf87e145b86efd32007e46146e78c1eff6d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Sun, 31 Dec 2017 15:33:14 +0200
Subject: [PATCH 571/808] IB/mlx4: Fix mlx4_ib_alloc_mr error flow

ibmr.device is being set only after ib_alloc_mr() is successfully complete.
Therefore, in case imlx4_mr_enable() returns with error, the error flow
unwinder calls to mlx4_free_priv_pages(), which uses ibmr.device.

Such usage causes to NULL dereference oops and to fix it, the IB device
should be set in the mr struct earlier stage (e.g. prior to calling
mlx4_free_priv_pages()).

Fixes: 1b2cd0fc673c ("IB/mlx4: Support the new memory registration API")
Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx4/mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 313bfb9ccb71..4975f3e6596e 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -642,7 +642,6 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
 		goto err_free_mr;
 
 	mr->max_pages = max_num_sg;
-
 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
 	if (err)
 		goto err_free_pl;
@@ -653,6 +652,7 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
 	return &mr->ibmr;
 
 err_free_pl:
+	mr->ibmr.device = pd->device;
 	mlx4_free_priv_pages(mr);
 err_free_mr:
 	(void) mlx4_mr_free(dev->dev, &mr->mmr);

From 16ba3defb8bd01a9464ba4820a487f5b196b455b Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@mellanox.com>
Date: Sun, 31 Dec 2017 15:33:15 +0200
Subject: [PATCH 572/808] IB/ipoib: Fix race condition in neigh creation

When using enhanced mode for IPoIB, two threads may execute xmit in
parallel to two different TX queues while the target is the same.
In this case, both of them will add the same neighbor to the path's
neigh link list and we might see the following message:

  list_add double add: new=ffff88024767a348, prev=ffff88024767a348...
  WARNING: lib/list_debug.c:31__list_add_valid+0x4e/0x70
  ipoib_start_xmit+0x477/0x680 [ib_ipoib]
  dev_hard_start_xmit+0xb9/0x3e0
  sch_direct_xmit+0xf9/0x250
  __qdisc_run+0x176/0x5d0
  __dev_queue_xmit+0x1f5/0xb10
  __dev_queue_xmit+0x55/0xb10

Analysis:
Two SKB are scheduled to be transmitted from two cores.
In ipoib_start_xmit, both gets NULL when calling ipoib_neigh_get.
Two calls to neigh_add_path are made. One thread takes the spin-lock
and calls ipoib_neigh_alloc which creates the neigh structure,
then (after the __path_find) the neigh is added to the path's neigh
link list. When the second thread enters the critical section it also
calls ipoib_neigh_alloc but in this case it gets the already allocated
ipoib_neigh structure, which is already linked to the path's neigh
link list and adds it again to the list. Which beside of triggering
the list, it creates a loop in the linked list. This loop leads to
endless loop inside path_rec_completion.

Solution:
Check list_empty(&neigh->list) before adding to the list.
Add a similar fix in "ipoib_multicast.c::ipoib_mcast_send"

Fixes: b63b70d87741 ('IPoIB: Use a private hash table for path lookup in xmit path')
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c     | 25 +++++++++++++------
 .../infiniband/ulp/ipoib/ipoib_multicast.c    |  5 +++-
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 12b7f911f0e5..8880351df179 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -902,8 +902,8 @@ static int path_rec_start(struct net_device *dev,
 	return 0;
 }
 
-static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
-			   struct net_device *dev)
+static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr,
+					  struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 	struct rdma_netdev *rn = netdev_priv(dev);
@@ -917,7 +917,15 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 		spin_unlock_irqrestore(&priv->lock, flags);
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
-		return;
+		return NULL;
+	}
+
+	/* To avoid race condition, make sure that the
+	 * neigh will be added only once.
+	 */
+	if (unlikely(!list_empty(&neigh->list))) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		return neigh;
 	}
 
 	path = __path_find(dev, daddr + 4);
@@ -956,7 +964,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 			path->ah->last_send = rn->send(dev, skb, path->ah->ah,
 						       IPOIB_QPN(daddr));
 			ipoib_neigh_put(neigh);
-			return;
+			return NULL;
 		}
 	} else {
 		neigh->ah  = NULL;
@@ -973,7 +981,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 	ipoib_neigh_put(neigh);
-	return;
+	return NULL;
 
 err_path:
 	ipoib_neigh_free(neigh);
@@ -983,6 +991,8 @@ err_drop:
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 	ipoib_neigh_put(neigh);
+
+	return NULL;
 }
 
 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
@@ -1091,8 +1101,9 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	case htons(ETH_P_TIPC):
 		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 		if (unlikely(!neigh)) {
-			neigh_add_path(skb, phdr->hwaddr, dev);
-			return NETDEV_TX_OK;
+			neigh = neigh_add_path(skb, phdr->hwaddr, dev);
+			if (likely(!neigh))
+				return NETDEV_TX_OK;
 		}
 		break;
 	case htons(ETH_P_ARP):
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 93e149efc1f5..9b3f47ae2016 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -816,7 +816,10 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		spin_lock_irqsave(&priv->lock, flags);
 		if (!neigh) {
 			neigh = ipoib_neigh_alloc(daddr, dev);
-			if (neigh) {
+			/* Make sure that the neigh will be added only
+			 * once to mcast list.
+			 */
+			if (neigh && list_empty(&neigh->list)) {
 				kref_get(&mcast->ah->ref);
 				neigh->ah	= mcast->ah;
 				list_add_tail(&neigh->list, &mcast->neigh_list);

From 2196881566225f3c3428d1a5f847a992944daa5b Mon Sep 17 00:00:00 2001
From: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Date: Thu, 21 Dec 2017 13:18:26 -0800
Subject: [PATCH 573/808] xfs: quota: fix missed destroy of qi_tree_lock

xfs_qm_destroy_quotainfo() does not destroy quotainfo->qi_tree_lock
while destroys quotainfo->qi_quotaofflock.

Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ec952dfad359..d0053115427f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -736,6 +736,7 @@ xfs_qm_destroy_quotainfo(
 		IRELE(qi->qi_pquotaip);
 		qi->qi_pquotaip = NULL;
 	}
+	mutex_destroy(&qi->qi_tree_lock);
 	mutex_destroy(&qi->qi_quotaofflock);
 	kmem_free(qi);
 	mp->m_quotainfo = NULL;

From 3a3882ff26fbdbaf5f7e13f6a0bccfbf7121041d Mon Sep 17 00:00:00 2001
From: Aliaksei Karaliou <akaraliou.dev@gmail.com>
Date: Thu, 21 Dec 2017 13:18:26 -0800
Subject: [PATCH 574/808] xfs: quota: check result of register_shrinker()

xfs_qm_init_quotainfo() does not check result of register_shrinker()
which was tagged as __must_check recently, reported by sparse.

Signed-off-by: Aliaksei Karaliou <akaraliou.dev@gmail.com>
[darrick: move xfs_qm_destroy_quotainos nearer xfs_qm_init_quotainos]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d0053115427f..b897b11afb2c 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,7 +48,7 @@
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
 
-
+STATIC void	xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi);
 STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 /*
  * We use the batch lookup interface to iterate over the dquots as it
@@ -695,9 +695,17 @@ xfs_qm_init_quotainfo(
 	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
 	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
-	register_shrinker(&qinf->qi_shrinker);
+
+	error = register_shrinker(&qinf->qi_shrinker);
+	if (error)
+		goto out_free_inos;
+
 	return 0;
 
+out_free_inos:
+	mutex_destroy(&qinf->qi_quotaofflock);
+	mutex_destroy(&qinf->qi_tree_lock);
+	xfs_qm_destroy_quotainos(qinf);
 out_free_lru:
 	list_lru_destroy(&qinf->qi_lru);
 out_free_qinf:
@@ -706,7 +714,6 @@ out_free_qinf:
 	return error;
 }
 
-
 /*
  * Gets called when unmounting a filesystem or when all quotas get
  * turned off.
@@ -723,19 +730,7 @@ xfs_qm_destroy_quotainfo(
 
 	unregister_shrinker(&qi->qi_shrinker);
 	list_lru_destroy(&qi->qi_lru);
-
-	if (qi->qi_uquotaip) {
-		IRELE(qi->qi_uquotaip);
-		qi->qi_uquotaip = NULL; /* paranoia */
-	}
-	if (qi->qi_gquotaip) {
-		IRELE(qi->qi_gquotaip);
-		qi->qi_gquotaip = NULL;
-	}
-	if (qi->qi_pquotaip) {
-		IRELE(qi->qi_pquotaip);
-		qi->qi_pquotaip = NULL;
-	}
+	xfs_qm_destroy_quotainos(qi);
 	mutex_destroy(&qi->qi_tree_lock);
 	mutex_destroy(&qi->qi_quotaofflock);
 	kmem_free(qi);
@@ -1600,6 +1595,24 @@ error_rele:
 	return error;
 }
 
+STATIC void
+xfs_qm_destroy_quotainos(
+	xfs_quotainfo_t	*qi)
+{
+	if (qi->qi_uquotaip) {
+		IRELE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL; /* paranoia */
+	}
+	if (qi->qi_gquotaip) {
+		IRELE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	if (qi->qi_pquotaip) {
+		IRELE(qi->qi_pquotaip);
+		qi->qi_pquotaip = NULL;
+	}
+}
+
 STATIC void
 xfs_qm_dqfree_one(
 	struct xfs_dquot	*dqp)

From b4d8ad7fd3a18e6d92d4ebe858185c704604a57d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 22 Dec 2017 13:14:34 -0800
Subject: [PATCH 575/808] xfs: fix s_maxbytes overflow problems

Fix some integer overflow problems if offset + count happen to be large
enough to cause an integer overflow.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c  | 4 ++--
 fs/xfs/xfs_iomap.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 21e2d70884e1..4fc526a27a94 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -399,7 +399,7 @@ xfs_map_blocks(
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 
-	if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - count)
 		count = mp->m_super->s_maxbytes - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1312,7 +1312,7 @@ xfs_get_blocks(
 	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - size)
 		size = mp->m_super->s_maxbytes - offset;
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7ab52a8bc0a9..66e1edbfb2b2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1006,7 +1006,7 @@ xfs_file_iomap_begin(
 	}
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
-	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+	if (offset > mp->m_super->s_maxbytes - length)
 		length = mp->m_super->s_maxbytes - offset;
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);

From 3bb23421a504f01551b7cb9dff0e41dbf16656b0 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 26 Dec 2017 07:48:51 +0200
Subject: [PATCH 576/808] net/sched: Fix update of lastuse in act modules
 implementing stats_update

We need to update lastuse to to the most updated value between what
is already set and the new value.
If HW matching fails, i.e. because of an issue, the stats are not updated
but it could be that software did match and updated lastuse.

Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument")
Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to hardware")
Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_gact.c   | 2 +-
 net/sched/act_mirred.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e29a48ef7fc3..a0ac42b3ed06 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	if (action == TC_ACT_SHOT)
 		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
 
-	tm->lastuse = lastuse;
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8b3e59388480..08b61849c2a2 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	struct tcf_t *tm = &m->tcf_tm;
 
 	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-	tm->lastuse = lastuse;
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
 static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,

From d02fd6e7d2933ede6478a15f9e4ce8a93845824e Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Tue, 26 Dec 2017 21:44:32 +0800
Subject: [PATCH 577/808] macvlan: Fix one possible double free

Because the macvlan_uninit would free the macvlan port, so there is one
double free case in macvlan_common_newlink. When the macvlan port is just
created, then register_netdevice or netdev_upper_dev_link failed and they
would invoke macvlan_uninit. Then it would reach the macvlan_port_destroy
which triggers the double free.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index a178c5efd33e..a0f2be81d52e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1444,9 +1444,14 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	return 0;
 
 unregister_netdev:
+	/* macvlan_uninit would free the macvlan port */
 	unregister_netdevice(dev);
+	return err;
 destroy_macvlan_port:
-	if (create)
+	/* the macvlan port may be freed by macvlan_uninit when fail to register.
+	 * so we destroy the macvlan port only when it's valid.
+	 */
+	if (create && macvlan_port_get_rtnl(dev))
 		macvlan_port_destroy(port->dev);
 	return err;
 }

From ac817f5ad066697e4d4d35ec68c974eba2c5f17a Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Dec 2017 23:15:12 +0000
Subject: [PATCH 578/808] phylink: ensure we report link down when LOS asserted

Although we disable the netdev carrier, we fail to report in the kernel
log that the link went down.  Fix this.

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 827f3f92560e..150cd95a6e1e 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1429,9 +1429,8 @@ static void phylink_sfp_link_down(void *upstream)
 	WARN_ON(!lockdep_rtnl_is_held());
 
 	set_bit(PHYLINK_DISABLE_LINK, &pl->phylink_disable_state);
+	queue_work(system_power_efficient_wq, &pl->resolve);
 	flush_work(&pl->resolve);
-
-	netif_carrier_off(pl->netdev);
 }
 
 static void phylink_sfp_link_up(void *upstream)

From 0b2122e4934c7783d336397864e34ee53aad0965 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Dec 2017 23:15:17 +0000
Subject: [PATCH 579/808] sfp: fix sfp-bus oops when removing socket/upstream

When we remove a socket or upstream, and the other side isn't
registered, we dereference a NULL pointer, causing a kernel oops.
Fix this.

Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 8a1b1f4c1b7c..ab64a142b832 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -356,7 +356,8 @@ EXPORT_SYMBOL_GPL(sfp_register_upstream);
 void sfp_unregister_upstream(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	sfp_unregister_bus(bus);
+	if (bus->sfp)
+		sfp_unregister_bus(bus);
 	bus->upstream = NULL;
 	bus->netdev = NULL;
 	rtnl_unlock();
@@ -459,7 +460,8 @@ EXPORT_SYMBOL_GPL(sfp_register_socket);
 void sfp_unregister_socket(struct sfp_bus *bus)
 {
 	rtnl_lock();
-	sfp_unregister_bus(bus);
+	if (bus->netdev)
+		sfp_unregister_bus(bus);
 	bus->sfp_dev = NULL;
 	bus->sfp = NULL;
 	bus->socket_ops = NULL;

From 0b76aae741abb9d16d2c0e67f8b1e766576f897d Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@oracle.com>
Date: Wed, 6 Dec 2017 02:26:29 +0530
Subject: [PATCH 580/808] e1000: fix disabling already-disabled warning

This patch adds check so that driver does not disable already
disabled device.

[   44.637743] advantechwdt: Unexpected close, not stopping watchdog!
[   44.997548] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input6
[   45.013419] e1000 0000:00:03.0: disabling already-disabled device
[   45.013447] ------------[ cut here ]------------
[   45.014868] WARNING: CPU: 1 PID: 71 at drivers/pci/pci.c:1641 pci_disable_device+0xa1/0x105:
						pci_disable_device at drivers/pci/pci.c:1640
[   45.016171] CPU: 1 PID: 71 Comm: rcu_perf_shutdo Not tainted 4.14.0-01330-g3c07399 #1
[   45.017197] task: ffff88011bee9e40 task.stack: ffffc90000860000
[   45.017987] RIP: 0010:pci_disable_device+0xa1/0x105:
						pci_disable_device at drivers/pci/pci.c:1640
[   45.018603] RSP: 0000:ffffc90000863e30 EFLAGS: 00010286
[   45.019282] RAX: 0000000000000035 RBX: ffff88013a230008 RCX: 0000000000000000
[   45.020182] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000203
[   45.021084] RBP: ffff88013a3f31e8 R08: 0000000000000001 R09: 0000000000000000
[   45.021986] R10: ffffffff827ec29c R11: 0000000000000002 R12: 0000000000000001
[   45.022946] R13: ffff88013a230008 R14: ffff880117802b20 R15: ffffc90000863e8f
[   45.023842] FS:  0000000000000000(0000) GS:ffff88013fd00000(0000) knlGS:0000000000000000
[   45.024863] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   45.025583] CR2: ffffc900006d4000 CR3: 000000000220f000 CR4: 00000000000006a0
[   45.026478] Call Trace:
[   45.026811]  __e1000_shutdown+0x1d4/0x1e2:
						__e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5162
[   45.027344]  ? rcu_perf_cleanup+0x2a1/0x2a1:
						rcu_perf_shutdown at kernel/rcu/rcuperf.c:627
[   45.027883]  e1000_shutdown+0x14/0x3a:
						e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5235
[   45.028351]  device_shutdown+0x110/0x1aa:
						device_shutdown at drivers/base/core.c:2807
[   45.028858]  kernel_power_off+0x31/0x64:
						kernel_power_off at kernel/reboot.c:260
[   45.029343]  rcu_perf_shutdown+0x9b/0xa7:
						rcu_perf_shutdown at kernel/rcu/rcuperf.c:637
[   45.029852]  ? __wake_up_common_lock+0xa2/0xa2:
						autoremove_wake_function at kernel/sched/wait.c:376
[   45.030414]  kthread+0x126/0x12e:
						kthread at kernel/kthread.c:233
[   45.030834]  ? __kthread_bind_mask+0x8e/0x8e:
						kthread at kernel/kthread.c:190
[   45.031399]  ? ret_from_fork+0x1f/0x30:
						ret_from_fork at arch/x86/entry/entry_64.S:443
[   45.031883]  ? kernel_init+0xa/0xf5:
						kernel_init at init/main.c:997
[   45.032325]  ret_from_fork+0x1f/0x30:
						ret_from_fork at arch/x86/entry/entry_64.S:443
[   45.032777] Code: 00 48 85 ed 75 07 48 8b ab a8 00 00 00 48 8d bb 98 00 00 00 e8 aa d1 11 00 48 89 ea 48 89 c6 48 c7 c7 d8 e4 0b 82 e8 55 7d da ff <0f> ff b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 f0 b1 61 82
[   45.035222] ---[ end trace c257137b1b1976ef ]---
[   45.037838] ACPI: Preparing to enter system sleep state S5

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000.h      |  3 ++-
 drivers/net/ethernet/intel/e1000/e1000_main.c | 27 +++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index d7bdea79e9fa..8fd2458060a0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -331,7 +331,8 @@ struct e1000_adapter {
 enum e1000_state_t {
 	__E1000_TESTING,
 	__E1000_RESETTING,
-	__E1000_DOWN
+	__E1000_DOWN,
+	__E1000_DISABLED
 };
 
 #undef pr_fmt
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 1982f7917a8d..3dd4aeb2706d 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -945,7 +945,7 @@ static int e1000_init_hw_struct(struct e1000_adapter *adapter,
 static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct net_device *netdev;
-	struct e1000_adapter *adapter;
+	struct e1000_adapter *adapter = NULL;
 	struct e1000_hw *hw;
 
 	static int cards_found;
@@ -955,6 +955,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	u16 tmp = 0;
 	u16 eeprom_apme_mask = E1000_EEPROM_APME;
 	int bars, need_ioport;
+	bool disable_dev = false;
 
 	/* do not allocate ioport bars when not needed */
 	need_ioport = e1000_is_need_ioport(pdev);
@@ -1259,11 +1260,13 @@ err_mdio_ioremap:
 	iounmap(hw->ce4100_gbe_mdio_base_virt);
 	iounmap(hw->hw_addr);
 err_ioremap:
+	disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags);
 	free_netdev(netdev);
 err_alloc_etherdev:
 	pci_release_selected_regions(pdev, bars);
 err_pci_reg:
-	pci_disable_device(pdev);
+	if (!adapter || disable_dev)
+		pci_disable_device(pdev);
 	return err;
 }
 
@@ -1281,6 +1284,7 @@ static void e1000_remove(struct pci_dev *pdev)
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
+	bool disable_dev;
 
 	e1000_down_and_stop(adapter);
 	e1000_release_manageability(adapter);
@@ -1299,9 +1303,11 @@ static void e1000_remove(struct pci_dev *pdev)
 		iounmap(hw->flash_address);
 	pci_release_selected_regions(pdev, adapter->bars);
 
+	disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags);
 	free_netdev(netdev);
 
-	pci_disable_device(pdev);
+	if (disable_dev)
+		pci_disable_device(pdev);
 }
 
 /**
@@ -5156,7 +5162,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	if (netif_running(netdev))
 		e1000_free_irq(adapter);
 
-	pci_disable_device(pdev);
+	if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags))
+		pci_disable_device(pdev);
 
 	return 0;
 }
@@ -5200,6 +5207,10 @@ static int e1000_resume(struct pci_dev *pdev)
 		pr_err("Cannot enable PCI device from suspend\n");
 		return err;
 	}
+
+	/* flush memory to make sure state is correct */
+	smp_mb__before_atomic();
+	clear_bit(__E1000_DISABLED, &adapter->flags);
 	pci_set_master(pdev);
 
 	pci_enable_wake(pdev, PCI_D3hot, 0);
@@ -5274,7 +5285,9 @@ static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev,
 
 	if (netif_running(netdev))
 		e1000_down(adapter);
-	pci_disable_device(pdev);
+
+	if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags))
+		pci_disable_device(pdev);
 
 	/* Request a slot slot reset. */
 	return PCI_ERS_RESULT_NEED_RESET;
@@ -5302,6 +5315,10 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
 		pr_err("Cannot re-enable PCI device after reset.\n");
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
+
+	/* flush memory to make sure state is correct */
+	smp_mb__before_atomic();
+	clear_bit(__E1000_DISABLED, &adapter->flags);
 	pci_set_master(pdev);
 
 	pci_enable_wake(pdev, PCI_D3hot, 0);

From 4110e02eb45ea447ec6f5459c9934de0a273fb91 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Mon, 11 Dec 2017 16:26:40 +0900
Subject: [PATCH 581/808] e1000e: Fix e1000_check_for_copper_link_ich8lan
 return value.

e1000e_check_for_copper_link() and e1000_check_for_copper_link_ich8lan()
are the two functions that may be assigned to mac.ops.check_for_link when
phy.media_type == e1000_media_type_copper. Commit 19110cfbb34d ("e1000e:
Separate signaling for link check/link up") changed the meaning of the
return value of check_for_link for copper media but only adjusted the first
function. This patch adjusts the second function likewise.

Reported-by: Christian Hesse <list@eworm.de>
Reported-by: Gabriel C <nix.or.die@gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198047
Fixes: 19110cfbb34d ("e1000e: Separate signaling for link check/link up")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Tested-by: Christian Hesse <list@eworm.de>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index d6d4ed7acf03..31277d3bb7dc 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1367,6 +1367,9 @@ out:
  *  Checks to see of the link status of the hardware has changed.  If a
  *  change in link status has been detected, then we read the PHY registers
  *  to get the current speed/duplex if link exists.
+ *
+ *  Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link
+ *  up).
  **/
 static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 {
@@ -1382,7 +1385,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 * Change or Rx Sequence Error interrupt.
 	 */
 	if (!mac->get_link_status)
-		return 0;
+		return 1;
 
 	/* First we want to see if the MII Status Register reports
 	 * link.  If so, then we want to get the current speed/duplex
@@ -1613,10 +1616,12 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw)
 	 * different link partner.
 	 */
 	ret_val = e1000e_config_fc_after_link_up(hw);
-	if (ret_val)
+	if (ret_val) {
 		e_dbg("Error configuring flow control\n");
+		return ret_val;
+	}
 
-	return ret_val;
+	return 1;
 }
 
 static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter)

From bd30ffc414e55194ed6149fad69a145550cb7c18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZ=20Lin=20=28=E6=9E=97=E4=B8=8A=E6=99=BA=29?=
 <sz.lin@moxa.com>
Date: Fri, 29 Dec 2017 17:02:17 +0800
Subject: [PATCH 582/808] NET: usb: qmi_wwan: add support for YUGA CLM920-NC5
 PID 0x9625
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for PID 0x9625 of YUGA CLM920-NC5.

YUGA CLM920-NC5 needs to enable QMI_WWAN_QUIRK_DTR before QMI operation.

qmicli -d /dev/cdc-wdm0 -p --dms-get-revision
[/dev/cdc-wdm0] Device revision retrieved:
        Revision: 'CLM920_NC5-V1  1  [Oct 23 2016 19:00:00]'

Signed-off-by: SZ Lin (林上智) <sz.lin@moxa.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 3000ddd1c7e2..728819feab44 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1100,6 +1100,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x05c6, 0x9084, 4)},
 	{QMI_FIXED_INTF(0x05c6, 0x920d, 0)},
 	{QMI_FIXED_INTF(0x05c6, 0x920d, 5)},
+	{QMI_QUIRK_SET_DTR(0x05c6, 0x9625, 4)},	/* YUGA CLM920-NC5 */
 	{QMI_FIXED_INTF(0x0846, 0x68a2, 8)},
 	{QMI_FIXED_INTF(0x12d1, 0x140c, 1)},	/* Huawei E173 */
 	{QMI_FIXED_INTF(0x12d1, 0x14ac, 1)},	/* Huawei E1820 */

From 807fc072991861ff0cd7ac44267ff1dd76ef316e Mon Sep 17 00:00:00 2001
From: Yue Hin Lau <Yuehin.Lau@amd.com>
Date: Fri, 29 Dec 2017 11:11:18 +0000
Subject: [PATCH 583/808] drm/amd/display: call set csc_default if enable
 adjustment is false

Fixes a greenish tint on RV displays.

Signed-off-by: Yue Hin Lau <Yuehin.Lau@amd.com>
Reviewed-by: Eric Bernstein <Eric.Bernstein@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
[drake@endlessm.com: backport to 4.15]
Signed-off-by: Daniel Drake <drake@endlessm.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h          | 2 +-
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c       | 6 ++----
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 2 ++
 drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h               | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
index a9782b1aba47..34daf895f848 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h
@@ -1360,7 +1360,7 @@ void dpp1_cm_set_output_csc_adjustment(
 
 void dpp1_cm_set_output_csc_default(
 		struct dpp *dpp_base,
-		const struct default_adjustment *default_adjust);
+		enum dc_color_space colorspace);
 
 void dpp1_cm_set_gamut_remap(
 	struct dpp *dpp,
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
index 40627c244bf5..ed1216b53465 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c
@@ -225,14 +225,13 @@ void dpp1_cm_set_gamut_remap(
 
 void dpp1_cm_set_output_csc_default(
 		struct dpp *dpp_base,
-		const struct default_adjustment *default_adjust)
+		enum dc_color_space colorspace)
 {
 
 	struct dcn10_dpp *dpp = TO_DCN10_DPP(dpp_base);
 	uint32_t ocsc_mode = 0;
 
-	if (default_adjust != NULL) {
-		switch (default_adjust->out_color_space) {
+	switch (colorspace) {
 		case COLOR_SPACE_SRGB:
 		case COLOR_SPACE_2020_RGB_FULLRANGE:
 			ocsc_mode = 0;
@@ -253,7 +252,6 @@ void dpp1_cm_set_output_csc_default(
 		case COLOR_SPACE_UNKNOWN:
 		default:
 			break;
-		}
 	}
 
 	REG_SET(CM_OCSC_CONTROL, 0, CM_OCSC_MODE, ocsc_mode);
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
index 961ad5c3b454..05dc01e54531 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
@@ -2097,6 +2097,8 @@ static void program_csc_matrix(struct pipe_ctx *pipe_ctx,
 			tbl_entry.color_space = color_space;
 			//tbl_entry.regval = matrix;
 			pipe_ctx->plane_res.dpp->funcs->opp_set_csc_adjustment(pipe_ctx->plane_res.dpp, &tbl_entry);
+	} else {
+		pipe_ctx->plane_res.dpp->funcs->opp_set_csc_default(pipe_ctx->plane_res.dpp, colorspace);
 	}
 }
 static bool is_lower_pipe_tree_visible(struct pipe_ctx *pipe_ctx)
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
index 83a68460edcd..9420dfb94d39 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
@@ -64,7 +64,7 @@ struct dpp_funcs {
 
 	void (*opp_set_csc_default)(
 		struct dpp *dpp,
-		const struct default_adjustment *default_adjust);
+		enum dc_color_space colorspace);
 
 	void (*opp_set_csc_adjustment)(
 		struct dpp *dpp,

From 19d859a7205bc59ffc38303eb25ae394f61d21dc Mon Sep 17 00:00:00 2001
From: Xiongwei Song <sxwjean@gmail.com>
Date: Tue, 2 Jan 2018 21:24:55 +0800
Subject: [PATCH 584/808] drm/ttm: check the return value of kzalloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the function ttm_page_alloc_init, kzalloc call is made for variable
_manager, we need to check its return value, it may return NULL.

Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index b5ba6441489f..5d252fb27a82 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -1007,6 +1007,8 @@ int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
 	pr_info("Initializing pool allocator\n");
 
 	_manager = kzalloc(sizeof(*_manager), GFP_KERNEL);
+	if (!_manager)
+		return -ENOMEM;
 
 	ttm_page_pool_init_locked(&_manager->wc_pool, GFP_HIGHUSER, "wc", 0);
 

From 0ae60d0c4f191c4241377cc3fc5931dc90ca3bbd Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:40:21 +0100
Subject: [PATCH 585/808] parisc: Show unhashed hardware inventory

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/drivers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index d8f77358e2ba..29b99b8964aa 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -870,7 +870,7 @@ static void print_parisc_device(struct parisc_device *dev)
 	static int count;
 
 	print_pa_hwpath(dev, hw_path);
-	printk(KERN_INFO "%d. %s at 0x%p [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
+	printk(KERN_INFO "%d. %s at 0x%px [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",
 		++count, dev->name, (void*) dev->hpa.start, hw_path, dev->id.hw_type,
 		dev->id.hversion_rev, dev->id.hversion, dev->id.sversion);
 

From 63b2c373137b16d948b08cffacc6abfcf4cffea6 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:42:59 +0100
Subject: [PATCH 586/808] parisc: Show initial kernel memory layout unhashed

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/mm/init.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 13f7854e0d49..48f41399fc0b 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -631,11 +631,11 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 #ifdef CONFIG_DEBUG_KERNEL /* double-sanity-check paranoia */
 	printk("virtual kernel memory layout:\n"
-	       "    vmalloc : 0x%p - 0x%p   (%4ld MB)\n"
-	       "    memory  : 0x%p - 0x%p   (%4ld MB)\n"
-	       "      .init : 0x%p - 0x%p   (%4ld kB)\n"
-	       "      .data : 0x%p - 0x%p   (%4ld kB)\n"
-	       "      .text : 0x%p - 0x%p   (%4ld kB)\n",
+	       "    vmalloc : 0x%px - 0x%px   (%4ld MB)\n"
+	       "    memory  : 0x%px - 0x%px   (%4ld MB)\n"
+	       "      .init : 0x%px - 0x%px   (%4ld kB)\n"
+	       "      .data : 0x%px - 0x%px   (%4ld kB)\n"
+	       "      .text : 0x%px - 0x%px   (%4ld kB)\n",
 
 	       (void*)VMALLOC_START, (void*)VMALLOC_END,
 	       (VMALLOC_END - VMALLOC_START) >> 20,

From 04903c06b4854d2e85f6e3c368d5d48c4ce55f09 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:45:42 +0100
Subject: [PATCH 587/808] parisc: Show unhashed HPA of Dino chip

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/parisc/dino.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index 0b3fb99d9b89..7390fb8ca9d1 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -303,7 +303,7 @@ static void dino_mask_irq(struct irq_data *d)
 	struct dino_device *dino_dev = irq_data_get_irq_chip_data(d);
 	int local_irq = gsc_find_local_irq(d->irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 
-	DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, d->irq);
+	DBG(KERN_WARNING "%s(0x%px, %d)\n", __func__, dino_dev, d->irq);
 
 	/* Clear the matching bit in the IMR register */
 	dino_dev->imr &= ~(DINO_MASK_IRQ(local_irq));
@@ -316,7 +316,7 @@ static void dino_unmask_irq(struct irq_data *d)
 	int local_irq = gsc_find_local_irq(d->irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 	u32 tmp;
 
-	DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, d->irq);
+	DBG(KERN_WARNING "%s(0x%px, %d)\n", __func__, dino_dev, d->irq);
 
 	/*
 	** clear pending IRQ bits
@@ -396,7 +396,7 @@ ilr_again:
 	if (mask) {
 		if (--ilr_loop > 0)
 			goto ilr_again;
-		printk(KERN_ERR "Dino 0x%p: stuck interrupt %d\n", 
+		printk(KERN_ERR "Dino 0x%px: stuck interrupt %d\n",
 		       dino_dev->hba.base_addr, mask);
 		return IRQ_NONE;
 	}
@@ -553,7 +553,7 @@ dino_fixup_bus(struct pci_bus *bus)
         struct pci_dev *dev;
         struct dino_device *dino_dev = DINO_DEV(parisc_walk_tree(bus->bridge));
 
-	DBG(KERN_WARNING "%s(0x%p) bus %d platform_data 0x%p\n",
+	DBG(KERN_WARNING "%s(0x%px) bus %d platform_data 0x%px\n",
 	    __func__, bus, bus->busn_res.start,
 	    bus->bridge->platform_data);
 
@@ -854,7 +854,7 @@ static int __init dino_common_init(struct parisc_device *dev,
 	res->flags = IORESOURCE_IO; /* do not mark it busy ! */
 	if (request_resource(&ioport_resource, res) < 0) {
 		printk(KERN_ERR "%s: request I/O Port region failed "
-		       "0x%lx/%lx (hpa 0x%p)\n",
+		       "0x%lx/%lx (hpa 0x%px)\n",
 		       name, (unsigned long)res->start, (unsigned long)res->end,
 		       dino_dev->hba.base_addr);
 		return 1;

From 28df2f83c39554d9e64cd9d2a93b8e28e24df5b7 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:47:01 +0100
Subject: [PATCH 588/808] parisc: Show unhashed EISA EEPROM address

Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p")
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/parisc/eisa_eeprom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c
index 4dd9b1308128..99a80da6fd2e 100644
--- a/drivers/parisc/eisa_eeprom.c
+++ b/drivers/parisc/eisa_eeprom.c
@@ -106,7 +106,7 @@ static int __init eisa_eeprom_init(void)
 		return retval;
 	}
 
-	printk(KERN_INFO "EISA EEPROM at 0x%p\n", eisa_eeprom_addr);
+	printk(KERN_INFO "EISA EEPROM at 0x%px\n", eisa_eeprom_addr);
 	return 0;
 }
 

From f8978bd95cf92f869f3d9b34c1b699f49253b8c6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Mon, 1 Jan 2018 13:07:15 +0200
Subject: [PATCH 589/808] RDMA/netlink: Fix locking around
 __ib_get_device_by_index

Holding locks is mandatory when calling __ib_device_get_by_index,
otherwise there are races during the list iteration with device removal.

Since the locks are static to device.c, __ib_device_get_by_index can
never be called correctly by any user out side the file.

Make the function static and provide a safe function that gets the
correct locks and returns a kref'd pointer. Fix all callers.

Fixes: e5c9469efcb1 ("RDMA/netlink: Add nldev device doit implementation")
Fixes: c3f66f7b0052 ("RDMA/netlink: Implement nldev port doit callback")
Fixes: 7d02f605f0dc ("RDMA/netlink: Add nldev port dumpit implementation")
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h |  2 +-
 drivers/infiniband/core/device.c    | 18 +++++++++-
 drivers/infiniband/core/nldev.c     | 54 +++++++++++++++++++----------
 3 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a1d687a664f8..66f0268f37a6 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -314,7 +314,7 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 }
 #endif
 
-struct ib_device *__ib_device_get_by_index(u32 ifindex);
+struct ib_device *ib_device_get_by_index(u32 ifindex);
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 30914f3baa5f..465520627e4b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -134,7 +134,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
 	return 0;
 }
 
-struct ib_device *__ib_device_get_by_index(u32 index)
+static struct ib_device *__ib_device_get_by_index(u32 index)
 {
 	struct ib_device *device;
 
@@ -145,6 +145,22 @@ struct ib_device *__ib_device_get_by_index(u32 index)
 	return NULL;
 }
 
+/*
+ * Caller is responsible to return refrerence count by calling put_device()
+ */
+struct ib_device *ib_device_get_by_index(u32 index)
+{
+	struct ib_device *device;
+
+	down_read(&lists_rwsem);
+	device = __ib_device_get_by_index(index);
+	if (device)
+		get_device(&device->dev);
+
+	up_read(&lists_rwsem);
+	return device;
+}
+
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
 	struct ib_device *device;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9a05245a1acf..0dcd1aa6f683 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -142,27 +142,34 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
 
-	device = __ib_device_get_by_index(index);
+	device = ib_device_get_by_index(index);
 	if (!device)
 		return -EINVAL;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto err;
+	}
 
 	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
 			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
 			0, 0);
 
 	err = fill_dev_info(msg, device);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
+	if (err)
+		goto err_free;
 
 	nlmsg_end(msg, nlh);
 
+	put_device(&device->dev);
 	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err:
+	put_device(&device->dev);
+	return err;
 }
 
 static int _nldev_get_dumpit(struct ib_device *device,
@@ -220,31 +227,40 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = __ib_device_get_by_index(index);
+	device = ib_device_get_by_index(index);
 	if (!device)
 		return -EINVAL;
 
 	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
-	if (!rdma_is_port_valid(device, port))
-		return -EINVAL;
+	if (!rdma_is_port_valid(device, port)) {
+		err = -EINVAL;
+		goto err;
+	}
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto err;
+	}
 
 	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
 			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
 			0, 0);
 
 	err = fill_port_info(msg, device, port);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
+	if (err)
+		goto err_free;
 
 	nlmsg_end(msg, nlh);
+	put_device(&device->dev);
 
 	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err:
+	put_device(&device->dev);
+	return err;
 }
 
 static int nldev_port_get_dumpit(struct sk_buff *skb,
@@ -265,7 +281,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 		return -EINVAL;
 
 	ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = __ib_device_get_by_index(ifindex);
+	device = ib_device_get_by_index(ifindex);
 	if (!device)
 		return -EINVAL;
 
@@ -299,7 +315,9 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
 		nlmsg_end(skb, nlh);
 	}
 
-out:	cb->args[0] = idx;
+out:
+	put_device(&device->dev);
+	cb->args[0] = idx;
 	return skb->len;
 }
 

From 88776c0e70be0290f8357019d844aae15edaa967 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Tue, 2 Jan 2018 20:36:44 +0100
Subject: [PATCH 590/808] parisc: Fix alignment of pa_tlb_lock in assembly on
 32-bit SMP kernel

Qemu for PARISC reported on a 32bit SMP parisc kernel strange failures
about "Not-handled unaligned insn 0x0e8011d6 and 0x0c2011c9."

Those opcodes evaluate to the ldcw() assembly instruction which requires
(on 32bit) an alignment of 16 bytes to ensure atomicity.

As it turns out, qemu is correct and in our assembly code in entry.S and
pacache.S we don't pay attention to the required alignment.

This patch fixes the problem by aligning the lock offset in assembly
code in the same manner as we do in our C-code.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: <stable@vger.kernel.org> # v4.0+
---
 arch/parisc/include/asm/ldcw.h |  2 ++
 arch/parisc/kernel/entry.S     | 13 +++++++++++--
 arch/parisc/kernel/pacache.S   |  9 +++++++--
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
index dd5a08aaa4da..3eb4bfc1fb36 100644
--- a/arch/parisc/include/asm/ldcw.h
+++ b/arch/parisc/include/asm/ldcw.h
@@ -12,6 +12,7 @@
    for the semaphore.  */
 
 #define __PA_LDCW_ALIGNMENT	16
+#define __PA_LDCW_ALIGN_ORDER	4
 #define __ldcw_align(a) ({					\
 	unsigned long __ret = (unsigned long) &(a)->lock[0];	\
 	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
@@ -29,6 +30,7 @@
    ldcd). */
 
 #define __PA_LDCW_ALIGNMENT	4
+#define __PA_LDCW_ALIGN_ORDER	2
 #define __ldcw_align(a) (&(a)->slock)
 #define __LDCW	"ldcw,co"
 
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index f3cecf5117cf..e95207c0565e 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -35,6 +35,7 @@
 #include <asm/pgtable.h>
 #include <asm/signal.h>
 #include <asm/unistd.h>
+#include <asm/ldcw.h>
 #include <asm/thread_info.h>
 
 #include <linux/linkage.h>
@@ -46,6 +47,14 @@
 #endif
 
 	.import		pa_tlb_lock,data
+	.macro  load_pa_tlb_lock reg
+#if __PA_LDCW_ALIGNMENT > 4
+	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
+	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
+#else
+	load32	PA(pa_tlb_lock), \reg
+#endif
+	.endm
 
 	/* space_to_prot macro creates a prot id from a space id */
 
@@ -457,7 +466,7 @@
 	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
 #ifdef CONFIG_SMP
 	cmpib,COND(=),n	0,\spc,2f
-	load32		PA(pa_tlb_lock),\tmp
+	load_pa_tlb_lock \tmp
 1:	LDCW		0(\tmp),\tmp1
 	cmpib,COND(=)	0,\tmp1,1b
 	nop
@@ -480,7 +489,7 @@
 	/* Release pa_tlb_lock lock. */
 	.macro		tlb_unlock1	spc,tmp
 #ifdef CONFIG_SMP
-	load32		PA(pa_tlb_lock),\tmp
+	load_pa_tlb_lock \tmp
 	tlb_unlock0	\spc,\tmp
 #endif
 	.endm
diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S
index adf7187f8951..2d40c4ff3f69 100644
--- a/arch/parisc/kernel/pacache.S
+++ b/arch/parisc/kernel/pacache.S
@@ -36,6 +36,7 @@
 #include <asm/assembly.h>
 #include <asm/pgtable.h>
 #include <asm/cache.h>
+#include <asm/ldcw.h>
 #include <linux/linkage.h>
 
 	.text
@@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local)
 
 	.macro	tlb_lock	la,flags,tmp
 #ifdef CONFIG_SMP
-	ldil		L%pa_tlb_lock,%r1
-	ldo		R%pa_tlb_lock(%r1),\la
+#if __PA_LDCW_ALIGNMENT > 4
+	load32		pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la
+	depi		0,31,__PA_LDCW_ALIGN_ORDER, \la
+#else
+	load32		pa_tlb_lock, \la
+#endif
 	rsm		PSW_SM_I,\flags
 1:	LDCW		0(\la),\tmp
 	cmpib,<>,n	0,\tmp,3f

From 71891e2dab6b55a870f8f7735e44a2963860b5c6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 29 Dec 2017 10:02:52 -0800
Subject: [PATCH 591/808] ethtool: do not print warning for applications using
 legacy API

In kernel log ths message appears on every boot:
 "warning: `NetworkChangeNo' uses legacy ethtool link settings API,
  link modes are only partially reported"

When ethtool link settings API changed, it started complaining about
usages of old API. Ironically, the original patch was from google but
the application using the legacy API is chrome.

Linux ABI is fixed as much as possible. The kernel must not break it
and should not complain about applications using legacy API's.
This patch just removes the warning since using legacy API's
in Linux is perfectly acceptable.

Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f8fcf450a36e..8225416911ae 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -770,15 +770,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 }
 
-static void
-warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
-{
-	char name[sizeof(current->comm)];
-
-	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
-		     get_task_comm(name, current), details);
-}
-
 /* Query device for its ethtool_cmd settings.
  *
  * Backward compatibility note: for compatibility with legacy ethtool,
@@ -805,10 +796,8 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 							   &link_ksettings);
 		if (err < 0)
 			return err;
-		if (!convert_link_ksettings_to_legacy_settings(&cmd,
-							       &link_ksettings))
-			warn_incomplete_ethtool_legacy_settings_conversion(
-				"link modes are only partially reported");
+		convert_link_ksettings_to_legacy_settings(&cmd,
+							  &link_ksettings);
 
 		/* send a sensible cmd tag back to user */
 		cmd.cmd = ETHTOOL_GSET;

From f9c935db8086231a35b7f5c2a53e3f1e10f388ee Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 29 Dec 2017 19:48:02 +0100
Subject: [PATCH 592/808] tipc: fix problems with multipoint-to-point flow
 control

In commit 04d7b574b245 ("tipc: add multipoint-to-point flow control") we
introduced a protocol for preventing buffer overflow when many group
members try to simultaneously send messages to the same receiving member.

Stress test of this mechanism has revealed a couple of related bugs:

- When the receiving member receives an advertisement REMIT message from
  one of the senders, it will sometimes prematurely activate a pending
  member and send it the remitted advertisement, although the upper
  limit for active senders has been reached. This leads to accumulation
  of illegal advertisements, and eventually to messages being dropped
  because of receive buffer overflow.

- When the receiving member leaves REMITTED state while a received
  message is being read, we miss to look at the pending queue, to
  activate the oldest pending peer. This leads to some pending senders
  being starved out, and never getting the opportunity to profit from
  the remitted advertisement.

We fix the former in the function tipc_group_proto_rcv() by returning
directly from the function once it becomes clear that the remitting
peer cannot leave REMITTED state at that point.

We fix the latter in the function tipc_group_update_rcv_win() by looking
up and activate the longest pending peer when it becomes clear that the
remitting peer now can leave REMITTED state.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 8e12ab55346b..5f4ffae807ee 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -109,7 +109,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m,
 static void tipc_group_decr_active(struct tipc_group *grp,
 				   struct tipc_member *m)
 {
-	if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING)
+	if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING ||
+	    m->state == MBR_REMITTED)
 		grp->active_cnt--;
 }
 
@@ -562,7 +563,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 	int max_active = grp->max_active;
 	int reclaim_limit = max_active * 3 / 4;
 	int active_cnt = grp->active_cnt;
-	struct tipc_member *m, *rm;
+	struct tipc_member *m, *rm, *pm;
 
 	m = tipc_group_find_member(grp, node, port);
 	if (!m)
@@ -605,6 +606,17 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 			pr_warn_ratelimited("Rcv unexpected msg after REMIT\n");
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 		}
+		grp->active_cnt--;
+		list_del_init(&m->list);
+		if (list_empty(&grp->pending))
+			return;
+
+		/* Set oldest pending member to active and advertise */
+		pm = list_first_entry(&grp->pending, struct tipc_member, list);
+		pm->state = MBR_ACTIVE;
+		list_move_tail(&pm->list, &grp->active);
+		grp->active_cnt++;
+		tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq);
 		break;
 	case MBR_RECLAIMING:
 	case MBR_DISCOVERED:
@@ -742,14 +754,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 		if (!m || m->state != MBR_RECLAIMING)
 			return;
 
-		list_del_init(&m->list);
-		grp->active_cnt--;
 		remitted = msg_grp_remitted(hdr);
 
 		/* Messages preceding the REMIT still in receive queue */
 		if (m->advertised > remitted) {
 			m->state = MBR_REMITTED;
 			in_flight = m->advertised - remitted;
+			m->advertised = ADV_IDLE + in_flight;
+			return;
 		}
 		/* All messages preceding the REMIT have been read */
 		if (m->advertised <= remitted) {
@@ -761,6 +773,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup,
 			tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq);
 
 		m->advertised = ADV_IDLE + in_flight;
+		grp->active_cnt--;
+		list_del_init(&m->list);
 
 		/* Set oldest pending member to active and advertise */
 		if (list_empty(&grp->pending))

From af1be2e21203867cb958aaceed5366e2e24b88e8 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 8 Dec 2017 08:45:57 -0800
Subject: [PATCH 593/808] ARC: handle gcc generated __builtin_trap for older
 compiler

ARC gcc prior to GNU 2018.03 release didn't have a target specific
__builtin_trap() implementation, generating default abort() call.

Implement the abort() call - emulating what newer gcc does for the same,
as suggested by Arnd.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/traps.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c
index 004f4e4a4c10..133a4dae41fe 100644
--- a/arch/arc/kernel/traps.c
+++ b/arch/arc/kernel/traps.c
@@ -161,3 +161,11 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs)
 
 	insterror_is_error(address, regs);
 }
+
+/*
+ * abort() call generated by older gcc for __builtin_trap()
+ */
+void abort(void)
+{
+	__asm__ __volatile__("trap_s  5\n");
+}

From 835bcec5fdf3f9e880111b482177e7e70e3596da Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Tue, 2 Jan 2018 17:21:09 +0000
Subject: [PATCH 594/808] x86/efi: Fix kernel param add_efi_memmap regression

'add_efi_memmap' is an early param, but do_add_efi_memmap() has no
chance to run because the code path is before parse_early_param().
I believe it worked when the param was introduced but probably later
some other changes caused the wrong order and nobody noticed it.

Move efi_memblock_x86_reserve_range() after parse_early_param()
to fix it.

Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Cc: Ge Song <ge.song@hxt-semitech.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180102172110.17018-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/setup.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8af2e8d0c0a1..145810b0edf6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -906,9 +906,6 @@ void __init setup_arch(char **cmdline_p)
 		set_bit(EFI_BOOT, &efi.flags);
 		set_bit(EFI_64BIT, &efi.flags);
 	}
-
-	if (efi_enabled(EFI_BOOT))
-		efi_memblock_x86_reserve_range();
 #endif
 
 	x86_init.oem.arch_setup();
@@ -962,6 +959,8 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	if (efi_enabled(EFI_BOOT))
+		efi_memblock_x86_reserve_range();
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/*
 	 * Memory used by the kernel cannot be hot-removed because Linux

From f24c4d478013d82bd1b943df566fff3561d52864 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 2 Jan 2018 17:21:10 +0000
Subject: [PATCH 595/808] efi/capsule-loader: Reinstate virtual capsule mapping

Commit:

  82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header")

... refactored the capsule loading code that maps the capsule header,
to avoid having to map it several times.

However, as it turns out, the vmap() call we ended up removing did not
just map the header, but the entire capsule image, and dropping this
virtual mapping breaks capsules that are processed by the firmware
immediately (i.e., without a reboot).

Unfortunately, that change was part of a larger refactor that allowed
a quirk to be implemented for Quark, which has a non-standard memory
layout for capsules, and we have slightly painted ourselves into a
corner by allowing quirk code to mangle the capsule header and memory
layout.

So we need to fix this without breaking Quark. Fortunately, Quark does
not appear to care about the virtual mapping, and so we can simply
do a partial revert of commit:

  2a457fb31df6 ("efi/capsule-loader: Use page addresses rather than struct page pointers")

... and create a vmap() mapping of the entire capsule (including header)
based on the reinstated struct page array, unless running on Quark, in
which case we pass the capsule header copy as before.

Reported-by: Ge Song <ge.song@hxt-semitech.com>
Tested-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Tested-by: Ge Song <ge.song@hxt-semitech.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: <stable@vger.kernel.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Fixes: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header")
Link: http://lkml.kernel.org/r/20180102172110.17018-3-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/quirks.c        | 13 +++++++-
 drivers/firmware/efi/capsule-loader.c | 45 ++++++++++++++++++++++-----
 include/linux/efi.h                   |  4 ++-
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 8a99a2e96537..5b513ccffde4 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
 	/*
 	 * Update the first page pointer to skip over the CSH header.
 	 */
-	cap_info->pages[0] += csh->headersize;
+	cap_info->phys[0] += csh->headersize;
+
+	/*
+	 * cap_info->capsule should point at a virtual mapping of the entire
+	 * capsule, starting at the capsule header. Our image has the Quark
+	 * security header prepended, so we cannot rely on the default vmap()
+	 * mapping created by the generic capsule code.
+	 * Given that the Quark firmware does not appear to care about the
+	 * virtual mapping, let's just point cap_info->capsule at our copy
+	 * of the capsule header.
+	 */
+	cap_info->capsule = &cap_info->header;
 
 	return 1;
 }
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index ec8ac5c4dd84..055e2e8f985a 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -20,10 +20,6 @@
 
 #define NO_FURTHER_WRITE_ACTION -1
 
-#ifndef phys_to_page
-#define phys_to_page(x)		pfn_to_page((x) >> PAGE_SHIFT)
-#endif
-
 /**
  * efi_free_all_buff_pages - free all previous allocated buffer pages
  * @cap_info: pointer to current instance of capsule_info structure
@@ -35,7 +31,7 @@
 static void efi_free_all_buff_pages(struct capsule_info *cap_info)
 {
 	while (cap_info->index > 0)
-		__free_page(phys_to_page(cap_info->pages[--cap_info->index]));
+		__free_page(cap_info->pages[--cap_info->index]);
 
 	cap_info->index = NO_FURTHER_WRITE_ACTION;
 }
@@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info)
 
 	cap_info->pages = temp_page;
 
+	temp_page = krealloc(cap_info->phys,
+			     pages_needed * sizeof(phys_addr_t *),
+			     GFP_KERNEL | __GFP_ZERO);
+	if (!temp_page)
+		return -ENOMEM;
+
+	cap_info->phys = temp_page;
+
 	return 0;
 }
 
@@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
  **/
 static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
 {
+	bool do_vunmap = false;
 	int ret;
 
-	ret = efi_capsule_update(&cap_info->header, cap_info->pages);
+	/*
+	 * cap_info->capsule may have been assigned already by a quirk
+	 * handler, so only overwrite it if it is NULL
+	 */
+	if (!cap_info->capsule) {
+		cap_info->capsule = vmap(cap_info->pages, cap_info->index,
+					 VM_MAP, PAGE_KERNEL);
+		if (!cap_info->capsule)
+			return -ENOMEM;
+		do_vunmap = true;
+	}
+
+	ret = efi_capsule_update(cap_info->capsule, cap_info->phys);
+	if (do_vunmap)
+		vunmap(cap_info->capsule);
 	if (ret) {
 		pr_err("capsule update failed\n");
 		return ret;
@@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
 			goto failed;
 		}
 
-		cap_info->pages[cap_info->index++] = page_to_phys(page);
+		cap_info->pages[cap_info->index] = page;
+		cap_info->phys[cap_info->index] = page_to_phys(page);
 		cap_info->page_bytes_remain = PAGE_SIZE;
+		cap_info->index++;
 	} else {
-		page = phys_to_page(cap_info->pages[cap_info->index - 1]);
+		page = cap_info->pages[cap_info->index - 1];
 	}
 
 	kbuff = kmap(page);
@@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file)
 	struct capsule_info *cap_info = file->private_data;
 
 	kfree(cap_info->pages);
+	kfree(cap_info->phys);
 	kfree(file->private_data);
 	file->private_data = NULL;
 	return 0;
@@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	}
 
+	cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL);
+	if (!cap_info->phys) {
+		kfree(cap_info->pages);
+		kfree(cap_info);
+		return -ENOMEM;
+	}
+
 	file->private_data = cap_info;
 
 	return 0;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index d813f7b04da7..29fdf8029cf6 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -140,11 +140,13 @@ struct efi_boot_memmap {
 
 struct capsule_info {
 	efi_capsule_header_t	header;
+	efi_capsule_header_t	*capsule;
 	int			reset_type;
 	long			index;
 	size_t			count;
 	size_t			total_size;
-	phys_addr_t		*pages;
+	struct page		**pages;
+	phys_addr_t		*phys;
 	size_t			page_bytes_remain;
 };
 

From 81b60dbff04980a45b348c5b5eeca2713d4594ca Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Wed, 3 Jan 2018 09:44:17 +0000
Subject: [PATCH 596/808] MAINTAINERS: Remove Matt Fleming as EFI co-maintainer

Instate Ard Biesheuvel as the sole EFI maintainer and leave other folks
as maintainers for the EFI test driver and efivarfs file system.

Also add Ard Biesheuvel as the EFI test driver and efivarfs maintainer.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Ivan Hu <ivan.hu@canonical.com>
Cc: Jeremy Kerr <jk@ozlabs.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180103094417.6353-1-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 MAINTAINERS | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b46c9cea5ae5..95c3fa1f520f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5149,15 +5149,15 @@ F:	sound/usb/misc/ua101.c
 EFI TEST DRIVER
 L:	linux-efi@vger.kernel.org
 M:	Ivan Hu <ivan.hu@canonical.com>
-M:	Matt Fleming <matt@codeblueprint.co.uk>
+M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 S:	Maintained
 F:	drivers/firmware/efi/test/
 
 EFI VARIABLE FILESYSTEM
 M:	Matthew Garrett <matthew.garrett@nebula.com>
 M:	Jeremy Kerr <jk@ozlabs.org>
-M:	Matt Fleming <matt@codeblueprint.co.uk>
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git
+M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
 L:	linux-efi@vger.kernel.org
 S:	Maintained
 F:	fs/efivarfs/
@@ -5318,7 +5318,6 @@ S:	Supported
 F:	security/integrity/evm/
 
 EXTENSIBLE FIRMWARE INTERFACE (EFI)
-M:	Matt Fleming <matt@codeblueprint.co.uk>
 M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 L:	linux-efi@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git

From 87faa0d9b43b4755ff6963a22d1fd1bee1aa3b39 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 15:18:44 +0100
Subject: [PATCH 597/808] x86/pti: Enable PTI by default

This really want's to be enabled by default. Users who know what they are
doing can disable it either in the config or on the kernel command line.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
---
 security/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/Kconfig b/security/Kconfig
index a623d13bf288..3d4debd0257e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -56,6 +56,7 @@ config SECURITY_NETWORK
 
 config PAGE_TABLE_ISOLATION
 	bool "Remove the kernel mapping in user mode"
+	default y
 	depends on X86_64 && !UML
 	help
 	  This feature reduces the number of hardware side channels by

From 694d99d40972f12e59a3696effee8a376b79d7c8 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 26 Dec 2017 23:43:54 -0600
Subject: [PATCH 598/808] x86/cpu, x86/pti: Do not enable PTI on AMD processors

AMD processors are not subject to the types of attacks that the kernel
page table isolation feature protects against.  The AMD microarchitecture
does not allow memory references, including speculative references, that
access higher privileged data when running in a lesser privileged mode
when that access would result in a page fault.

Disable page table isolation by default on AMD processors by not setting
the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI
is set.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/kernel/cpu/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2a94dfb434e..b1be494ab4e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
-	/* Assume for now that ALL x86 CPUs are insecure */
-	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
 
 	fpu__init_system(c);
 

From 52994c256df36fda9a715697431cba9daecb6b11 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 15:57:59 +0100
Subject: [PATCH 599/808] x86/pti: Make sure the user/kernel PTEs match

Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is
enabled:

[Hardware Error]: Error Addr: 0x0000ffff81e000e0
[Hardware Error]: MC1 Error: L1 TLB multimatch.
[Hardware Error]: cache level: L1, tx: INSN

The address is in the entry area, which is mapped into kernel _AND_ user
space. That's special because we switch CR3 while we are executing
there.

User mapping:
0xffffffff81e00000-0xffffffff82000000           2M     ro         PSE     GLB x  pmd

Kernel mapping:
0xffffffff81000000-0xffffffff82000000          16M     ro         PSE         x  pmd

So the K8 is complaining that the TLB entries differ. They differ in the
GLB bit.

Drop the GLB bit when installing the user shared mapping.

Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD")
Reported-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Meelis Roos <mroos@linux.ee>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos
---
 arch/x86/mm/pti.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bce8aea65606..2da28ba97508 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void)
 static void __init pti_clone_entry_text(void)
 {
 	pti_clone_pmds((unsigned long) __entry_text_start,
-			(unsigned long) __irqentry_text_end, _PAGE_RW);
+			(unsigned long) __irqentry_text_end,
+		       _PAGE_RW | _PAGE_GLOBAL);
 }
 
 /*

From a9cdbe72c4e8bf3b38781c317a79326e2e1a230d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Sun, 31 Dec 2017 10:18:06 -0600
Subject: [PATCH 600/808] x86/dumpstack: Fix partial register dumps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The show_regs_safe() logic is wrong.  When there's an iret stack frame,
it prints the entire pt_regs -- most of which is random stack data --
instead of just the five registers at the end.

show_regs_safe() is also poorly named: the on_stack() checks aren't for
safety.  Rename the function to show_regs_if_on_stack() and add a
comment to explain why the checks are needed.

These issues were introduced with the "partial register dump" feature of
the following commit:

  b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully")

That patch had gone through a few iterations of development, and the
above issues were artifacts from a previous iteration of the patch where
'regs' pointed directly to the iret frame rather than to the (partially
empty) pt_regs.

Tested-by: Alexander Tsoy <alexander@tsoy.me>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@vger.kernel.org
Fixes: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully")
Link: http://lkml.kernel.org/r/5b05b8b344f59db2d3d50dbdeba92d60f2304c54.1514736742.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/unwind.h | 17 +++++++++++++----
 arch/x86/kernel/dumpstack.c   | 28 ++++++++++++++++++++--------
 arch/x86/kernel/stacktrace.c  |  2 +-
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index c1688c2d0a12..1f86e1b0a5cd 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
 /*
- * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
- * only the iret frame registers are accessible.  Use with caution!
+ * If 'partial' returns true, only the iret frame registers are valid.
  */
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+						    bool *partial)
 {
 	if (unwind_done(state))
 		return NULL;
 
+	if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+		*partial = !state->full_regs;
+#else
+		*partial = false;
+#endif
+	}
+
 	return state->regs;
 }
 #else
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+						    bool *partial)
 {
 	return NULL;
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 5fa110699ed2..d0bb176a7261 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs)
 		regs->sp, regs->flags);
 }
 
-static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+				  bool partial)
 {
-	if (on_stack(info, regs, sizeof(*regs)))
+	/*
+	 * These on_stack() checks aren't strictly necessary: the unwind code
+	 * has already validated the 'regs' pointer.  The checks are done for
+	 * ordering reasons: if the registers are on the next stack, we don't
+	 * want to print them out yet.  Otherwise they'll be shown as part of
+	 * the wrong stack.  Later, when show_trace_log_lvl() switches to the
+	 * next stack, this function will be called again with the same regs so
+	 * they can be printed in the right context.
+	 */
+	if (!partial && on_stack(info, regs, sizeof(*regs))) {
 		__show_regs(regs, 0);
-	else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
-			  IRET_FRAME_SIZE)) {
+
+	} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+				       IRET_FRAME_SIZE)) {
 		/*
 		 * When an interrupt or exception occurs in entry code, the
 		 * full pt_regs might not have been saved yet.  In that case
@@ -98,6 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	struct stack_info stack_info = {0};
 	unsigned long visit_mask = 0;
 	int graph_idx = 0;
+	bool partial;
 
 	printk("%sCall Trace:\n", log_lvl);
 
@@ -140,7 +152,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			printk("%s <%s>\n", log_lvl, stack_name);
 
 		if (regs)
-			show_regs_safe(&stack_info, regs);
+			show_regs_if_on_stack(&stack_info, regs, partial);
 
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
@@ -164,7 +176,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 			/*
 			 * Don't print regs->ip again if it was already printed
-			 * by show_regs_safe() below.
+			 * by show_regs_if_on_stack().
 			 */
 			if (regs && stack == &regs->ip)
 				goto next;
@@ -199,9 +211,9 @@ next:
 			unwind_next_frame(&state);
 
 			/* if the frame has entry regs, print them */
-			regs = unwind_get_entry_regs(&state);
+			regs = unwind_get_entry_regs(&state, &partial);
 			if (regs)
-				show_regs_safe(&stack_info, regs);
+				show_regs_if_on_stack(&stack_info, regs, partial);
 		}
 
 		if (stack_name)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8dabd7bf1673..60244bfaf88f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace,
 	for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
 	     unwind_next_frame(&state)) {
 
-		regs = unwind_get_entry_regs(&state);
+		regs = unwind_get_entry_regs(&state, NULL);
 		if (regs) {
 			/*
 			 * Kernel mode registers on the stack indicate an

From 3ffdeb1a02be3086f1411a15c5b9c481fa28e21f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Sun, 31 Dec 2017 10:18:07 -0600
Subject: [PATCH 601/808] x86/dumpstack: Print registers for first stack frame
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the stack dump code, if the frame after the starting pt_regs is also
a regs frame, the registers don't get printed.  Fix that.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Tested-by: Alexander Tsoy <alexander@tsoy.me>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toralf Förster <toralf.foerster@gmx.de>
Cc: stable@vger.kernel.org
Fixes: 3b3fa11bc700 ("x86/dumpstack: Print any pt_regs found on the stack")
Link: http://lkml.kernel.org/r/396f84491d2f0ef64eda4217a2165f5712f6a115.1514736742.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d0bb176a7261..afbecff161d1 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -115,6 +115,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 	unwind_start(&state, task, regs, stack);
 	stack = stack ? : get_stack_pointer(task, regs);
+	regs = unwind_get_entry_regs(&state, &partial);
 
 	/*
 	 * Iterate through the stacks, starting with the current stack pointer.
@@ -132,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - hardirq stack
 	 * - entry stack
 	 */
-	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;
 
 		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {

From c0bace798436bca0fdc221ff61143f1376a9c3de Mon Sep 17 00:00:00 2001
From: Felix Janda <felix.janda@posteo.de>
Date: Mon, 1 Jan 2018 19:33:20 +0100
Subject: [PATCH 602/808] uapi libc compat: add fallback for unsupported libcs

libc-compat.h aims to prevent symbol collisions between uapi and libc
headers for each supported libc. This requires continuous coordination
between them.

The goal of this commit is to improve the situation for libcs (such as
musl) which are not yet supported and/or do not wish to be explicitly
supported, while not affecting supported libcs. More precisely, with
this commit, unsupported libcs can request the suppression of any
specific uapi definition by defining the correspondings _UAPI_DEF_*
macro as 0. This can fix symbol collisions for them, as long as the
libc headers are included before the uapi headers. Inclusion in the
other order is outside the scope of this commit.

All infrastructure in order to enable this fallback for unsupported
libcs is already in place, except that libc-compat.h unconditionally
defines all _UAPI_DEF_* macros to 1 for all unsupported libcs so that
any previous definitions are ignored. In order to fix this, this commit
merely makes these definitions conditional.

This commit together with the musl libc commit

http://git.musl-libc.org/cgit/musl/commit/?id=04983f2272382af92eb8f8838964ff944fbb8258

fixes for example the following compiler errors when <linux/in6.h> is
included after musl's <netinet/in.h>:

./linux/in6.h:32:8: error: redefinition of 'struct in6_addr'
./linux/in6.h:49:8: error: redefinition of 'struct sockaddr_in6'
./linux/in6.h:59:8: error: redefinition of 'struct ipv6_mreq'

The comments referencing glibc are still correct, but this file is not
only used for glibc any more.

Signed-off-by: Felix Janda <felix.janda@posteo.de>
Reviewed-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/libc-compat.h | 55 +++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 282875cf8056..8254c937c9f4 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -168,46 +168,99 @@
 
 /* If we did not see any headers from any supported C libraries,
  * or we are being included in the kernel, then define everything
- * that we need. */
+ * that we need. Check for previous __UAPI_* definitions to give
+ * unsupported C libraries a way to opt out of any kernel definition. */
 #else /* !defined(__GLIBC__) */
 
 /* Definitions for if.h */
+#ifndef __UAPI_DEF_IF_IFCONF
 #define __UAPI_DEF_IF_IFCONF 1
+#endif
+#ifndef __UAPI_DEF_IF_IFMAP
 #define __UAPI_DEF_IF_IFMAP 1
+#endif
+#ifndef __UAPI_DEF_IF_IFNAMSIZ
 #define __UAPI_DEF_IF_IFNAMSIZ 1
+#endif
+#ifndef __UAPI_DEF_IF_IFREQ
 #define __UAPI_DEF_IF_IFREQ 1
+#endif
 /* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1
+#endif
 /* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
 #define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+#endif
 
 /* Definitions for in.h */
+#ifndef __UAPI_DEF_IN_ADDR
 #define __UAPI_DEF_IN_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN_IPPROTO
 #define __UAPI_DEF_IN_IPPROTO		1
+#endif
+#ifndef __UAPI_DEF_IN_PKTINFO
 #define __UAPI_DEF_IN_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP_MREQ
 #define __UAPI_DEF_IP_MREQ		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN
 #define __UAPI_DEF_SOCKADDR_IN		1
+#endif
+#ifndef __UAPI_DEF_IN_CLASS
 #define __UAPI_DEF_IN_CLASS		1
+#endif
 
 /* Definitions for in6.h */
+#ifndef __UAPI_DEF_IN6_ADDR
 #define __UAPI_DEF_IN6_ADDR		1
+#endif
+#ifndef __UAPI_DEF_IN6_ADDR_ALT
 #define __UAPI_DEF_IN6_ADDR_ALT		1
+#endif
+#ifndef __UAPI_DEF_SOCKADDR_IN6
 #define __UAPI_DEF_SOCKADDR_IN6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_MREQ
 #define __UAPI_DEF_IPV6_MREQ		1
+#endif
+#ifndef __UAPI_DEF_IPPROTO_V6
 #define __UAPI_DEF_IPPROTO_V6		1
+#endif
+#ifndef __UAPI_DEF_IPV6_OPTIONS
 #define __UAPI_DEF_IPV6_OPTIONS		1
+#endif
+#ifndef __UAPI_DEF_IN6_PKTINFO
 #define __UAPI_DEF_IN6_PKTINFO		1
+#endif
+#ifndef __UAPI_DEF_IP6_MTUINFO
 #define __UAPI_DEF_IP6_MTUINFO		1
+#endif
 
 /* Definitions for ipx.h */
+#ifndef __UAPI_DEF_SOCKADDR_IPX
 #define __UAPI_DEF_SOCKADDR_IPX			1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEFINITION
 #define __UAPI_DEF_IPX_ROUTE_DEFINITION		1
+#endif
+#ifndef __UAPI_DEF_IPX_INTERFACE_DEFINITION
 #define __UAPI_DEF_IPX_INTERFACE_DEFINITION	1
+#endif
+#ifndef __UAPI_DEF_IPX_CONFIG_DATA
 #define __UAPI_DEF_IPX_CONFIG_DATA		1
+#endif
+#ifndef __UAPI_DEF_IPX_ROUTE_DEF
 #define __UAPI_DEF_IPX_ROUTE_DEF		1
+#endif
 
 /* Definitions for xattr.h */
+#ifndef __UAPI_DEF_XATTR
 #define __UAPI_DEF_XATTR		1
+#endif
 
 #endif /* __GLIBC__ */
 

From c095508770aebf1b9218e77026e48345d719b17c Mon Sep 17 00:00:00 2001
From: Mohamed Ghannam <simo.ghannam@gmail.com>
Date: Tue, 2 Jan 2018 19:44:34 +0000
Subject: [PATCH 603/808] RDS: Heap OOB write in rds_message_alloc_sgs()

When args->nr_local is 0, nr_pages gets also 0 due some size
calculation via rds_rm_size(), which is later used to allocate
pages for DMA, this bug produces a heap Out-Of-Bound write access
to a specific memory region.

Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/rdma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index bc2f1e0977d6..94729d9da437 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -525,6 +525,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args)
 
 	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
 
+	if (args->nr_local == 0)
+		return -EINVAL;
+
 	/* figure out the number of pages in the vector */
 	for (i = 0; i < args->nr_local; i++) {
 		if (copy_from_user(&vec, &local_vec[i],

From 79d0895140e937ba111e6420b4cd83ee75efa788 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 2 Jan 2018 19:44:37 -0200
Subject: [PATCH 604/808] sctp: fix error path in sctp_stream_init

syzbot noticed a NULL pointer dereference panic in sctp_stream_free()
which was caused by an incomplete error handling in sctp_stream_init().
By not clearing stream->outcnt, it made a for() in sctp_stream_free()
think that it had elements to free, but not, leading to the panic.

As suggested by Xin Long, this patch also simplifies the error path by
moving it to the only if() that uses it.

See-also: https://www.spinics.net/lists/netdev/msg473756.html
See-also: https://www.spinics.net/lists/netdev/msg465024.html
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: f952be79cebd ("sctp: introduce struct sctp_stream_out_ext")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 76ea66be0bbe..524dfeb94c41 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -156,9 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 	sctp_stream_outq_migrate(stream, NULL, outcnt);
 	sched->sched_all(stream);
 
-	i = sctp_stream_alloc_out(stream, outcnt, gfp);
-	if (i)
-		return i;
+	ret = sctp_stream_alloc_out(stream, outcnt, gfp);
+	if (ret)
+		goto out;
 
 	stream->outcnt = outcnt;
 	for (i = 0; i < stream->outcnt; i++)
@@ -170,19 +170,17 @@ in:
 	if (!incnt)
 		goto out;
 
-	i = sctp_stream_alloc_in(stream, incnt, gfp);
-	if (i) {
-		ret = -ENOMEM;
-		goto free;
+	ret = sctp_stream_alloc_in(stream, incnt, gfp);
+	if (ret) {
+		sched->free(stream);
+		kfree(stream->out);
+		stream->out = NULL;
+		stream->outcnt = 0;
+		goto out;
 	}
 
 	stream->incnt = incnt;
-	goto out;
 
-free:
-	sched->free(stream);
-	kfree(stream->out);
-	stream->out = NULL;
 out:
 	return ret;
 }

From f1c8d3720f2e6c8c2b209120678236debd0360e5 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Tue, 2 Jan 2018 14:05:19 -0800
Subject: [PATCH 605/808] vxlan: trivial indenting fix.

Fix indentation of reserved_flags2 field in vxlanhdr_gpe.

Fixes: e1e5314de08b ("vxlan: implement GPE")
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 13223396dc64..f96391e84a8a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -146,7 +146,7 @@ struct vxlanhdr_gpe {
 		np_applied:1,
 		instance_applied:1,
 		version:2,
-reserved_flags2:2;
+		reserved_flags2:2;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	u8	reserved_flags2:2,
 		version:2,

From 64e711ca59ef9b7873d77ef06bc174aa01af9115 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 17 Nov 2017 15:51:47 -0800
Subject: [PATCH 606/808] i40e: Remove UDP support for big buffer

Since UDP based filters are not supported via big buffer cloud
filters, remove UDP support.  Also change a few return types to
indicate unsupported vs invalid configuration.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 321d8be80871..fffd4868defb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6038,8 +6038,8 @@ static int i40e_validate_and_set_switch_mode(struct i40e_vsi *vsi)
 	/* Set Bit 7 to be valid */
 	mode = I40E_AQ_SET_SWITCH_BIT7_VALID;
 
-	/* Set L4type to both TCP and UDP support */
-	mode |= I40E_AQ_SET_SWITCH_L4_TYPE_BOTH;
+	/* Set L4type for TCP support */
+	mode |= I40E_AQ_SET_SWITCH_L4_TYPE_TCP;
 
 	/* Set cloud filter mode */
 	mode |= I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL;
@@ -6969,18 +6969,18 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi,
 	     is_valid_ether_addr(filter->src_mac)) ||
 	    (is_multicast_ether_addr(filter->dst_mac) &&
 	     is_multicast_ether_addr(filter->src_mac)))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
-	/* Make sure port is specified, otherwise bail out, for channel
-	 * specific cloud filter needs 'L4 port' to be non-zero
+	/* Big buffer cloud filter needs 'L4 port' to be non-zero. Also, UDP
+	 * ports are not supported via big buffer now.
 	 */
-	if (!filter->dst_port)
-		return -EINVAL;
+	if (!filter->dst_port || filter->ip_proto == IPPROTO_UDP)
+		return -EOPNOTSUPP;
 
 	/* adding filter using src_port/src_ip is not supported at this stage */
 	if (filter->src_port || filter->src_ipv4 ||
 	    !ipv6_addr_any(&filter->ip.v6.src_ip6))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
 	/* copy element needed to add cloud filter from filter */
 	i40e_set_cld_element(filter, &cld_filter.element);
@@ -6991,7 +6991,7 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi,
 	    is_multicast_ether_addr(filter->src_mac)) {
 		/* MAC + IP : unsupported mode */
 		if (filter->dst_ipv4)
-			return -EINVAL;
+			return -EOPNOTSUPP;
 
 		/* since we validated that L4 port must be valid before
 		 * we get here, start with respective "flags" value

From e90f686b4358d7d7e5dbaa48b8e78c9a4e41826e Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Wed, 3 Jan 2018 10:39:29 +0800
Subject: [PATCH 607/808] net: fec: restore dev_id in the cases of probe error

The static variable dev_id always plus one before netdev registerred.
It should restore the dev_id value in the cases of probe error.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 8184d2fca9be..6a4fc2b35488 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,6 +3556,7 @@ failed_phy:
 	of_node_put(phy_node);
 failed_ioremap:
 	free_netdev(ndev);
+	dev_id--;
 
 	return ret;
 }

From 3f38c683033a9a0a2738e7067f449deefabfa3ef Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Wed, 3 Jan 2018 10:39:30 +0800
Subject: [PATCH 608/808] net: fec: defer probe if regulator is not ready

Defer probe if regulator is not ready. E.g. some regulator is fixed
regulator controlled by i2c expander gpio, the i2c device may be probed
after the driver, then it should handle the case of defer probe error.

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6a4fc2b35488..19f198e22e15 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3469,6 +3469,10 @@ fec_probe(struct platform_device *pdev)
 			goto failed_regulator;
 		}
 	} else {
+		if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
+			ret = -EPROBE_DEFER;
+			goto failed_regulator;
+		}
 		fep->reg_phy = NULL;
 	}
 

From 248de22e638f10bd5bfc7624a357f940f66ba137 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 8 Dec 2017 10:55:04 -0800
Subject: [PATCH 609/808] i40e/i40evf: Account for frags split over multiple
 descriptors in check linearize

The original code for __i40e_chk_linearize didn't take into account the
fact that if a fragment is 16K in size or larger it has to be split over 2
descriptors and the smaller of those 2 descriptors will be on the trailing
edge of the transmit. As a result we can get into situations where we didn't
catch requests that could result in a Tx hang.

This patch takes care of that by subtracting the length of all but the
trailing edge of the stale fragment before we test for sum. By doing this
we can guarantee that we have all cases covered, including the case of a
fragment that spans multiple descriptors. We don't need to worry about
checking the inner portions of this since 12K is the maximum aligned DMA
size and that is larger than any MSS will ever be since the MTU limit for
jumbos is something on the order of 9K.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 26 ++++++++++++++++---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 26 ++++++++++++++++---
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 4566d66ffc7c..5bc2748ac468 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3047,10 +3047,30 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
 	/* Walk through fragments adding latest fragment, testing it, and
 	 * then removing stale fragments from the sum.
 	 */
-	stale = &skb_shinfo(skb)->frags[0];
-	for (;;) {
+	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
+		int stale_size = skb_frag_size(stale);
+
 		sum += skb_frag_size(frag++);
 
+		/* The stale fragment may present us with a smaller
+		 * descriptor than the actual fragment size. To account
+		 * for that we need to remove all the data on the front and
+		 * figure out what the remainder would be in the last
+		 * descriptor associated with the fragment.
+		 */
+		if (stale_size > I40E_MAX_DATA_PER_TXD) {
+			int align_pad = -(stale->page_offset) &
+					(I40E_MAX_READ_REQ_SIZE - 1);
+
+			sum -= align_pad;
+			stale_size -= align_pad;
+
+			do {
+				sum -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+				stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+			} while (stale_size > I40E_MAX_DATA_PER_TXD);
+		}
+
 		/* if sum is negative we failed to make sufficient progress */
 		if (sum < 0)
 			return true;
@@ -3058,7 +3078,7 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
 		if (!nr_frags--)
 			break;
 
-		sum -= skb_frag_size(stale++);
+		sum -= stale_size;
 	}
 
 	return false;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 50864f99446d..1ba29bb85b67 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -2012,10 +2012,30 @@ bool __i40evf_chk_linearize(struct sk_buff *skb)
 	/* Walk through fragments adding latest fragment, testing it, and
 	 * then removing stale fragments from the sum.
 	 */
-	stale = &skb_shinfo(skb)->frags[0];
-	for (;;) {
+	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
+		int stale_size = skb_frag_size(stale);
+
 		sum += skb_frag_size(frag++);
 
+		/* The stale fragment may present us with a smaller
+		 * descriptor than the actual fragment size. To account
+		 * for that we need to remove all the data on the front and
+		 * figure out what the remainder would be in the last
+		 * descriptor associated with the fragment.
+		 */
+		if (stale_size > I40E_MAX_DATA_PER_TXD) {
+			int align_pad = -(stale->page_offset) &
+					(I40E_MAX_READ_REQ_SIZE - 1);
+
+			sum -= align_pad;
+			stale_size -= align_pad;
+
+			do {
+				sum -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+				stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+			} while (stale_size > I40E_MAX_DATA_PER_TXD);
+		}
+
 		/* if sum is negative we failed to make sufficient progress */
 		if (sum < 0)
 			return true;
@@ -2023,7 +2043,7 @@ bool __i40evf_chk_linearize(struct sk_buff *skb)
 		if (!nr_frags--)
 			break;
 
-		sum -= skb_frag_size(stale++);
+		sum -= stale_size;
 	}
 
 	return false;

From 458867b2ca0c987445c5d9adccd1642970e1ba07 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Wed, 20 Dec 2017 11:04:36 -0500
Subject: [PATCH 610/808] i40e: don't remove netdev->dev_addr when syncing uc
 list

In some circumstances, such as with bridging, it is possible that the
stack will add a devices own MAC address to its unicast address list.

If, later, the stack deletes this address, then the i40e driver will
receive a request to remove this address.

The driver stores its current MAC address as part of the MAC/VLAN hash
array, since it is convenient and matches exactly how the hardware
expects to be told which traffic to receive.

This causes a problem, since for more devices, the MAC address is stored
separately, and requests to delete a unicast address should not have the
ability to remove the filter for the MAC address.

Fix this by forcing a check on every address sync to ensure we do not
remove the device address.

There is a very narrow possibility of a race between .set_mac and
.set_rx_mode, if we don't change netdev->dev_addr before updating our
internal MAC list in .set_mac. This might be possible if .set_rx_mode is
going to remove MAC "XYZ" from the list, at the same time as .set_mac
changes our dev_addr to MAC "XYZ", we might possibly queue a delete,
then an add in .set_mac, then queue a delete in .set_rx_mode's
dev_uc_sync and then update netdev->dev_addr. We can avoid this by
moving the copy into dev_addr prior to the changes to the MAC filter
list.

A similar race on the other side does not cause problems, as if we're
changing our MAC form A to B, and we race with .set_rx_mode, it could
queue a delete from A, we'd update our address, and allow the delete.
This seems like a race, but in reality we're about to queue a delete of
A anyways, so it would not cause any issues.

A race in the initialization code is unlikely because the netdevice has
not yet been fully initialized and the stack should not be adding or
removing addresses yet.

Note that we don't (yet) need similar code for the VF driver because it
does not make use of __dev_uc_sync and __dev_mc_sync, but instead roles
its own method for handling updates to the MAC/VLAN list, which already
has code to protect against removal of the hardware address.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index fffd4868defb..9e4b78e447f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1573,11 +1573,18 @@ static int i40e_set_mac(struct net_device *netdev, void *p)
 	else
 		netdev_info(netdev, "set new mac address %pM\n", addr->sa_data);
 
+	/* Copy the address first, so that we avoid a possible race with
+	 * .set_rx_mode(). If we copy after changing the address in the filter
+	 * list, we might open ourselves to a narrow race window where
+	 * .set_rx_mode could delete our dev_addr filter and prevent traffic
+	 * from passing.
+	 */
+	ether_addr_copy(netdev->dev_addr, addr->sa_data);
+
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
 	i40e_del_mac_filter(vsi, netdev->dev_addr);
 	i40e_add_mac_filter(vsi, addr->sa_data);
 	spin_unlock_bh(&vsi->mac_filter_hash_lock);
-	ether_addr_copy(netdev->dev_addr, addr->sa_data);
 	if (vsi->type == I40E_VSI_MAIN) {
 		i40e_status ret;
 
@@ -1923,6 +1930,14 @@ static int i40e_addr_unsync(struct net_device *netdev, const u8 *addr)
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
 
+	/* Under some circumstances, we might receive a request to delete
+	 * our own device address from our uc list. Because we store the
+	 * device address in the VSI's MAC/VLAN filter list, we need to ignore
+	 * such requests and not delete our device address from this list.
+	 */
+	if (ether_addr_equal(addr, netdev->dev_addr))
+		return 0;
+
 	i40e_del_mac_filter(vsi, addr);
 
 	return 0;

From bc4244c6e33f96b48c4986ce4653df4673c6a08e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 22 Dec 2017 12:45:16 +0100
Subject: [PATCH 611/808] i40e: flower: Fix return value for unsupported
 offload

When filter configuration is not supported, drivers should return
-EOPNOTSUPP so the core can react correctly.

Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 9e4b78e447f8..42dcaefc4c19 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7371,7 +7371,7 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi,
 
 	if (tc < 0) {
 		dev_err(&vsi->back->pdev->dev, "Invalid traffic class\n");
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) ||

From 15962a18284552b5ec58982ff60a5e92e0c5c92b Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath <arjun@chelsio.com>
Date: Wed, 3 Jan 2018 11:44:07 +0530
Subject: [PATCH 612/808] cxgb4: Fix FW flash errors

commit 96ac18f14a5a ("cxgb4: Add support for new flash parts")
removed initialization of adapter->params.sf_fw_start causing issues
while flashing firmware to card. We no longer need sf_fw_start
in adapter->params as we already have macros defined for FW flash
addresses.

Fixes: 96ac18f14a5a ("cxgb4: Add support for new flash parts")
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  1 -
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 17 ++++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 6f9fa6e3c42a..d8424ed16c33 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -344,7 +344,6 @@ struct adapter_params {
 
 	unsigned int sf_size;             /* serial flash size in bytes */
 	unsigned int sf_nsec;             /* # of flash sectors */
-	unsigned int sf_fw_start;         /* start of FW image in flash */
 
 	unsigned int fw_vers;		  /* firmware version */
 	unsigned int bs_vers;		  /* bootstrap version */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index f63210f15579..375ef86a84da 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -2844,8 +2844,6 @@ enum {
 	SF_RD_DATA_FAST = 0xb,        /* read flash */
 	SF_RD_ID        = 0x9f,       /* read ID */
 	SF_ERASE_SECTOR = 0xd8,       /* erase sector */
-
-	FW_MAX_SIZE = 16 * SF_SEC_SIZE,
 };
 
 /**
@@ -3558,8 +3556,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	const __be32 *p = (const __be32 *)fw_data;
 	const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data;
 	unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec;
-	unsigned int fw_img_start = adap->params.sf_fw_start;
-	unsigned int fw_start_sec = fw_img_start / sf_sec_size;
+	unsigned int fw_start_sec = FLASH_FW_START_SEC;
+	unsigned int fw_size = FLASH_FW_MAX_SIZE;
+	unsigned int fw_start = FLASH_FW_START;
 
 	if (!size) {
 		dev_err(adap->pdev_dev, "FW image has no data\n");
@@ -3575,9 +3574,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 			"FW image size differs from size in FW header\n");
 		return -EINVAL;
 	}
-	if (size > FW_MAX_SIZE) {
+	if (size > fw_size) {
 		dev_err(adap->pdev_dev, "FW image too large, max is %u bytes\n",
-			FW_MAX_SIZE);
+			fw_size);
 		return -EFBIG;
 	}
 	if (!t4_fw_matches_chip(adap, hdr))
@@ -3604,11 +3603,11 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	 */
 	memcpy(first_page, fw_data, SF_PAGE_SIZE);
 	((struct fw_hdr *)first_page)->fw_ver = cpu_to_be32(0xffffffff);
-	ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page);
+	ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page);
 	if (ret)
 		goto out;
 
-	addr = fw_img_start;
+	addr = fw_start;
 	for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) {
 		addr += SF_PAGE_SIZE;
 		fw_data += SF_PAGE_SIZE;
@@ -3618,7 +3617,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
 	}
 
 	ret = t4_write_flash(adap,
-			     fw_img_start + offsetof(struct fw_hdr, fw_ver),
+			     fw_start + offsetof(struct fw_hdr, fw_ver),
 			     sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver);
 out:
 	if (ret)

From 7853b49ce8e0ef6364d24512b287463841d71bd3 Mon Sep 17 00:00:00 2001
From: Netanel Belgazal <netanel@amazon.com>
Date: Wed, 3 Jan 2018 06:17:29 +0000
Subject: [PATCH 613/808] net: ena: unmask MSI-X only after device
 initialization is completed

Under certain conditions MSI-X interrupt might arrive right after it
was unmasked in ena_up(). There is a chance it would be processed by
the driver before device ENA_FLAG_DEV_UP flag is set. In such a case
the interrupt is ignored.
ENA device operates in auto-masked mode, therefore ignoring
interrupt leaves it masked for good.
Moving unmask of interrupt to be the last step in ena_up().

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 26 ++++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 97c5a89a9cf7..6fb28fd43eb3 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1565,7 +1565,7 @@ static int ena_rss_configure(struct ena_adapter *adapter)
 
 static int ena_up_complete(struct ena_adapter *adapter)
 {
-	int rc, i;
+	int rc;
 
 	rc = ena_rss_configure(adapter);
 	if (rc)
@@ -1584,17 +1584,6 @@ static int ena_up_complete(struct ena_adapter *adapter)
 
 	ena_napi_enable_all(adapter);
 
-	/* Enable completion queues interrupt */
-	for (i = 0; i < adapter->num_queues; i++)
-		ena_unmask_interrupt(&adapter->tx_ring[i],
-				     &adapter->rx_ring[i]);
-
-	/* schedule napi in case we had pending packets
-	 * from the last time we disable napi
-	 */
-	for (i = 0; i < adapter->num_queues; i++)
-		napi_schedule(&adapter->ena_napi[i].napi);
-
 	return 0;
 }
 
@@ -1731,7 +1720,7 @@ create_err:
 
 static int ena_up(struct ena_adapter *adapter)
 {
-	int rc;
+	int rc, i;
 
 	netdev_dbg(adapter->netdev, "%s\n", __func__);
 
@@ -1774,6 +1763,17 @@ static int ena_up(struct ena_adapter *adapter)
 
 	set_bit(ENA_FLAG_DEV_UP, &adapter->flags);
 
+	/* Enable completion queues interrupt */
+	for (i = 0; i < adapter->num_queues; i++)
+		ena_unmask_interrupt(&adapter->tx_ring[i],
+				     &adapter->rx_ring[i]);
+
+	/* schedule napi in case we had pending packets
+	 * from the last time we disable napi
+	 */
+	for (i = 0; i < adapter->num_queues; i++)
+		napi_schedule(&adapter->ena_napi[i].napi);
+
 	return rc;
 
 err_up:

From ee4552aaf3fef5345199b8a82e40be7245b289fb Mon Sep 17 00:00:00 2001
From: Netanel Belgazal <netanel@amazon.com>
Date: Wed, 3 Jan 2018 06:17:30 +0000
Subject: [PATCH 614/808] net: ena: fix error handling in ena_down() sequence

ENA admin command queue errors are not handled as part of ena_down().
As a result, in case of error admin queue transitions to non-running
state and aborts all subsequent commands including those coming from
ena_up(). Reset scheduled by the driver from the timer service
context would not proceed due to sharing rtnl with ena_up()/ena_down()

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 6fb28fd43eb3..fbe21a817bd8 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -75,6 +75,9 @@ static struct workqueue_struct *ena_wq;
 MODULE_DEVICE_TABLE(pci, ena_pci_tbl);
 
 static int ena_rss_init_default(struct ena_adapter *adapter);
+static void check_for_admin_com_state(struct ena_adapter *adapter);
+static void ena_destroy_device(struct ena_adapter *adapter);
+static int ena_restore_device(struct ena_adapter *adapter);
 
 static void ena_tx_timeout(struct net_device *dev)
 {
@@ -1884,6 +1887,17 @@ static int ena_close(struct net_device *netdev)
 	if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
 		ena_down(adapter);
 
+	/* Check for device status and issue reset if needed*/
+	check_for_admin_com_state(adapter);
+	if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
+		netif_err(adapter, ifdown, adapter->netdev,
+			  "Destroy failure, restarting device\n");
+		ena_dump_stats_to_dmesg(adapter);
+		/* rtnl lock already obtained in dev_ioctl() layer */
+		ena_destroy_device(adapter);
+		ena_restore_device(adapter);
+	}
+
 	return 0;
 }
 
@@ -2544,11 +2558,12 @@ static void ena_destroy_device(struct ena_adapter *adapter)
 
 	ena_com_set_admin_running_state(ena_dev, false);
 
-	ena_close(netdev);
+	if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
+		ena_down(adapter);
 
 	/* Before releasing the ENA resources, a device reset is required.
 	 * (to prevent the device from accessing them).
-	 * In case the reset flag is set and the device is up, ena_close
+	 * In case the reset flag is set and the device is up, ena_down()
 	 * already perform the reset, so it can be skipped.
 	 */
 	if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up))

From e816c201aed5232171f8eb80b5d46ae6516683b9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 2 Jan 2018 15:21:33 -0800
Subject: [PATCH 615/808] exec: Weaken dumpability for secureexec

This is a logical revert of commit e37fdb785a5f ("exec: Use secureexec
for setting dumpability")

This weakens dumpability back to checking only for uid/gid changes in
current (which is useless), but userspace depends on dumpability not
being tied to secureexec.

  https://bugzilla.redhat.com/show_bug.cgi?id=1528633

Reported-by: Tom Horsley <horsley1953@gmail.com>
Fixes: e37fdb785a5f ("exec: Use secureexec for setting dumpability")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 5688b5e1b937..7eb8d21bcab9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1349,9 +1349,14 @@ void setup_new_exec(struct linux_binprm * bprm)
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
-	/* Figure out dumpability. */
+	/*
+	 * Figure out dumpability. Note that this checking only of current
+	 * is wrong, but userspace depends on it. This should be testing
+	 * bprm->secureexec instead.
+	 */
 	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
-	    bprm->secureexec)
+	    !(uid_eq(current_euid(), current_uid()) &&
+	      gid_eq(current_egid(), current_gid())))
 		set_dumpable(current->mm, suid_dumpable);
 	else
 		set_dumpable(current->mm, SUID_DUMP_USER);

From ee4aa8df70fa6d76bd776c025dc0d8d746c18317 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 3 Jan 2018 13:09:23 -0500
Subject: [PATCH 616/808] 3c59x: fix missing dma_mapping_error check and bad
 ring refill logic

A few spots in 3c59x missed calls to dma_mapping_error checks, casuing
WARN_ONS to trigger.  Clean those up.  While we're at it, refactor the
refill code a bit so that if skb allocation or dma mapping fails, we
recycle the existing buffer.  This prevents holes in the rx ring, and
makes for much simpler logic

Note: This is compile only tested.  Ted, if you could run this and
confirm that it continues to work properly, I would appreciate it, as I
currently don't have access to this hardware

Signed-off-by: Neil Horman <nhorman@redhat.com>
CC: Steffen Klassert <klassert@mathematik.tu-chemnitz.de>
CC: "David S. Miller" <davem@davemloft.net>
Reported-by: tedheadster@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 90 +++++++++++++------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index f4e13a7014bd..36c8950dbd2d 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -602,7 +602,7 @@ struct vortex_private {
 	struct sk_buff* rx_skbuff[RX_RING_SIZE];
 	struct sk_buff* tx_skbuff[TX_RING_SIZE];
 	unsigned int cur_rx, cur_tx;		/* The next free ring entry */
-	unsigned int dirty_rx, dirty_tx;	/* The ring entries to be free()ed. */
+	unsigned int dirty_tx;	/* The ring entries to be free()ed. */
 	struct vortex_extra_stats xstats;	/* NIC-specific extra stats */
 	struct sk_buff *tx_skb;				/* Packet being eaten by bus master ctrl.  */
 	dma_addr_t tx_skb_dma;				/* Allocated DMA address for bus master ctrl DMA.   */
@@ -618,7 +618,6 @@ struct vortex_private {
 
 	/* The remainder are related to chip state, mostly media selection. */
 	struct timer_list timer;			/* Media selection timer. */
-	struct timer_list rx_oom_timer;		/* Rx skb allocation retry timer */
 	int options;						/* User-settable misc. driver options. */
 	unsigned int media_override:4, 		/* Passed-in media type. */
 		default_media:4,				/* Read from the EEPROM/Wn3_Config. */
@@ -760,7 +759,6 @@ static void mdio_sync(struct vortex_private *vp, int bits);
 static int mdio_read(struct net_device *dev, int phy_id, int location);
 static void mdio_write(struct net_device *vp, int phy_id, int location, int value);
 static void vortex_timer(struct timer_list *t);
-static void rx_oom_timer(struct timer_list *t);
 static netdev_tx_t vortex_start_xmit(struct sk_buff *skb,
 				     struct net_device *dev);
 static netdev_tx_t boomerang_start_xmit(struct sk_buff *skb,
@@ -1601,7 +1599,6 @@ vortex_up(struct net_device *dev)
 
 	timer_setup(&vp->timer, vortex_timer, 0);
 	mod_timer(&vp->timer, RUN_AT(media_tbl[dev->if_port].wait));
-	timer_setup(&vp->rx_oom_timer, rx_oom_timer, 0);
 
 	if (vortex_debug > 1)
 		pr_debug("%s: Initial media type %s.\n",
@@ -1676,7 +1673,7 @@ vortex_up(struct net_device *dev)
 	window_write16(vp, 0x0040, 4, Wn4_NetDiag);
 
 	if (vp->full_bus_master_rx) { /* Boomerang bus master. */
-		vp->cur_rx = vp->dirty_rx = 0;
+		vp->cur_rx = 0;
 		/* Initialize the RxEarly register as recommended. */
 		iowrite16(SetRxThreshold + (1536>>2), ioaddr + EL3_CMD);
 		iowrite32(0x0020, ioaddr + PktStatus);
@@ -1729,6 +1726,7 @@ vortex_open(struct net_device *dev)
 	struct vortex_private *vp = netdev_priv(dev);
 	int i;
 	int retval;
+	dma_addr_t dma;
 
 	/* Use the now-standard shared IRQ implementation. */
 	if ((retval = request_irq(dev->irq, vp->full_bus_master_rx ?
@@ -1753,7 +1751,11 @@ vortex_open(struct net_device *dev)
 				break;			/* Bad news!  */
 
 			skb_reserve(skb, NET_IP_ALIGN);	/* Align IP on 16 byte boundaries */
-			vp->rx_ring[i].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE));
+			dma = pci_map_single(VORTEX_PCI(vp), skb->data,
+					     PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma))
+				break;
+			vp->rx_ring[i].addr = cpu_to_le32(dma);
 		}
 		if (i != RX_RING_SIZE) {
 			pr_emerg("%s: no memory for rx ring\n", dev->name);
@@ -2067,6 +2069,12 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		int len = (skb->len + 3) & ~3;
 		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
 						PCI_DMA_TODEVICE);
+		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, vp->tx_skb_dma)) {
+			dev_kfree_skb_any(skb);
+			dev->stats.tx_dropped++;
+			return NETDEV_TX_OK;
+		}
+
 		spin_lock_irq(&vp->window_lock);
 		window_set(vp, 7);
 		iowrite32(vp->tx_skb_dma, ioaddr + Wn7_MasterAddr);
@@ -2593,7 +2601,7 @@ boomerang_rx(struct net_device *dev)
 	int entry = vp->cur_rx % RX_RING_SIZE;
 	void __iomem *ioaddr = vp->ioaddr;
 	int rx_status;
-	int rx_work_limit = vp->dirty_rx + RX_RING_SIZE - vp->cur_rx;
+	int rx_work_limit = RX_RING_SIZE;
 
 	if (vortex_debug > 5)
 		pr_debug("boomerang_rx(): status %4.4x\n", ioread16(ioaddr+EL3_STATUS));
@@ -2614,7 +2622,8 @@ boomerang_rx(struct net_device *dev)
 		} else {
 			/* The packet length: up to 4.5K!. */
 			int pkt_len = rx_status & 0x1fff;
-			struct sk_buff *skb;
+			struct sk_buff *skb, *newskb;
+			dma_addr_t newdma;
 			dma_addr_t dma = le32_to_cpu(vp->rx_ring[entry].addr);
 
 			if (vortex_debug > 4)
@@ -2633,9 +2642,27 @@ boomerang_rx(struct net_device *dev)
 				pci_dma_sync_single_for_device(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
 				vp->rx_copy++;
 			} else {
+				/* Pre-allocate the replacement skb.  If it or its
+				 * mapping fails then recycle the buffer thats already
+				 * in place
+				 */
+				newskb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ);
+				if (!newskb) {
+					dev->stats.rx_dropped++;
+					goto clear_complete;
+				}
+				newdma = pci_map_single(VORTEX_PCI(vp), newskb->data,
+							PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				if (dma_mapping_error(&VORTEX_PCI(vp)->dev, newdma)) {
+					dev->stats.rx_dropped++;
+					consume_skb(newskb);
+					goto clear_complete;
+				}
+
 				/* Pass up the skbuff already on the Rx ring. */
 				skb = vp->rx_skbuff[entry];
-				vp->rx_skbuff[entry] = NULL;
+				vp->rx_skbuff[entry] = newskb;
+				vp->rx_ring[entry].addr = cpu_to_le32(newdma);
 				skb_put(skb, pkt_len);
 				pci_unmap_single(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
 				vp->rx_nocopy++;
@@ -2653,55 +2680,15 @@ boomerang_rx(struct net_device *dev)
 			netif_rx(skb);
 			dev->stats.rx_packets++;
 		}
-		entry = (++vp->cur_rx) % RX_RING_SIZE;
-	}
-	/* Refill the Rx ring buffers. */
-	for (; vp->cur_rx - vp->dirty_rx > 0; vp->dirty_rx++) {
-		struct sk_buff *skb;
-		entry = vp->dirty_rx % RX_RING_SIZE;
-		if (vp->rx_skbuff[entry] == NULL) {
-			skb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ);
-			if (skb == NULL) {
-				static unsigned long last_jif;
-				if (time_after(jiffies, last_jif + 10 * HZ)) {
-					pr_warn("%s: memory shortage\n",
-						dev->name);
-					last_jif = jiffies;
-				}
-				if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE)
-					mod_timer(&vp->rx_oom_timer, RUN_AT(HZ * 1));
-				break;			/* Bad news!  */
-			}
 
-			vp->rx_ring[entry].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE));
-			vp->rx_skbuff[entry] = skb;
-		}
+clear_complete:
 		vp->rx_ring[entry].status = 0;	/* Clear complete bit. */
 		iowrite16(UpUnstall, ioaddr + EL3_CMD);
+		entry = (++vp->cur_rx) % RX_RING_SIZE;
 	}
 	return 0;
 }
 
-/*
- * If we've hit a total OOM refilling the Rx ring we poll once a second
- * for some memory.  Otherwise there is no way to restart the rx process.
- */
-static void
-rx_oom_timer(struct timer_list *t)
-{
-	struct vortex_private *vp = from_timer(vp, t, rx_oom_timer);
-	struct net_device *dev = vp->mii.dev;
-
-	spin_lock_irq(&vp->lock);
-	if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE)	/* This test is redundant, but makes me feel good */
-		boomerang_rx(dev);
-	if (vortex_debug > 1) {
-		pr_debug("%s: rx_oom_timer %s\n", dev->name,
-			((vp->cur_rx - vp->dirty_rx) != RX_RING_SIZE) ? "succeeded" : "retrying");
-	}
-	spin_unlock_irq(&vp->lock);
-}
-
 static void
 vortex_down(struct net_device *dev, int final_down)
 {
@@ -2711,7 +2698,6 @@ vortex_down(struct net_device *dev, int final_down)
 	netdev_reset_queue(dev);
 	netif_stop_queue(dev);
 
-	del_timer_sync(&vp->rx_oom_timer);
 	del_timer_sync(&vp->timer);
 
 	/* Turn off statistics ASAP.  We update dev->stats below. */

From d7732ba55c4b6a2da339bb12589c515830cfac2c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Jan 2018 19:52:04 +0100
Subject: [PATCH 617/808] x86/pti: Switch to kernel CR3 at early in
 entry_SYSCALL_compat()

The preparation for PTI which added CR3 switching to the entry code
misplaced the CR3 switch in entry_SYSCALL_compat().

With PTI enabled the entry code tries to access a per cpu variable after
switching to kernel GS. This fails because that variable is not mapped to
user space. This results in a double fault and in the worst case a kernel
crash.

Move the switch ahead of the access and clobber RSP which has been saved
already.

Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching")
Reported-by: Lars Wendler <wendler.lars@web.de>
Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>,
Cc: Dave Hansen <dave.hansen@linux.intel.com>,
Cc: Peter Zijlstra <peterz@infradead.org>,
Cc: Greg KH <gregkh@linuxfoundation.org>, ,
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>,
Cc: Juergen Gross <jgross@suse.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos
---
 arch/x86/entry/entry_64_compat.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 40f17009ec20..98d5358e4041 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
 	swapgs
 
-	/* Stash user ESP and switch to the kernel stack. */
+	/* Stash user ESP */
 	movl	%esp, %r8d
+
+	/* Use %rsp as scratch reg. User ESP is stashed in r8 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+	/* Switch to the kernel stack */
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
@@ -219,12 +224,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
-	/*
-	 * We just saved %rdi so it is safe to clobber.  It is not
-	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
-	 */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.

From 2fd9c41aea47f4ad071accf94b94f94f2c4d31eb Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Wed, 3 Jan 2018 12:39:52 -0800
Subject: [PATCH 618/808] x86/process: Define cpu_tss_rw in same section as
 declaration

cpu_tss_rw is declared with DECLARE_PER_CPU_PAGE_ALIGNED
but then defined with DEFINE_PER_CPU_SHARED_ALIGNED
leading to section mismatch warnings.

Use DEFINE_PER_CPU_PAGE_ALIGNED consistently. This is necessary because
it's mapped to the cpu entry area and must be page aligned.

[ tglx: Massaged changelog a bit ]

Fixes: 1a935bc3d4ea ("x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: thomas.lendacky@amd.com
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: tklauser@distanz.ch
Cc: minipli@googlemail.com
Cc: me@kylehuey.com
Cc: namit@vmware.com
Cc: luto@kernel.org
Cc: jpoimboe@redhat.com
Cc: tj@kernel.org
Cc: cl@linux.com
Cc: bp@suse.de
Cc: thgarnie@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180103203954.183360-1-ndesaulniers@google.com
---
 arch/x86/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 517415978409..3cb2486c47e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 	.x86_tss = {
 		/*
 		 * .sp0 is only used when entering ring 0 from a lower

From ce9caf2f79a5aa170a4b6456a03db639eed9c988 Mon Sep 17 00:00:00 2001
From: Stefan Schake <stschake@gmail.com>
Date: Fri, 29 Dec 2017 17:05:43 +0100
Subject: [PATCH 619/808] drm/vc4: Move IRQ enable to PM path

We were calling enable_irq on bind, where it was already enabled previously
by the IRQ helper. Additionally, dev->irq is not set correctly until after
postinstall and so was always zero here, triggering a warning in 4.15.
Fix both by moving the enable to the power management resume path, where we
know there was a previous disable invocation during suspend.

Fixes: 253696ccd613 ("drm/vc4: Account for interrupts in flight")
Signed-off-by: Stefan Schake <stschake@gmail.com>
Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/1514563543-32511-1-git-send-email-stschake@gmail.com
Tested-by: Stefan Wahren <stefan.wahren@i2se.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_irq.c | 3 ---
 drivers/gpu/drm/vc4/vc4_v3d.c | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 26eddbb62893..3dd62d75f531 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -209,9 +209,6 @@ vc4_irq_postinstall(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
-	/* Undo the effects of a previous vc4_irq_uninstall. */
-	enable_irq(dev->irq);
-
 	/* Enable both the render done and out of memory interrupts. */
 	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
 
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 622cd43840b8..493f392b3a0a 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -327,6 +327,9 @@ static int vc4_v3d_runtime_resume(struct device *dev)
 		return ret;
 
 	vc4_v3d_init_hw(vc4->dev);
+
+	/* We disabled the IRQ as part of vc4_irq_uninstall in suspend. */
+	enable_irq(vc4->dev->irq);
 	vc4_irq_postinstall(vc4->dev);
 
 	return 0;

From bec40c26041de61162f7be9d2ce548c756ce0f65 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Wed, 3 Jan 2018 13:39:15 -0800
Subject: [PATCH 620/808] IB/srpt: Disable RDMA access by the initiator

With the SRP protocol all RDMA operations are initiated by the target.
Since no RDMA operations are initiated by the initiator, do not grant
the initiator permission to submit RDMA reads or writes to the target.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 8a1bd354b1cc..7c4249038004 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1013,8 +1013,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp)
 		return -ENOMEM;
 
 	attr->qp_state = IB_QPS_INIT;
-	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
-	    IB_ACCESS_REMOTE_WRITE;
+	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE;
 	attr->port_num = ch->sport->port;
 	attr->pkey_index = 0;
 

From a1ffa4670cb97ae3a4b3e8535d88be5f643f7c3b Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Wed, 3 Jan 2018 13:39:16 -0800
Subject: [PATCH 621/808] IB/srpt: Fix ACL lookup during login

Make sure that the initiator port GUID is stored in ch->ini_guid.
Note: when initiating a connection sgid and dgid members in struct
sa_path_rec represent the source and destination GIDs. When accepting
a connection however sgid represents the destination GID and dgid the
source GID.

Fixes: commit 2bce1a6d2209 ("IB/srpt: Accept GUIDs as port names")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 7c4249038004..bfa576aa9f03 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -2077,7 +2077,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
 		goto destroy_ib;
 	}
 
-	guid = (__be16 *)&param->primary_path->sgid.global.interface_id;
+	guid = (__be16 *)&param->primary_path->dgid.global.interface_id;
 	snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x",
 		 be16_to_cpu(guid[0]), be16_to_cpu(guid[1]),
 		 be16_to_cpu(guid[2]), be16_to_cpu(guid[3]));

From 121d760d0788f95619049c63449d977065cab69d Mon Sep 17 00:00:00 2001
From: Zhi Wang <zhi.a.wang@intel.com>
Date: Fri, 29 Dec 2017 02:50:08 +0800
Subject: [PATCH 622/808] drm/i915/gvt: Clear the shadow page table entry after
 post-sync

A shadow page table entry needs to be cleared after being set as
post-sync. This patch fixes the recent error reported in Win7-32 test.

Fixes: 2707e4446688 ("drm/i915/gvt: vGPU graphics memory virtualization")
Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/gtt.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 8e331142badb..64d67ff9bf08 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1359,12 +1359,15 @@ static int ppgtt_handle_guest_write_page_table_bytes(void *gp,
 			return ret;
 	} else {
 		if (!test_bit(index, spt->post_shadow_bitmap)) {
+			int type = spt->shadow_page.type;
+
 			ppgtt_get_shadow_entry(spt, &se, index);
 			ret = ppgtt_handle_guest_entry_removal(gpt, &se, index);
 			if (ret)
 				return ret;
+			ops->set_pfn(&se, vgpu->gtt.scratch_pt[type].page_mfn);
+			ppgtt_set_shadow_entry(spt, &se, index);
 		}
-
 		ppgtt_set_post_shadow(spt, index);
 	}
 

From 2bd7b4aacdb6efa5ccd4749c365c171b884791d2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 3 Jan 2018 23:49:18 +0100
Subject: [PATCH 623/808] mmc: s3mci: mark debug_regs[] as static

The global array clashes with a newly added symbol of the same name:

drivers/staging/ccree/cc_debugfs.o:(.data+0x0): multiple definition of `debug_regs'
drivers/mmc/host/s3cmci.o:(.data+0x70): first defined here

We should fix both, this one addresses the s3cmci driver by removing
the symbol from the global namespace. While at it, this separates
the declaration from the type definition and makes the variable const.

Fixes: 9bdd203b4dc8 ("s3cmci: add debugfs support for examining driver and hardware state")
Fixes: b3ec9a6736f2 ("staging: ccree: staging: ccree: replace sysfs by debugfs interface")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/s3cmci.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c
index f7f157a62a4a..555c7f133eb8 100644
--- a/drivers/mmc/host/s3cmci.c
+++ b/drivers/mmc/host/s3cmci.c
@@ -1424,7 +1424,9 @@ static const struct file_operations s3cmci_fops_state = {
 struct s3cmci_reg {
 	unsigned short	addr;
 	unsigned char	*name;
-} debug_regs[] = {
+};
+
+static const struct s3cmci_reg debug_regs[] = {
 	DBG_REG(CON),
 	DBG_REG(PRE),
 	DBG_REG(CMDARG),
@@ -1446,7 +1448,7 @@ struct s3cmci_reg {
 static int s3cmci_regs_show(struct seq_file *seq, void *v)
 {
 	struct s3cmci_host *host = seq->private;
-	struct s3cmci_reg *rptr = debug_regs;
+	const struct s3cmci_reg *rptr = debug_regs;
 
 	for (; rptr->name; rptr++)
 		seq_printf(seq, "SDI%s\t=0x%08x\n", rptr->name,

From 30414f3010aff95ffdb6bed7b9dce62cde94fdc7 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Tue, 2 Jan 2018 12:18:37 -0800
Subject: [PATCH 624/808] drm/i915: Apply Display WA #1183 on skl, kbl, and cfl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Display WA #1183 was recently added to workaround
"Failures when enabling DPLL0 with eDP link rate 2.16
or 4.32 GHz and CD clock frequency 308.57 or 617.14 MHz
(CDCLK_CTL CD Frequency Select 10b or 11b) used in this
 enabling or in previous enabling."

This workaround was designed to minimize the impact only
to save the bad case with that link rates. But HW engineers
indicated that it should be safe to apply broadly, although
they were expecting the DPLL0 link rate to be unchanged on
runtime.

We need to cover 2 cases: when we are in fact enabling DPLL0
and when we are just changing the frequency with small
differences.

This is based on previous patch by Rodrigo Vivi with suggestions
from Ville Syrjälä.

Cc: Arthur J Runyan <arthur.j.runyan@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171204232210.4958-1-lucas.demarchi@intel.com
(cherry picked from commit 53421c2fe99ce16838639ad89d772d914a119a49)
[ Lucas: Backport to 4.15 adding back variable that has been removed on
  commits not meant to be backported ]
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180102201837.6812-1-lucas.demarchi@intel.com
---
 drivers/gpu/drm/i915/i915_reg.h         |  2 ++
 drivers/gpu/drm/i915/intel_cdclk.c      | 35 ++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_runtime_pm.c | 10 +++++++
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3866c49bc390..333f40bc03bb 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -6977,6 +6977,7 @@ enum {
 #define  RESET_PCH_HANDSHAKE_ENABLE	(1<<4)
 
 #define GEN8_CHICKEN_DCPR_1		_MMIO(0x46430)
+#define   SKL_SELECT_ALTERNATE_DC_EXIT	(1<<30)
 #define   MASK_WAKEMEM			(1<<13)
 
 #define SKL_DFSM			_MMIO(0x51000)
@@ -8522,6 +8523,7 @@ enum skl_power_gate {
 #define  BXT_CDCLK_CD2X_DIV_SEL_2	(2<<22)
 #define  BXT_CDCLK_CD2X_DIV_SEL_4	(3<<22)
 #define  BXT_CDCLK_CD2X_PIPE(pipe)	((pipe)<<20)
+#define  CDCLK_DIVMUX_CD_OVERRIDE	(1<<19)
 #define  BXT_CDCLK_CD2X_PIPE_NONE	BXT_CDCLK_CD2X_PIPE(3)
 #define  BXT_CDCLK_SSA_PRECHARGE_ENABLE	(1<<16)
 #define  CDCLK_FREQ_DECIMAL_MASK	(0x7ff)
diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index b2a6d62b71c0..60cf4e58389a 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -860,16 +860,10 @@ static void skl_set_preferred_cdclk_vco(struct drm_i915_private *dev_priv,
 
 static void skl_dpll0_enable(struct drm_i915_private *dev_priv, int vco)
 {
-	int min_cdclk = skl_calc_cdclk(0, vco);
 	u32 val;
 
 	WARN_ON(vco != 8100000 && vco != 8640000);
 
-	/* select the minimum CDCLK before enabling DPLL 0 */
-	val = CDCLK_FREQ_337_308 | skl_cdclk_decimal(min_cdclk);
-	I915_WRITE(CDCLK_CTL, val);
-	POSTING_READ(CDCLK_CTL);
-
 	/*
 	 * We always enable DPLL0 with the lowest link rate possible, but still
 	 * taking into account the VCO required to operate the eDP panel at the
@@ -923,7 +917,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 {
 	int cdclk = cdclk_state->cdclk;
 	int vco = cdclk_state->vco;
-	u32 freq_select, pcu_ack;
+	u32 freq_select, pcu_ack, cdclk_ctl;
 	int ret;
 
 	WARN_ON((cdclk == 24000) != (vco == 0));
@@ -940,7 +934,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 		return;
 	}
 
-	/* set CDCLK_CTL */
+	/* Choose frequency for this cdclk */
 	switch (cdclk) {
 	case 450000:
 	case 432000:
@@ -968,10 +962,33 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
 	    dev_priv->cdclk.hw.vco != vco)
 		skl_dpll0_disable(dev_priv);
 
+	cdclk_ctl = I915_READ(CDCLK_CTL);
+
+	if (dev_priv->cdclk.hw.vco != vco) {
+		/* Wa Display #1183: skl,kbl,cfl */
+		cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+		cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+		I915_WRITE(CDCLK_CTL, cdclk_ctl);
+	}
+
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl |= CDCLK_DIVMUX_CD_OVERRIDE;
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+	POSTING_READ(CDCLK_CTL);
+
 	if (dev_priv->cdclk.hw.vco != vco)
 		skl_dpll0_enable(dev_priv, vco);
 
-	I915_WRITE(CDCLK_CTL, freq_select | skl_cdclk_decimal(cdclk));
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+	cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+	/* Wa Display #1183: skl,kbl,cfl */
+	cdclk_ctl &= ~CDCLK_DIVMUX_CD_OVERRIDE;
+	I915_WRITE(CDCLK_CTL, cdclk_ctl);
 	POSTING_READ(CDCLK_CTL);
 
 	/* inform PCU of the change */
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 9bf46ab211cb..7e115f3927f6 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -598,6 +598,11 @@ void gen9_enable_dc5(struct drm_i915_private *dev_priv)
 
 	DRM_DEBUG_KMS("Enabling DC5\n");
 
+	/* Wa Display #1183: skl,kbl,cfl */
+	if (IS_GEN9_BC(dev_priv))
+		I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+			   SKL_SELECT_ALTERNATE_DC_EXIT);
+
 	gen9_set_dc_state(dev_priv, DC_STATE_EN_UPTO_DC5);
 }
 
@@ -625,6 +630,11 @@ void skl_disable_dc6(struct drm_i915_private *dev_priv)
 {
 	DRM_DEBUG_KMS("Disabling DC6\n");
 
+	/* Wa Display #1183: skl,kbl,cfl */
+	if (IS_GEN9_BC(dev_priv))
+		I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+			   SKL_SELECT_ALTERNATE_DC_EXIT);
+
 	gen9_set_dc_state(dev_priv, DC_STATE_DISABLE);
 }
 

From 3ea15452ee85754f70f3b9fa1f23165ef2e77ba7 Mon Sep 17 00:00:00 2001
From: Hao Chen <flank3rsky@gmail.com>
Date: Wed, 3 Jan 2018 11:00:31 +0800
Subject: [PATCH 625/808] nl80211: Check for the required netlink attribute
 presence

nl80211_nan_add_func() does not check if the required attribute
NL80211_NAN_FUNC_FOLLOW_UP_DEST is present when processing
NL80211_CMD_ADD_NAN_FUNCTION request. This request can be issued
by users with CAP_NET_ADMIN privilege and may result in NULL dereference
and a system crash. Add a check for the required attribute presence.

Signed-off-by: Hao Chen <flank3rsky@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 213d0c498c97..2b3dbcd40e46 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11361,7 +11361,8 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
 		break;
 	case NL80211_NAN_FUNC_FOLLOW_UP:
 		if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
-		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) {
+		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] ||
+		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) {
 			err = -EINVAL;
 			goto out;
 		}

From 736a80bbfda709fb3631f5f62056f250a38e5804 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Jan 2018 15:51:53 +0100
Subject: [PATCH 626/808] mac80211: mesh: drop frames appearing to be from us

If there are multiple mesh stations with the same MAC address,
they will both get confused and start throwing warnings.

Obviously in this case nothing can actually work anyway, so just
drop frames that look like they're from ourselves early on.

Reported-by: Gui Iribarren <gui@altermundi.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 70e9d2ca8bbe..4daafb07602f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3632,6 +3632,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
 		}
 		return true;
 	case NL80211_IFTYPE_MESH_POINT:
+		if (ether_addr_equal(sdata->vif.addr, hdr->addr2))
+			return false;
 		if (multicast)
 			return true;
 		return ether_addr_equal(sdata->vif.addr, hdr->addr1);

From d14ac576d10f865970bb1324d337e5e24d79aaf4 Mon Sep 17 00:00:00 2001
From: Christian Holl <cyborgx1@gmail.com>
Date: Wed, 3 Jan 2018 19:53:02 +0100
Subject: [PATCH 627/808] USB: serial: cp210x: add new device ID ELV ALC 8xxx

This adds the ELV ALC 8xxx Battery Charging device
to the list of USB IDs of drivers/usb/serial/cp210x.c

Signed-off-by: Christian Holl <cyborgx1@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/cp210x.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 38814225a816..06d502b3e913 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -175,6 +175,7 @@ static const struct usb_device_id id_table[] = {
 	{ USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */
 	{ USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
 	{ USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */
+	{ USB_DEVICE(0x18EF, 0xE030) }, /* ELV ALC 8xxx Battery Charger */
 	{ USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */
 	{ USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */
 	{ USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */

From 54e98b5d663fcd8e3279c2391537b1a1f7bfe344 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 3 Jan 2018 22:02:29 -0800
Subject: [PATCH 628/808] net: dsa: b53: Turn off Broadcom tags for more
 switches

Models such as BCM5395/97/98 and BCM53125/24/53115 and compatible require that
we turn on managed mode to actually act on Broadcom tags, otherwise they just
pass them through on ingress (host -> switch) and don't insert them in egress
(switch -> host). Turning on managed mode is simple, but requires us to
properly support ARL misses on multicast addresses which is a much more
involved set of changes not suitable for a bug fix for this release.

Reported-by: Jochen Friedrich <jochen@scram.de>
Fixes: 7edc58d614d4 ("net: dsa: b53: Turn on Broadcom tags")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index f5a8dd96fd75..4498ab897d94 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1500,10 +1500,13 @@ static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds,
 {
 	struct b53_device *dev = ds->priv;
 
-	/* Older models support a different tag format that we do not
-	 * support in net/dsa/tag_brcm.c yet.
+	/* Older models (5325, 5365) support a different tag format that we do
+	 * not support in net/dsa/tag_brcm.c yet. 539x and 531x5 require managed
+	 * mode to be turned on which means we need to specifically manage ARL
+	 * misses on multicast addresses (TBD).
 	 */
-	if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds, port))
+	if (is5325(dev) || is5365(dev) || is539x(dev) || is531x5(dev) ||
+	    !b53_can_enable_brcm_tags(ds, port))
 		return DSA_TAG_PROTO_NONE;
 
 	/* Broadcom BCM58xx chips have a flow accelerator on Port 8

From b4c2951a4833e66f1bbfe65ddcd4fdcdfafe5e8f Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Sat, 2 Dec 2017 18:48:52 +0100
Subject: [PATCH 629/808] can: vxcan: improve handling of missing peer name
 attribute

Picking up the patch from Serhey Popovych (commit 191cdb3822e5df6b3c8,
"veth: Be more robust on network device creation when no attributes").

When the peer name attribute is not provided the former implementation tries
to register the given device name twice ... which leads to -EEXIST.
If only one device name is given apply an automatic generated and valid name
for the peer.

Cc: Serhey Popovych <serhe.popovych@gmail.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/vxcan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index 8404e8852a0f..b4c4a2c76437 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -194,7 +194,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
 		tbp = peer_tb;
 	}
 
-	if (tbp[IFLA_IFNAME]) {
+	if (ifmp && tbp[IFLA_IFNAME]) {
 		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
 		name_assign_type = NET_NAME_USER;
 	} else {

From d5b42e6607661b198d8b26a0c30969605b1bf5c7 Mon Sep 17 00:00:00 2001
From: Wolfgang Grandegger <wg@grandegger.com>
Date: Wed, 13 Dec 2017 19:52:23 +0100
Subject: [PATCH 630/808] can: gs_usb: fix return value of the "set_bittiming"
 callback

The "set_bittiming" callback treats a positive return value as error!
For that reason "can_changelink()" will quit silently after setting
the bittiming values without processing ctrlmode, restart-ms, etc.

Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/gs_usb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 68ac3e88a8ce..8bf80ad9dc44 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -449,7 +449,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev)
 		dev_err(netdev->dev.parent, "Couldn't set bittimings (err=%d)",
 			rc);
 
-	return rc;
+	return (rc > 0) ? 0 : rc;
 }
 
 static void gs_usb_xmit_callback(struct urb *urb)

From 13454c14550065fcc1705d6bd4ee6d40e057099f Mon Sep 17 00:00:00 2001
From: Luu An Phu <phu.luuan@nxp.com>
Date: Tue, 2 Jan 2018 10:44:18 +0700
Subject: [PATCH 631/808] can: flex_can: Correct the checking for frame length
 in flexcan_start_xmit()

The flexcan_start_xmit() function compares the frame length with data
register length to write frame content into data[0] and data[1]
register. Data register length is 4 bytes and frame maximum length is 8
bytes.

Fix the check that compares frame length with 3. Because the register
length is 4.

Signed-off-by: Luu An Phu <phu.luuan@nxp.com>
Reviewed-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/flexcan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 0626dcfd1f3d..760d2c07e3a2 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -526,7 +526,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		data = be32_to_cpup((__be32 *)&cf->data[0]);
 		flexcan_write(data, &priv->tx_mb->data[0]);
 	}
-	if (cf->can_dlc > 3) {
+	if (cf->can_dlc > 4) {
 		data = be32_to_cpup((__be32 *)&cf->data[4]);
 		flexcan_write(data, &priv->tx_mb->data[1]);
 	}

From 6ebc5e8fe85286c7392f1777a3dba9e1fd6d0253 Mon Sep 17 00:00:00 2001
From: Martin Lederhilger <m.lederhilger@ds-automotion.com>
Date: Thu, 21 Dec 2017 14:42:44 +0000
Subject: [PATCH 632/808] can: ems_usb: improve error reporting for error
 warning and error passive

This patch adds the missing CAN_ERR_CRTL to cf->can_id in case of
CAN_STATE_ERROR_WARNING or CAN_STATE_ERROR_PASSIVE

Signed-off-by: Martin Lederhilger <m.lederhilger@ds-automotion.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/usb/ems_usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index b00358297424..12ff0020ecd6 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -395,6 +395,7 @@ static void ems_usb_rx_err(struct ems_usb *dev, struct ems_cpc_msg *msg)
 
 		if (dev->can.state == CAN_STATE_ERROR_WARNING ||
 		    dev->can.state == CAN_STATE_ERROR_PASSIVE) {
+			cf->can_id |= CAN_ERR_CRTL;
 			cf->data[1] = (txerr > rxerr) ?
 			    CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE;
 		}

From 6708913750344a900f2e73bfe4a4d6dbbce4fe8d Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 4 Jan 2018 16:39:27 +0100
Subject: [PATCH 633/808] ALSA: pcm: Add missing error checks in OSS emulation
 plugin builder

In the OSS emulation plugin builder where the frame size is parsed in
the plugin chain, some places miss the possible errors returned from
the plugin src_ or dst_frames callback.

This patch papers over such places.

Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/oss/pcm_plugin.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/sound/core/oss/pcm_plugin.c b/sound/core/oss/pcm_plugin.c
index cadc93792868..85a56af104bd 100644
--- a/sound/core/oss/pcm_plugin.c
+++ b/sound/core/oss/pcm_plugin.c
@@ -592,18 +592,26 @@ snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, st
 	snd_pcm_sframes_t frames = size;
 
 	plugin = snd_pcm_plug_first(plug);
-	while (plugin && frames > 0) {
+	while (plugin) {
+		if (frames <= 0)
+			return frames;
 		if ((next = plugin->next) != NULL) {
 			snd_pcm_sframes_t frames1 = frames;
-			if (plugin->dst_frames)
+			if (plugin->dst_frames) {
 				frames1 = plugin->dst_frames(plugin, frames);
+				if (frames1 <= 0)
+					return frames1;
+			}
 			if ((err = next->client_channels(next, frames1, &dst_channels)) < 0) {
 				return err;
 			}
 			if (err != frames1) {
 				frames = err;
-				if (plugin->src_frames)
+				if (plugin->src_frames) {
 					frames = plugin->src_frames(plugin, frames1);
+					if (frames <= 0)
+						return frames;
+				}
 			}
 		} else
 			dst_channels = NULL;

From 06e7e776ca4d36547e503279aeff996cbb292c16 Mon Sep 17 00:00:00 2001
From: Ben Seri <ben@armis.com>
Date: Fri, 8 Dec 2017 15:14:47 +0100
Subject: [PATCH 634/808] Bluetooth: Prevent stack info leak from the EFS
 element.

In the function l2cap_parse_conf_rsp and in the function
l2cap_parse_conf_req the following variable is declared without
initialization:

struct l2cap_conf_efs efs;

In addition, when parsing input configuration parameters in both of
these functions, the switch case for handling EFS elements may skip the
memcpy call that will write to the efs variable:

...
case L2CAP_CONF_EFS:
if (olen == sizeof(efs))
memcpy(&efs, (void *)val, olen);
...

The olen in the above if is attacker controlled, and regardless of that
if, in both of these functions the efs variable would eventually be
added to the outgoing configuration request that is being built:

l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), (unsigned long) &efs);

So by sending a configuration request, or response, that contains an
L2CAP_CONF_EFS element, but with an element length that is not
sizeof(efs) - the memcpy to the uninitialized efs variable can be
avoided, and the uninitialized variable would be returned to the
attacker (16 bytes).

This issue has been assigned CVE-2017-1000410

Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Ben Seri <ben@armis.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/bluetooth/l2cap_core.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 43ba91c440bc..fc6615d59165 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -3363,9 +3363,10 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
 			break;
 
 		case L2CAP_CONF_EFS:
-			remote_efs = 1;
-			if (olen == sizeof(efs))
+			if (olen == sizeof(efs)) {
+				remote_efs = 1;
 				memcpy(&efs, (void *) val, olen);
+			}
 			break;
 
 		case L2CAP_CONF_EWS:
@@ -3584,16 +3585,17 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
 			break;
 
 		case L2CAP_CONF_EFS:
-			if (olen == sizeof(efs))
+			if (olen == sizeof(efs)) {
 				memcpy(&efs, (void *)val, olen);
 
-			if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
-			    efs.stype != L2CAP_SERV_NOTRAFIC &&
-			    efs.stype != chan->local_stype)
-				return -ECONNREFUSED;
+				if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+				    efs.stype != L2CAP_SERV_NOTRAFIC &&
+				    efs.stype != chan->local_stype)
+					return -ECONNREFUSED;
 
-			l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
-					   (unsigned long) &efs, endptr - ptr);
+				l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
+						   (unsigned long) &efs, endptr - ptr);
+			}
 			break;
 
 		case L2CAP_CONF_FCS:

From b78d830f0049ef1966dc1e0ebd1ec2a594e2cf25 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 22 Dec 2017 19:23:46 -0700
Subject: [PATCH 635/808] usbip: fix vudc_rx: harden CMD_SUBMIT path to handle
 malicious input

Harden CMD_SUBMIT path to handle malicious input that could trigger
large memory allocations. Add checks to validate transfer_buffer_length
and number_of_packets to protect against bad input requesting for
unbounded memory allocations.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/vudc_rx.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/usb/usbip/vudc_rx.c b/drivers/usb/usbip/vudc_rx.c
index df1e30989148..1e8a23d92cb4 100644
--- a/drivers/usb/usbip/vudc_rx.c
+++ b/drivers/usb/usbip/vudc_rx.c
@@ -120,6 +120,25 @@ static int v_recv_cmd_submit(struct vudc *udc,
 	urb_p->new = 1;
 	urb_p->seqnum = pdu->base.seqnum;
 
+	if (urb_p->ep->type == USB_ENDPOINT_XFER_ISOC) {
+		/* validate packet size and number of packets */
+		unsigned int maxp, packets, bytes;
+
+		maxp = usb_endpoint_maxp(urb_p->ep->desc);
+		maxp *= usb_endpoint_maxp_mult(urb_p->ep->desc);
+		bytes = pdu->u.cmd_submit.transfer_buffer_length;
+		packets = DIV_ROUND_UP(bytes, maxp);
+
+		if (pdu->u.cmd_submit.number_of_packets < 0 ||
+		    pdu->u.cmd_submit.number_of_packets > packets) {
+			dev_err(&udc->gadget.dev,
+				"CMD_SUBMIT: isoc invalid num packets %d\n",
+				pdu->u.cmd_submit.number_of_packets);
+			ret = -EMSGSIZE;
+			goto free_urbp;
+		}
+	}
+
 	ret = alloc_urb_from_cmd(&urb_p->urb, pdu, urb_p->ep->type);
 	if (ret) {
 		usbip_event_add(&udc->ud, VUDC_EVENT_ERROR_MALLOC);

From e1346fd87c71a1f61de1fe476ec8df1425ac931c Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 22 Dec 2017 17:00:06 -0700
Subject: [PATCH 636/808] usbip: remove kernel addresses from usb device and
 urb debug msgs

usbip_dump_usb_device() and usbip_dump_urb() print kernel addresses.
Remove kernel addresses from usb device and urb debug msgs and improve
the message content.

Instead of printing parent device and bus addresses, print parent device
and bus names.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/usbip_common.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
index 7b219d9109b4..ee2bbce24584 100644
--- a/drivers/usb/usbip/usbip_common.c
+++ b/drivers/usb/usbip/usbip_common.c
@@ -91,7 +91,7 @@ static void usbip_dump_usb_device(struct usb_device *udev)
 	dev_dbg(dev, "       devnum(%d) devpath(%s) usb speed(%s)",
 		udev->devnum, udev->devpath, usb_speed_string(udev->speed));
 
-	pr_debug("tt %p, ttport %d\n", udev->tt, udev->ttport);
+	pr_debug("tt hub ttport %d\n", udev->ttport);
 
 	dev_dbg(dev, "                    ");
 	for (i = 0; i < 16; i++)
@@ -124,12 +124,8 @@ static void usbip_dump_usb_device(struct usb_device *udev)
 	}
 	pr_debug("\n");
 
-	dev_dbg(dev, "parent %p, bus %p\n", udev->parent, udev->bus);
-
-	dev_dbg(dev,
-		"descriptor %p, config %p, actconfig %p, rawdescriptors %p\n",
-		&udev->descriptor, udev->config,
-		udev->actconfig, udev->rawdescriptors);
+	dev_dbg(dev, "parent %s, bus %s\n", dev_name(&udev->parent->dev),
+		udev->bus->bus_name);
 
 	dev_dbg(dev, "have_langid %d, string_langid %d\n",
 		udev->have_langid, udev->string_langid);
@@ -237,9 +233,6 @@ void usbip_dump_urb(struct urb *urb)
 
 	dev = &urb->dev->dev;
 
-	dev_dbg(dev, "   urb                   :%p\n", urb);
-	dev_dbg(dev, "   dev                   :%p\n", urb->dev);
-
 	usbip_dump_usb_device(urb->dev);
 
 	dev_dbg(dev, "   pipe                  :%08x ", urb->pipe);
@@ -248,11 +241,9 @@ void usbip_dump_urb(struct urb *urb)
 
 	dev_dbg(dev, "   status                :%d\n", urb->status);
 	dev_dbg(dev, "   transfer_flags        :%08X\n", urb->transfer_flags);
-	dev_dbg(dev, "   transfer_buffer       :%p\n", urb->transfer_buffer);
 	dev_dbg(dev, "   transfer_buffer_length:%d\n",
 						urb->transfer_buffer_length);
 	dev_dbg(dev, "   actual_length         :%d\n", urb->actual_length);
-	dev_dbg(dev, "   setup_packet          :%p\n", urb->setup_packet);
 
 	if (urb->setup_packet && usb_pipetype(urb->pipe) == PIPE_CONTROL)
 		usbip_dump_usb_ctrlrequest(
@@ -262,8 +253,6 @@ void usbip_dump_urb(struct urb *urb)
 	dev_dbg(dev, "   number_of_packets     :%d\n", urb->number_of_packets);
 	dev_dbg(dev, "   interval              :%d\n", urb->interval);
 	dev_dbg(dev, "   error_count           :%d\n", urb->error_count);
-	dev_dbg(dev, "   context               :%p\n", urb->context);
-	dev_dbg(dev, "   complete              :%p\n", urb->complete);
 }
 EXPORT_SYMBOL_GPL(usbip_dump_urb);
 

From 5fd77a3a0e408c23ab4002a57db980e46bc16e72 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 22 Dec 2017 19:23:47 -0700
Subject: [PATCH 637/808] usbip: vudc_tx: fix v_send_ret_submit() vulnerability
 to null xfer buffer

v_send_ret_submit() handles urb with a null transfer_buffer, when it
replays a packet with potential malicious data that could contain a
null buffer.

Add a check for the condition when actual_length > 0 and transfer_buffer
is null.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/usbip/vudc_tx.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/usbip/vudc_tx.c b/drivers/usb/usbip/vudc_tx.c
index 1440ae0919ec..3ccb17c3e840 100644
--- a/drivers/usb/usbip/vudc_tx.c
+++ b/drivers/usb/usbip/vudc_tx.c
@@ -85,6 +85,13 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p)
 	memset(&pdu_header, 0, sizeof(pdu_header));
 	memset(&msg, 0, sizeof(msg));
 
+	if (urb->actual_length > 0 && !urb->transfer_buffer) {
+		dev_err(&udc->gadget.dev,
+			"urb: actual_length %d transfer_buffer null\n",
+			urb->actual_length);
+		return -1;
+	}
+
 	if (urb_p->type == USB_ENDPOINT_XFER_ISOC)
 		iovnum = 2 + urb->number_of_packets;
 	else
@@ -100,8 +107,8 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p)
 
 	/* 1. setup usbip_header */
 	setup_ret_submit_pdu(&pdu_header, urb_p);
-	usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
-			  pdu_header.base.seqnum, urb);
+	usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
+			  pdu_header.base.seqnum);
 	usbip_header_correct_endian(&pdu_header, 1);
 
 	iov[iovnum].iov_base = &pdu_header;

From abb62c46d4949d44979fa647740feff3f7538799 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 29 Dec 2017 21:15:54 +0900
Subject: [PATCH 638/808] arm64: dts: uniphier: fix gpio-ranges property of
 PXs3 SoC

This is probably a copy-paste mistake.  The gpio-ranges of PXs3 is
different from that of LD20.

Fixes: 277b51e7050f ("arm64: dts: uniphier: add GPIO controller nodes")
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
index 48e733136db4..0ac2ace82435 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi
@@ -198,8 +198,8 @@
 			gpio-controller;
 			#gpio-cells = <2>;
 			gpio-ranges = <&pinctrl 0 0 0>,
-				      <&pinctrl 96 0 0>,
-				      <&pinctrl 160 0 0>;
+				      <&pinctrl 104 0 0>,
+				      <&pinctrl 168 0 0>;
 			gpio-ranges-group-names = "gpio_range0",
 						  "gpio_range1",
 						  "gpio_range2";

From 0856655a25476d4431005e39d606e349050066b0 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Mon, 11 Dec 2017 09:52:22 +0100
Subject: [PATCH 639/808] wcn36xx: Fix dynamic power saving

Since driver does not report hardware dynamic power saving cap,
this is up to the mac80211 to manage power saving timeout and
state machine, using the ieee80211 config callback to report
PS changes. This patch enables/disables PS mode according to
the new configuration.

Remove old behaviour enabling PS mode in a static way, this make
the device unusable when power save is enabled since device is
forced to PS regardless RX/TX traffic.

Acked-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 23 ++++++++++++-----------
 drivers/net/wireless/ath/wcn36xx/pmc.c  |  6 ++++--
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index f7d228b5ba93..987f1252a3cf 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -384,6 +384,18 @@ static int wcn36xx_config(struct ieee80211_hw *hw, u32 changed)
 		}
 	}
 
+	if (changed & IEEE80211_CONF_CHANGE_PS) {
+		list_for_each_entry(tmp, &wcn->vif_list, list) {
+			vif = wcn36xx_priv_to_vif(tmp);
+			if (hw->conf.flags & IEEE80211_CONF_PS) {
+				if (vif->bss_conf.ps) /* ps allowed ? */
+					wcn36xx_pmc_enter_bmps_state(wcn, vif);
+			} else {
+				wcn36xx_pmc_exit_bmps_state(wcn, vif);
+			}
+		}
+	}
+
 	mutex_unlock(&wcn->conf_mutex);
 
 	return 0;
@@ -747,17 +759,6 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 		vif_priv->dtim_period = bss_conf->dtim_period;
 	}
 
-	if (changed & BSS_CHANGED_PS) {
-		wcn36xx_dbg(WCN36XX_DBG_MAC,
-			    "mac bss PS set %d\n",
-			    bss_conf->ps);
-		if (bss_conf->ps) {
-			wcn36xx_pmc_enter_bmps_state(wcn, vif);
-		} else {
-			wcn36xx_pmc_exit_bmps_state(wcn, vif);
-		}
-	}
-
 	if (changed & BSS_CHANGED_BSSID) {
 		wcn36xx_dbg(WCN36XX_DBG_MAC, "mac bss changed_bssid %pM\n",
 			    bss_conf->bssid);
diff --git a/drivers/net/wireless/ath/wcn36xx/pmc.c b/drivers/net/wireless/ath/wcn36xx/pmc.c
index 589fe5f70971..1976b80c235f 100644
--- a/drivers/net/wireless/ath/wcn36xx/pmc.c
+++ b/drivers/net/wireless/ath/wcn36xx/pmc.c
@@ -45,8 +45,10 @@ int wcn36xx_pmc_exit_bmps_state(struct wcn36xx *wcn,
 	struct wcn36xx_vif *vif_priv = wcn36xx_vif_to_priv(vif);
 
 	if (WCN36XX_BMPS != vif_priv->pw_state) {
-		wcn36xx_err("Not in BMPS mode, no need to exit from BMPS mode!\n");
-		return -EINVAL;
+		/* Unbalanced call or last BMPS enter failed */
+		wcn36xx_dbg(WCN36XX_DBG_PMC,
+			    "Not in BMPS mode, no need to exit\n");
+		return -EALREADY;
 	}
 	wcn36xx_smd_exit_bmps(wcn, vif);
 	vif_priv->pw_state = WCN36XX_FULL_POWER;

From fb32dd3abf7a8fc13271d0d1c45ffc66df28dd15 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@ovn.org>
Date: Tue, 2 Jan 2018 20:14:42 -0800
Subject: [PATCH 640/808] MAINTAINERS: Update my email address.

Signed-off-by: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6e86e20761e..1e6872b4c6e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10137,7 +10137,7 @@ F:	drivers/irqchip/irq-ompic.c
 F:	drivers/irqchip/irq-or1k-*
 
 OPENVSWITCH
-M:	Pravin Shelar <pshelar@nicira.com>
+M:	Pravin B Shelar <pshelar@ovn.org>
 L:	netdev@vger.kernel.org
 L:	dev@openvswitch.org
 W:	http://openvswitch.org

From f428fe4a04cc339166c8bbd489789760de3a0cee Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Tue, 2 Jan 2018 23:27:33 -0800
Subject: [PATCH 641/808] rtnetlink: give a user socket to get_target_net()

This function is used from two places: rtnl_dump_ifinfo and
rtnl_getlink. In rtnl_getlink(), we give a request skb into
get_target_net(), but in rtnl_dump_ifinfo, we give a response skb
into get_target_net().
The problem here is that NETLINK_CB() isn't initialized for the response
skb. In both cases we can get a user socket and give it instead of skb
into get_target_net().

This bug was found by syzkaller with this call-trace:

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868
RSP: 0018:ffff8801c880f348 EFLAGS: 00010206
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8443f900
RDX: 000000000000007b RSI: ffffffff86510f40 RDI: 00000000000003d8
RBP: ffff8801c880f360 R08: 0000000000000000 R09: 1ffff10039101e4f
R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86510f40
R13: 000000000000000c R14: 0000000000000004 R15: 0000000000000011
FS:  0000000001a1a880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020151000 CR3: 00000001c9511005 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  netlink_ns_capable+0x26/0x30 net/netlink/af_netlink.c:886
  get_target_net+0x9d/0x120 net/core/rtnetlink.c:1765
  rtnl_dump_ifinfo+0x2e5/0xee0 net/core/rtnetlink.c:1806
  netlink_dump+0x48c/0xce0 net/netlink/af_netlink.c:2222
  __netlink_dump_start+0x4f0/0x6d0 net/netlink/af_netlink.c:2319
  netlink_dump_start include/linux/netlink.h:214 [inline]
  rtnetlink_rcv_msg+0x7f0/0xb10 net/core/rtnetlink.c:4485
  netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2441
  rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4540
  netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline]
  netlink_unicast+0x4be/0x6a0 net/netlink/af_netlink.c:1334
  netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897

Cc: Jiri Benc <jbenc@redhat.com>
Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface")
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..778d7f03404a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1681,18 +1681,18 @@ static bool link_dump_filtered(struct net_device *dev,
 	return false;
 }
 
-static struct net *get_target_net(struct sk_buff *skb, int netnsid)
+static struct net *get_target_net(struct sock *sk, int netnsid)
 {
 	struct net *net;
 
-	net = get_net_ns_by_id(sock_net(skb->sk), netnsid);
+	net = get_net_ns_by_id(sock_net(sk), netnsid);
 	if (!net)
 		return ERR_PTR(-EINVAL);
 
 	/* For now, the caller is required to have CAP_NET_ADMIN in
 	 * the user namespace owning the target net ns.
 	 */
-	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+	if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) {
 		put_net(net);
 		return ERR_PTR(-EACCES);
 	}
@@ -1733,7 +1733,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			ifla_policy, NULL) >= 0) {
 		if (tb[IFLA_IF_NETNSID]) {
 			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-			tgt_net = get_target_net(skb, netnsid);
+			tgt_net = get_target_net(skb->sk, netnsid);
 			if (IS_ERR(tgt_net)) {
 				tgt_net = net;
 				netnsid = -1;
@@ -2883,7 +2883,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(skb, netnsid);
+		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}

From 879626e3a52630316d817cbda7cec9a5446d1d82 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Wed, 3 Jan 2018 16:46:29 +0100
Subject: [PATCH 642/808] net: stmmac: enable EEE in MII, GMII or RGMII only

Note in the databook - Section 4.4 - EEE :
" The EEE feature is not supported when the MAC is configured to use the
TBI, RTBI, SMII, RMII or SGMII single PHY interface. Even if the MAC
supports multiple PHY interfaces, you should activate the EEE mode only
when the MAC is operating with GMII, MII, or RGMII interface."

Applying this restriction solves a stability issue observed on Amlogic
gxl platforms operating with RMII interface and the internal PHY.

Fixes: 83bf79b6bb64 ("stmmac: disable at run-time the EEE if not supported")
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Tested-by: Arnaud Patard <arnaud.patard@rtp-net.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 337d53d12e94..c0af0bc4e714 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -364,9 +364,15 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t)
 bool stmmac_eee_init(struct stmmac_priv *priv)
 {
 	struct net_device *ndev = priv->dev;
+	int interface = priv->plat->interface;
 	unsigned long flags;
 	bool ret = false;
 
+	if ((interface != PHY_INTERFACE_MODE_MII) &&
+	    (interface != PHY_INTERFACE_MODE_GMII) &&
+	    !phy_interface_mode_is_rgmii(interface))
+		goto out;
+
 	/* Using PCS we cannot dial with the phy registers at this stage
 	 * so we do not support extra feature like EEE.
 	 */

From dfe8266b8dd10e12a731c985b725fcf7f0e537f0 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 3 Jan 2018 20:09:49 +0300
Subject: [PATCH 643/808] sh_eth: fix TSU resource handling

When switching  the driver to the managed device API,  I managed to break
the  case of a  dual Ether devices sharing a single TSU: the 2nd Ether port
wouldn't probe. Iwamatsu-san has tried to fix this but his patch was buggy
and he then dropped the ball...

The solution is to  limit calling devm_request_mem_region() to the first
of  the two  ports  sharing the same TSU, so devm_ioremap_resource() can't
be used anymore for the TSU resource...

Fixes: d5e07e69218f ("sh_eth: use managed device API")
Reported-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 75323000c364..1bdd67a8a869 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3225,10 +3225,29 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
 	/* ioremap the TSU registers */
 	if (mdp->cd->tsu) {
 		struct resource *rtsu;
+
 		rtsu = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		mdp->tsu_addr = devm_ioremap_resource(&pdev->dev, rtsu);
-		if (IS_ERR(mdp->tsu_addr)) {
-			ret = PTR_ERR(mdp->tsu_addr);
+		if (!rtsu) {
+			dev_err(&pdev->dev, "no TSU resource\n");
+			ret = -ENODEV;
+			goto out_release;
+		}
+		/* We can only request the  TSU region  for the first port
+		 * of the two  sharing this TSU for the probe to succeed...
+		 */
+		if (devno % 2 == 0 &&
+		    !devm_request_mem_region(&pdev->dev, rtsu->start,
+					     resource_size(rtsu),
+					     dev_name(&pdev->dev))) {
+			dev_err(&pdev->dev, "can't request TSU resource.\n");
+			ret = -EBUSY;
+			goto out_release;
+		}
+		mdp->tsu_addr = devm_ioremap(&pdev->dev, rtsu->start,
+					     resource_size(rtsu));
+		if (!mdp->tsu_addr) {
+			dev_err(&pdev->dev, "TSU region ioremap() failed.\n");
+			ret = -ENOMEM;
 			goto out_release;
 		}
 		mdp->port = devno % 2;

From 7d11f77f84b27cef452cee332f4e469503084737 Mon Sep 17 00:00:00 2001
From: Mohamed Ghannam <simo.ghannam@gmail.com>
Date: Wed, 3 Jan 2018 21:06:06 +0000
Subject: [PATCH 644/808] RDS: null pointer dereference in rds_atomic_free_op

set rm->atomic.op_active to 0 when rds_pin_pages() fails
or the user supplied address is invalid,
this prevents a NULL pointer usage in rds_atomic_free_op()

Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/rdma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 94729d9da437..634cfcb7bba6 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -877,6 +877,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 err:
 	if (page)
 		put_page(page);
+	rm->atomic.op_active = 0;
 	kfree(rm->atomic.op_notifier);
 
 	return ret;

From 7bbfe00e025240505db3e04c3b296d7c023b2a26 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 3 Jan 2018 14:11:59 -0800
Subject: [PATCH 645/808] ipv6: fix general protection fault in fib6_add()

In fib6_add(), pn could be NULL if fib6_add_1() failed to return a fib6
node. Checking pn != fn before accessing pn->leaf makes sure pn is not
NULL.
This fixes the following GPF reported by syzkaller:
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 3201 Comm: syzkaller001778 Not tainted 4.15.0-rc5+ #151
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:fib6_add+0x736/0x15a0 net/ipv6/ip6_fib.c:1244
RSP: 0018:ffff8801c7626a70 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffffff84794465
RDX: 0000000000000004 RSI: ffff8801d38935f0 RDI: 0000000000000282
RBP: ffff8801c7626da0 R08: 1ffff10038ec4c35 R09: 0000000000000000
R10: ffff8801c7626c68 R11: 0000000000000000 R12: 00000000fffffffe
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000009
FS:  0000000000000000(0000) GS:ffff8801db200000(0063) knlGS:0000000009b70840
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020be1000 CR3: 00000001d585a006 CR4: 00000000001606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1006
 ip6_route_multipath_add+0xd14/0x16c0 net/ipv6/route.c:3833
 inet6_rtm_newroute+0xdc/0x160 net/ipv6/route.c:3957
 rtnetlink_rcv_msg+0x733/0x1020 net/core/rtnetlink.c:4411
 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408
 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4423
 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864
 sock_sendmsg_nosec net/socket.c:636 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:646
 sock_write_iter+0x31a/0x5d0 net/socket.c:915
 call_write_iter include/linux/fs.h:1772 [inline]
 do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653
 do_iter_write+0x154/0x540 fs/read_write.c:932
 compat_writev+0x225/0x420 fs/read_write.c:1246
 do_compat_writev+0x115/0x220 fs/read_write.c:1267
 C_SYSC_writev fs/read_write.c:1278 [inline]
 compat_SyS_writev+0x26/0x30 fs/read_write.c:1274
 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline]
 do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389
 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:125

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f5285f4e1d08..d11a5578e4f8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1241,23 +1241,28 @@ out:
 		 * If fib6_add_1 has cleared the old leaf pointer in the
 		 * super-tree leaf node we have to find a new one for it.
 		 */
-		struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
-					    lockdep_is_held(&table->tb6_lock));
-		if (pn != fn && pn_leaf == rt) {
-			pn_leaf = NULL;
-			RCU_INIT_POINTER(pn->leaf, NULL);
-			atomic_dec(&rt->rt6i_ref);
-		}
-		if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
-			pn_leaf = fib6_find_prefix(info->nl_net, table, pn);
-#if RT6_DEBUG >= 2
-			if (!pn_leaf) {
-				WARN_ON(!pn_leaf);
-				pn_leaf = info->nl_net->ipv6.ip6_null_entry;
+		if (pn != fn) {
+			struct rt6_info *pn_leaf =
+				rcu_dereference_protected(pn->leaf,
+				    lockdep_is_held(&table->tb6_lock));
+			if (pn_leaf == rt) {
+				pn_leaf = NULL;
+				RCU_INIT_POINTER(pn->leaf, NULL);
+				atomic_dec(&rt->rt6i_ref);
 			}
+			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+				pn_leaf = fib6_find_prefix(info->nl_net, table,
+							   pn);
+#if RT6_DEBUG >= 2
+				if (!pn_leaf) {
+					WARN_ON(!pn_leaf);
+					pn_leaf =
+					    info->nl_net->ipv6.ip6_null_entry;
+				}
 #endif
-			atomic_inc(&pn_leaf->rt6i_ref);
-			rcu_assign_pointer(pn->leaf, pn_leaf);
+				atomic_inc(&pn_leaf->rt6i_ref);
+				rcu_assign_pointer(pn->leaf, pn_leaf);
+			}
 		}
 #endif
 		goto failure;

From 6926e041a8920c8ec27e4e155efa760aa01551fd Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Wed, 3 Jan 2018 23:14:21 +0100
Subject: [PATCH 646/808] uapi/if_ether.h: prevent redefinition of struct
 ethhdr

Musl provides its own ethhdr struct definition. Add a guard to prevent
its definition of the appropriate musl header has already been included.

glibc does not implement this header, but when glibc will implement this
they can just define __UAPI_DEF_ETHHDR 0 to make it work with the
kernel.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h    | 3 +++
 include/uapi/linux/libc-compat.h | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3ee3bf7c8526..144de4d2f385 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -23,6 +23,7 @@
 #define _UAPI_LINUX_IF_ETHER_H
 
 #include <linux/types.h>
+#include <linux/libc-compat.h>
 
 /*
  *	IEEE 802.3 Ethernet magic constants.  The frame sizes omit the preamble
@@ -149,11 +150,13 @@
  *	This is an Ethernet frame header.
  */
 
+#if __UAPI_DEF_ETHHDR
 struct ethhdr {
 	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
 	unsigned char	h_source[ETH_ALEN];	/* source ether addr	*/
 	__be16		h_proto;		/* packet type ID field	*/
 } __attribute__((packed));
+#endif
 
 
 #endif /* _UAPI_LINUX_IF_ETHER_H */
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 8254c937c9f4..fc29efaa918c 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -264,4 +264,10 @@
 
 #endif /* __GLIBC__ */
 
+/* Definitions for if_ether.h */
+/* allow libcs like musl to deactivate this, glibc does not implement this. */
+#ifndef __UAPI_DEF_ETHHDR
+#define __UAPI_DEF_ETHHDR		1
+#endif
+
 #endif /* _UAPI_LIBC_COMPAT_H */

From f5a40711fa58f1c109165a4fec6078bf2dfd2bdc Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 28 Dec 2017 19:06:20 +0300
Subject: [PATCH 647/808] x86/mm: Set MODULES_END to 0xffffffffff000000

Since f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size")
kasan_mem_to_shadow(MODULES_END) could be not aligned to a page boundary.

So passing page unaligned address to kasan_populate_zero_shadow() have two
possible effects:

1) It may leave one page hole in supposed to be populated area. After commit
  21506525fb8d ("x86/kasan/64: Teach KASAN about the cpu_entry_area") that
  hole happens to be in the shadow covering fixmap area and leads to crash:

 BUG: unable to handle kernel paging request at fffffbffffe8ee04
 RIP: 0010:check_memory_region+0x5c/0x190

 Call Trace:
  <NMI>
  memcpy+0x1f/0x50
  ghes_copy_tofrom_phys+0xab/0x180
  ghes_read_estatus+0xfb/0x280
  ghes_notify_nmi+0x2b2/0x410
  nmi_handle+0x115/0x2c0
  default_do_nmi+0x57/0x110
  do_nmi+0xf8/0x150
  end_repeat_nmi+0x1a/0x1e

Note, the crash likely disappeared after commit 92a0f81d8957, which
changed kasan_populate_zero_shadow() call the way it was before
commit 21506525fb8d.

2) Attempt to load module near MODULES_END will fail, because
   __vmalloc_node_range() called from kasan_module_alloc() will hit the
   WARN_ON(!pte_none(*pte)) in the vmap_pte_range() and bail out with error.

To fix this we need to make kasan_mem_to_shadow(MODULES_END) page aligned
which means that MODULES_END should be 8*PAGE_SIZE aligned.

The whole point of commit f06bdd4001c2 was to move MODULES_END down if
NR_CPUS is big, so the cpu_entry_area takes a lot of space.
But since 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
the cpu_entry_area is no longer in fixmap, so we could just set
MODULES_END to a fixed 8*PAGE_SIZE aligned address.

Fixes: f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size")
Reported-by: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Garnier <thgarnie@google.com>
Link: https://lkml.kernel.org/r/20171228160620.23818-1-aryabinin@virtuozzo.com
---
 Documentation/x86/x86_64/mm.txt         | 5 +----
 arch/x86/include/asm/pgtable_64_types.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ad41b3813f0a..ddd5ffd31bd0 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -43,7 +43,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
 [fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
 ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
@@ -67,9 +67,6 @@ memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
 during EFI runtime calls.
 
-The module mapping space size changes based on the CONFIG requirements for the
-following fixmap section.
-
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index b97a539bcdee..6233e5595389 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -104,7 +104,7 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END		__fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_END		_AC(0xffffffffff000000, UL)
 #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
 
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)

From f2078904810373211fb15f91888fba14c01a4acc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 13:01:40 +0100
Subject: [PATCH 648/808] x86/mm: Map cpu_entry_area at the same place on 4/5
 level

There is no reason for 4 and 5 level pagetables to have a different
layout. It just makes determining vaddr_end for KASLR harder than
necessary.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable <stable@vger.kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>,
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt         | 7 ++++---
 arch/x86/include/asm/pgtable_64_types.h | 4 ++--
 arch/x86/mm/dump_pagetables.c           | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ddd5ffd31bd0..f7dabe1f01e9 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,8 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -37,7 +37,8 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
+... unused hole ...
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6233e5595389..61b4b60bdc13 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -88,7 +88,7 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
-# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_PGD_ENTRY		_AC(-3, UL)
 # define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
@@ -110,7 +110,7 @@ typedef struct { pteval_t pte; } pte_t;
 #define ESPFIX_PGD_ENTRY	_AC(-2, UL)
 #define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT)
 
-#define CPU_ENTRY_AREA_PGD	_AC(-3, UL)
+#define CPU_ENTRY_AREA_PGD	_AC(-4, UL)
 #define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
 
 #define EFI_VA_START		( -4 * (_AC(1, UL) << 30))
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index f56902c1f04b..2a4849e92831 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -61,10 +61,10 @@ enum address_markers_idx {
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
+	CPU_ENTRY_AREA_NR,
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
 	LDT_NR,
 #endif
-	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
 #endif

From 1dddd25125112ba49706518ac9077a1026a18f37 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 12:32:03 +0100
Subject: [PATCH 649/808] x86/kaslr: Fix the vaddr_end mess

vaddr_end for KASLR is only documented in the KASLR code itself and is
adjusted depending on config options. So it's not surprising that a change
of the memory layout causes KASLR to have the wrong vaddr_end. This can map
arbitrary stuff into other areas causing hard to understand problems.

Remove the whole ifdef magic and define the start of the cpu_entry_area to
be the end of the KASLR vaddr range.

Add documentation to that effect.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Reported-by: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Benjamin Gilbert <benjamin.gilbert@coreos.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable <stable@vger.kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>,
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt         |  6 +++++
 arch/x86/include/asm/pgtable_64_types.h |  8 ++++++-
 arch/x86/mm/kaslr.c                     | 32 +++++++------------------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index f7dabe1f01e9..ea91cb61a602 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+				    vaddr_end for KASLR
 fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
 fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
@@ -37,6 +38,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+				    vaddr_end for KASLR
 fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
 ... unused hole ...
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
@@ -71,3 +73,7 @@ during EFI runtime calls.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
+
+Be very careful vs. KASLR when changing anything here. The KASLR address
+range must not overlap with anything except the KASAN shadow area, which is
+correct as KASAN disables KASLR.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 61b4b60bdc13..6b8f73dcbc2c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 879ef930e2c2..aedebd2ebf1e 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,25 +34,14 @@
 #define TB_SHIFT 40
 
 /*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
  *
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization available, but that's too hard
+ * to keep straight and caused issues already.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-
-#if defined(CONFIG_X86_ESPFIX64)
-static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
-#elif defined(CONFIG_EFI)
-static const unsigned long vaddr_end = EFI_VA_END;
-#else
-static const unsigned long vaddr_end = __START_KERNEL_map;
-#endif
+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)
 	unsigned long remain_entropy;
 
 	/*
-	 * All these BUILD_BUG_ON checks ensures the memory layout is
-	 * consistent with the vaddr_start/vaddr_end variables.
+	 * These BUILD_BUG_ON checks ensure the memory layout is consistent
+	 * with the vaddr_start/vaddr_end variables. These checks are very
+	 * limited....
 	 */
 	BUILD_BUG_ON(vaddr_start >= vaddr_end);
-	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
-		     vaddr_end >= EFI_VA_END);
-	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
-		      IS_ENABLED(CONFIG_EFI)) &&
-		     vaddr_end >= __START_KERNEL_map);
+	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
 	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
 
 	if (!kaslr_memory_enabled())

From 42f3bdc5dd962a5958bc024c1e1444248a6b8b4a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Jan 2018 18:07:12 +0100
Subject: [PATCH 650/808] x86/events/intel/ds: Use the proper cache flush
 method for mapping ds buffers

Thomas reported the following warning:

 BUG: using smp_processor_id() in preemptible [00000000] code: ovsdb-server/4498
 caller is native_flush_tlb_single+0x57/0xc0
 native_flush_tlb_single+0x57/0xc0
 __set_pte_vaddr+0x2d/0x40
 set_pte_vaddr+0x2f/0x40
 cea_set_pte+0x30/0x40
 ds_update_cea.constprop.4+0x4d/0x70
 reserve_ds_buffers+0x159/0x410
 x86_reserve_hardware+0x150/0x160
 x86_pmu_event_init+0x3e/0x1f0
 perf_try_init_event+0x69/0x80
 perf_event_alloc+0x652/0x740
 SyS_perf_event_open+0x3f6/0xd60
 do_syscall_64+0x5c/0x190

set_pte_vaddr is used to map the ds buffers into the cpu entry area, but
there are two problems with that:

 1) The resulting flush is not supposed to be called in preemptible context

 2) The cpu entry area is supposed to be per CPU, but the debug store
    buffers are mapped for all CPUs so these mappings need to be flushed
    globally.

Add the necessary preemption protection across the mapping code and flush
TLBs globally.

Fixes: c1961a4631da ("x86/events/intel/ds: Map debug buffers in cpu_entry_area")
Reported-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Zeitlhofer <thomas.zeitlhofer+lkml@ze-it.at>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180104170712.GB3040@hirez.programming.kicks-ass.net
---
 arch/x86/events/intel/ds.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8f0aace08b87..8156e47da7ba 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -5,6 +5,7 @@
 
 #include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
+#include <asm/tlbflush.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
@@ -283,20 +284,35 @@ static DEFINE_PER_CPU(void *, insn_buffer);
 
 static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
+	unsigned long start = (unsigned long)cea;
 	phys_addr_t pa;
 	size_t msz = 0;
 
 	pa = virt_to_phys(addr);
+
+	preempt_disable();
 	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
 		cea_set_pte(cea, pa, prot);
+
+	/*
+	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
+	 * all TLB entries for it.
+	 */
+	flush_tlb_kernel_range(start, start + size);
+	preempt_enable();
 }
 
 static void ds_clear_cea(void *cea, size_t size)
 {
+	unsigned long start = (unsigned long)cea;
 	size_t msz = 0;
 
+	preempt_disable();
 	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
 		cea_set_pte(cea, 0, PAGE_NONE);
+
+	flush_tlb_kernel_range(start, start + size);
+	preempt_enable();
 }
 
 static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)

From 1e5476815fd7f98b888e01a0f9522b63085f96c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Jan 2018 22:19:04 +0100
Subject: [PATCH 651/808] x86/tlb: Drop the _GPL from the cpu_tlbstate export

The recent changes for PTI touch cpu_tlbstate from various tlb_flush
inlines. cpu_tlbstate is exported as GPL symbol, so this causes a
regression when building out of tree drivers for certain graphics cards.

Aside of that the export was wrong since it was introduced as it should
have been EXPORT_PER_CPU_SYMBOL_GPL().

Use the correct PER_CPU export and drop the _GPL to restore the previous
state which allows users to utilize the cards they payed for.

As always I'm really thrilled to make this kind of change to support the
#friends (or however the hot hashtag of today is spelled) from that closet
sauce graphics corp.

Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4")
Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")
Reported-by: Kees Cook <keescook@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
---
 arch/x86/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 80259ad8c386..6b462a472a7b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -870,7 +870,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_SYMBOL_GPL(cpu_tlbstate);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {

From e8c24773d6b2cd9bc8b36bd6e60beff599be14be Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Thu, 4 Jan 2018 16:17:45 -0800
Subject: [PATCH 652/808] mm: check pfn_valid first in zero_resv_unavail

With latest kernel I get below bug while testing kdump:

  BUG: unable to handle kernel paging request at ffffea00034b1040
  IP: zero_resv_unavail+0xbd/0x126
  PGD 37b98067 P4D 37b98067 PUD 37b97067 PMD 0
  Oops: 0002 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.15.0-rc1+ #316
  Hardware name: LENOVO 20ARS1BJ02/20ARS1BJ02, BIOS GJET92WW (2.42 ) 03/03/2017
  task: ffffffff81a0e4c0 task.stack: ffffffff81a00000
  RIP: 0010:zero_resv_unavail+0xbd/0x126
  RSP: 0000:ffffffff81a03d88 EFLAGS: 00010006
  RAX: 0000000000000000 RBX: ffffea00034b1040 RCX: 0000000000000010
  RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffffea00034b1040
  RBP: 00000000000d2c41 R08: 00000000000000c0 R09: 0000000000000a0d
  R10: 0000000000000002 R11: 0000000000007f01 R12: ffffffff81a03d90
  R13: ffffea0000000000 R14: 0000000000000063 R15: 0000000000000062
  FS:  0000000000000000(0000) GS:ffffffff81c73000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffffea00034b1040 CR3: 0000000037609000 CR4: 00000000000606b0
  Call Trace:
   ? free_area_init_nodes+0x640/0x664
   ? zone_sizes_init+0x58/0x72
   ? setup_arch+0xb50/0xc6c
   ? start_kernel+0x64/0x43d
   ? secondary_startup_64+0xa5/0xb0
  Code: c1 e8 0c 48 39 d8 76 27 48 89 de 48 c1 e3 06 48 c7 c7 7a 87 79 81 e8 b0 c0 3e ff 4c 01 eb b9 10 00 00 00 31 c0 48 89 df 49 ff c6 <f3> ab eb bc 6a 00 49 c7 c0 f0 93 d1 81 31 d2 83 ce ff 41 54 49
  RIP: zero_resv_unavail+0xbd/0x126 RSP: ffffffff81a03d88
  CR2: ffffea00034b1040
  ---[ end trace f5ba9e8f73c7ee26 ]---

This is introduced by commit a4a3ede2132a ("mm: zero reserved and
unavailable struct pages").

The reason is some efi reserved boot ranges is not reported in E820 ram.
In my case it is a bgrt buffer:

  efi: mem00: [Boot Data          |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x00000000d2c41000-0x00000000d2c85fff] (0MB)

Use "add_efi_memmap" can workaround the problem with another fix:

  http://lkml.kernel.org/r/20171130052327.GA3500@dhcp-128-65.nay.redhat.com

In zero_resv_unavail it would be better to check pfn_valid first before
zero the page struct.  This fixes the problem and potential other
similar problems.  Also as Pavel Tatashin suggested checks pfn_valid at
the beginning of the section.

The range is backed by real memory.  The memory range is efi "Boot
Service Data", that means after ExitBootServices() these ranges can be
used as system ram.  But some of them need to be reserved, for example
the bgrt image address in an acpi table, if the image memory is freed
then kexec reboot will fail because kexec inherit same acpi table to
initialize the driver.

Link: http://lkml.kernel.org/r/20171201095048.GA3084@dhcp-128-65.nay.redhat.com
Fixes: a4a3ede2132a ("mm: zero reserved and unavailable struct pages")
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e5e775e97f4..76c9688b6a0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6260,6 +6260,8 @@ void __paginginit zero_resv_unavail(void)
 	pgcnt = 0;
 	for_each_resv_unavail_range(i, &start, &end) {
 		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+			if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+				continue;
 			mm_zero_struct_page(pfn_to_page(pfn));
 			pgcnt++;
 		}

From 4d9570158b6260f449e317a5f9ed030c2504a615 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Jan 2018 16:17:49 -0800
Subject: [PATCH 653/808] kernel/acct.c: fix the acct->needcheck check in
 check_free_space()

As Tsukada explains, the time_is_before_jiffies(acct->needcheck) check
is very wrong, we need time_is_after_jiffies() to make sys_acct() work.

Ignoring the overflows, the code should "goto out" if needcheck >
jiffies, while currently it checks "needcheck < jiffies" and thus in the
likely case check_free_space() does nothing until jiffies overflow.

In particular this means that sys_acct() is simply broken, acct_on()
sets acct->needcheck = jiffies and expects that check_free_space()
should set acct->active = 1 after the free-space check, but this won't
happen if jiffies increments in between.

This was broken by commit 32dc73086015 ("get rid of timer in
kern/acct.c") in 2011, then another (correct) commit 795a2f22a8ea
("acct() should honour the limits from the very beginning") made the
problem more visible.

Link: http://lkml.kernel.org/r/20171213133940.GA6554@redhat.com
Fixes: 32dc73086015 ("get rid of timer in kern/acct.c")
Reported-by: TSUKADA Koutaro <tsukada@ascade.co.jp>
Suggested-by: TSUKADA Koutaro <tsukada@ascade.co.jp>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
 {
 	struct kstatfs sbuf;
 
-	if (time_is_before_jiffies(acct->needcheck))
+	if (time_is_after_jiffies(acct->needcheck))
 		goto out;
 
 	/* May block */

From 4991c09c7c812dba13ea9be79a68b4565bb1fa4e Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 4 Jan 2018 16:17:52 -0800
Subject: [PATCH 654/808] mm/mprotect: add a cond_resched() inside
 change_pmd_range()

While testing on a large CPU system, detected the following RCU stall
many times over the span of the workload.  This problem is solved by
adding a cond_resched() in the change_pmd_range() function.

  INFO: rcu_sched detected stalls on CPUs/tasks:
   154-....: (670 ticks this GP) idle=022/140000000000000/0 softirq=2825/2825 fqs=612
   (detected by 955, t=6002 jiffies, g=4486, c=4485, q=90864)
  Sending NMI from CPU 955 to CPUs 154:
  NMI backtrace for cpu 154
  CPU: 154 PID: 147071 Comm: workload Not tainted 4.15.0-rc3+ #3
  NIP:  c0000000000b3f64 LR: c0000000000b33d4 CTR: 000000000000aa18
  REGS: 00000000a4b0fb44 TRAP: 0501   Not tainted  (4.15.0-rc3+)
  MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22422082  XER: 00000000
  CFAR: 00000000006cf8f0 SOFTE: 1
  GPR00: 0010000000000000 c00003ef9b1cb8c0 c0000000010cc600 0000000000000000
  GPR04: 8e0000018c32b200 40017b3858fd6e00 8e0000018c32b208 40017b3858fd6e00
  GPR08: 8e0000018c32b210 40017b3858fd6e00 8e0000018c32b218 40017b3858fd6e00
  GPR12: ffffffffffffffff c00000000fb25100
  NIP [c0000000000b3f64] plpar_hcall9+0x44/0x7c
  LR [c0000000000b33d4] pSeries_lpar_flush_hash_range+0x384/0x420
  Call Trace:
    flush_hash_range+0x48/0x100
    __flush_tlb_pending+0x44/0xd0
    hpte_need_flush+0x408/0x470
    change_protection_range+0xaac/0xf10
    change_prot_numa+0x30/0xb0
    task_numa_work+0x2d0/0x3e0
    task_work_run+0x130/0x190
    do_notify_resume+0x118/0x120
    ret_from_except_lite+0x70/0x74
  Instruction dump:
  60000000 f8810028 7ca42b78 7cc53378 7ce63b78 7d074378 7d284b78 7d495378
  e9410060 e9610068 e9810070 44000022 <7d806378> e9810028 f88c0000 f8ac0008

Link: http://lkml.kernel.org/r/20171214140551.5794-1-khandual@linux.vnet.ibm.com
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Suggested-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mprotect.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index ec39f730a0bf..58b629bb70de 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -166,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		next = pmd_addr_end(addr, end);
 		if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
 				&& pmd_none_or_clear_bad(pmd))
-			continue;
+			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
 		if (!mni_start) {
@@ -188,7 +188,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 					}
 
 					/* huge pmd was handled */
-					continue;
+					goto next;
 				}
 			}
 			/* fall through, the trans huge pmd just split */
@@ -196,6 +196,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
 				 dirty_accountable, prot_numa);
 		pages += this_pages;
+next:
+		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
 	if (mni_start)

From dc8635b78cd8669c37e230058d18c33af7451ab1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 4 Jan 2018 16:17:56 -0800
Subject: [PATCH 655/808] kernel/exit.c: export abort() to modules

gcc -fisolate-erroneous-paths-dereference can generate calls to abort()
from modular code too.

[arnd@arndb.de: drop duplicate exports of abort()]
  Link: http://lkml.kernel.org/r/20180102103311.706364-1-arnd@arndb.de
Reported-by: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Jose Abreu <Jose.Abreu@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/traps.c       | 1 -
 arch/m32r/kernel/traps.c      | 1 -
 arch/unicore32/kernel/traps.c | 1 -
 kernel/exit.c                 | 1 +
 4 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 5cf04888c581..3e26c6f7a191 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -793,7 +793,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index cb79fba79d43..b88a8dd14933 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -122,7 +122,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
index 5f25b39f04d4..c4ac6043ebb0 100644
--- a/arch/unicore32/kernel/traps.c
+++ b/arch/unicore32/kernel/traps.c
@@ -298,7 +298,6 @@ void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
-EXPORT_SYMBOL(abort);
 
 void __init trap_init(void)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index df0c91d5606c..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1763,3 +1763,4 @@ __weak void abort(void)
 	/* if that doesn't kill us, halt */
 	panic("Oops failed to kill thread");
 }
+EXPORT_SYMBOL(abort);

From 152a2d199e1385c6ccef17c24555103b30447c91 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Thu, 4 Jan 2018 16:17:59 -0800
Subject: [PATCH 656/808] mm/debug.c: provide useful debugging information for
 VM_BUG

With the recent addition of hashed kernel pointers, places which need to
produce useful debug output have to specify %px, not %p.  This patch
fixes all the VM debug to use %px.  This is appropriate because it's
debug output that the user should never be able to trigger, and kernel
developers need to see the actual pointers.

Link: http://lkml.kernel.org/r/20171219133236.GE13680@bombadil.infradead.org
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Tobin C. Harding" <me@tobin.cc>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mm/debug.c b/mm/debug.c
index d947f3e03b0d..56e2d9125ea5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason)
 	 */
 	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
-	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
+	pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
 		  page, page_ref_count(page), mapcount,
 		  page->mapping, page_to_pgoff(page));
 	if (PageCompound(page))
@@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason)
 
 #ifdef CONFIG_MEMCG
 	if (page->mem_cgroup)
-		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+		pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
 #endif
 }
 
@@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page);
 
 void dump_vma(const struct vm_area_struct *vma)
 {
-	pr_emerg("vma %p start %p end %p\n"
-		"next %p prev %p mm %p\n"
-		"prot %lx anon_vma %p vm_ops %p\n"
-		"pgoff %lx file %p private_data %p\n"
+	pr_emerg("vma %px start %px end %px\n"
+		"next %px prev %px mm %px\n"
+		"prot %lx anon_vma %px vm_ops %px\n"
+		"pgoff %lx file %px private_data %px\n"
 		"flags: %#lx(%pGv)\n",
 		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
 		vma->vm_prev, vma->vm_mm,
@@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma);
 
 void dump_mm(const struct mm_struct *mm)
 {
-	pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
+	pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n"
 #ifdef CONFIG_MMU
-		"get_unmapped_area %p\n"
+		"get_unmapped_area %px\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
+		"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
-		"binfmt %p flags %lx core_state %p\n"
+		"binfmt %px flags %lx core_state %px\n"
 #ifdef CONFIG_AIO
-		"ioctx_table %p\n"
+		"ioctx_table %px\n"
 #endif
 #ifdef CONFIG_MEMCG
-		"owner %p "
+		"owner %px "
 #endif
-		"exe_file %p\n"
+		"exe_file %px\n"
 #ifdef CONFIG_MMU_NOTIFIER
-		"mmu_notifier_mm %p\n"
+		"mmu_notifier_mm %px\n"
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"

From cdc346b36e1dfec201b24eddb7bdbcff6727db04 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Thu, 4 Jan 2018 16:18:02 -0800
Subject: [PATCH 657/808] mm/zsmalloc.c: include fs.h

`struct file_system_type' and alloc_anon_inode() function are defined in
fs.h, include it directly.

Link: http://lkml.kernel.org/r/20171219104219.3017-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 685049a9048d..683c0651098c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,7 @@
 #include <linux/mount.h>
 #include <linux/migrate.h>
 #include <linux/pagemap.h>
+#include <linux/fs.h>
 
 #define ZSPAGE_MAGIC	0x58
 

From d09cfbbfa0f761a97687828b5afb27b56cbf2e19 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 4 Jan 2018 16:18:06 -0800
Subject: [PATCH 658/808] mm/sparse.c: wrong allocation for mem_section

In commit 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime
for CONFIG_SPARSEMEM_EXTREME=y") mem_section is allocated at runtime to
save memory.

It allocates the first dimension of array with sizeof(struct mem_section).

It costs extra memory, should be sizeof(struct mem_section *).

Fix it.

Link: http://lkml.kernel.org/r/1513932498-20350-1-git-send-email-bhe@redhat.com
Fixes: 83e3c48729 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y")
Signed-off-by: Baoquan He <bhe@redhat.com>
Tested-by: Dave Young <dyoung@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/sparse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index 7a5dacaa06e3..2609aba121e8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -211,7 +211,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
 	if (unlikely(!mem_section)) {
 		unsigned long size, align;
 
-		size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
 		align = 1 << (INTERNODE_CACHE_SHIFT);
 		mem_section = memblock_virt_alloc(size, align);
 	}

From 0cbb4b4f4c44f54af268969b18d8deda63aded59 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 4 Jan 2018 16:18:09 -0800
Subject: [PATCH 659/808] userfaultfd: clear the vma->vm_userfaultfd_ctx if
 UFFD_EVENT_FORK fails

The previous fix in commit 384632e67e08 ("userfaultfd: non-cooperative:
fix fork use after free") corrected the refcounting in case of
UFFD_EVENT_FORK failure for the fork userfault paths.

That still didn't clear the vma->vm_userfaultfd_ctx of the vmas that
were set to point to the aborted new uffd ctx earlier in
dup_userfaultfd.

Link: http://lkml.kernel.org/r/20171223002505.593-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ac9a4e65ca49..41a75f9f23fd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -570,11 +570,14 @@ out:
 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 					      struct userfaultfd_wait_queue *ewq)
 {
+	struct userfaultfd_ctx *release_new_ctx;
+
 	if (WARN_ON_ONCE(current->flags & PF_EXITING))
 		goto out;
 
 	ewq->ctx = ctx;
 	init_waitqueue_entry(&ewq->wq, current);
+	release_new_ctx = NULL;
 
 	spin_lock(&ctx->event_wqh.lock);
 	/*
@@ -601,8 +604,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 				new = (struct userfaultfd_ctx *)
 					(unsigned long)
 					ewq->msg.arg.reserved.reserved1;
-
-				userfaultfd_ctx_put(new);
+				release_new_ctx = new;
 			}
 			break;
 		}
@@ -617,6 +619,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock(&ctx->event_wqh.lock);
 
+	if (release_new_ctx) {
+		struct vm_area_struct *vma;
+		struct mm_struct *mm = release_new_ctx->mm;
+
+		/* the various vma->vm_userfaultfd_ctx still points to it */
+		down_write(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx)
+				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		up_write(&mm->mmap_sem);
+
+		userfaultfd_ctx_put(release_new_ctx);
+	}
+
 	/*
 	 * ctx may go away after this if the userfault pseudo fd is
 	 * already released.

From 9a0e7120109632910e77295ce6fc512c16cd367b Mon Sep 17 00:00:00 2001
From: Jeffy Chen <jeffy.chen@rock-chips.com>
Date: Thu, 4 Jan 2018 16:18:12 -0800
Subject: [PATCH 660/808] mailmap: update Mark Yao's email address

Change the previous employers email addresses to the current email
address.

Link: http://lkml.kernel.org/r/20171229121726.31589-1-jeffy.chen@rock-chips.com
Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
Acked-by: Martin Kepplinger <martink@posteo.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 1469ff0d3f4d..e18cab73e209 100644
--- a/.mailmap
+++ b/.mailmap
@@ -107,6 +107,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
 Mark Brown <broonie@sirena.org.uk>
+Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
 Matthieu CASTET <castet.matthieu@free.fr>

From 9a00674213a3f00394f4e3221b88f2d21fc05789 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 29 Dec 2017 14:30:19 -0600
Subject: [PATCH 661/808] crypto: algapi - fix NULL dereference in
 crypto_remove_spawns()

syzkaller triggered a NULL pointer dereference in crypto_remove_spawns()
via a program that repeatedly and concurrently requests AEADs
"authenc(cmac(des3_ede-asm),pcbc-aes-aesni)" and hashes "cmac(des3_ede)"
through AF_ALG, where the hashes are requested as "untested"
(CRYPTO_ALG_TESTED is set in ->salg_mask but clear in ->salg_feat; this
causes the template to be instantiated for every request).

Although AF_ALG users really shouldn't be able to request an "untested"
algorithm, the NULL pointer dereference is actually caused by a
longstanding race condition where crypto_remove_spawns() can encounter
an instance which has had spawn(s) "grabbed" but hasn't yet been
registered, resulting in ->cra_users still being NULL.

We probably should properly initialize ->cra_users earlier, but that
would require updating many templates individually.  For now just fix
the bug in a simple way that can easily be backported: make
crypto_remove_spawns() treat a NULL ->cra_users list as empty.

Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index 60d7366ed343..9a636f961572 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -167,6 +167,18 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
 
 			spawn->alg = NULL;
 			spawns = &inst->alg.cra_users;
+
+			/*
+			 * We may encounter an unregistered instance here, since
+			 * an instance's spawns are set up prior to the instance
+			 * being registered.  An unregistered instance will have
+			 * NULL ->cra_users.next, since ->cra_users isn't
+			 * properly initialized until registration.  But an
+			 * unregistered instance cannot have any users, so treat
+			 * it the same as ->cra_users being empty.
+			 */
+			if (spawns->next == NULL)
+				break;
 		}
 	} while ((spawns = crypto_more_spawns(alg, &stack, &top,
 					      &secondary_spawns)));

From 107b7d9fa94c4692d9104243f0e793e2a4e1366e Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Wed, 3 Jan 2018 07:32:45 -0500
Subject: [PATCH 662/808] mfd: rtsx: Release IRQ during shutdown

'Commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services during
shutdown")' revealed a resource leak in rtsx_pci driver during shutdown.

Issue shows up as a warning during shutdown as follows:

remove_proc_entry: removing non-empty directory 'irq/17', leaking at least
'rtsx_pci'
WARNING: CPU: 0 PID: 1578 at fs/proc/generic.c:572
remove_proc_entry+0x11d/0x130
Modules linked in <long list but none that are out-of-tree>
...
Call Trace:
unregister_irq_proc
free_desc
irq_free_descs
mp_unmap_irq
acpi_unregister_gsi_apic
acpi_pci_irq_disable
do_pci_disable_device
pci_disable_device
device_shutdown
kernel_restart
Sys_reboot

Even though rtsx_pci driver implements a shutdown callback, it is not
releasing the interrupt that it registered during probe. This is causing
the ACPI layer to complain that the shared IRQ is in use while freeing
IRQ.

This code releases the IRQ to prevent resource leak and eliminate the
warning.

Fixes: cc27b735ad3a ("PCI/portdrv: Turn off PCIe services during shutdown")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198141
Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rtsx_pcr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/mfd/rtsx_pcr.c b/drivers/mfd/rtsx_pcr.c
index 590fb9aad77d..c3ed885c155c 100644
--- a/drivers/mfd/rtsx_pcr.c
+++ b/drivers/mfd/rtsx_pcr.c
@@ -1543,6 +1543,9 @@ static void rtsx_pci_shutdown(struct pci_dev *pcidev)
 	rtsx_pci_power_off(pcr, HOST_ENTER_S1);
 
 	pci_disable_device(pcidev);
+	free_irq(pcr->irq, (void *)pcr);
+	if (pcr->msi_en)
+		pci_disable_msi(pcr->pci);
 }
 
 #else /* CONFIG_PM */

From 943309d4aad6732b905f3f500e6e17e33c211494 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Thu, 4 Jan 2018 09:19:13 +0200
Subject: [PATCH 663/808] iwlwifi: pcie: fix DMA memory mapping / unmapping

22000 devices (previously referenced as A000) can support
short transmit queues. This means that we have less DMA
descriptors (TFD) for those shorter queues.
Previous devices must still have 256 TFDs for each queue
even if those 256 TFDs point to fewer buffers.

When I introduced support for the short queues for 22000
I broke older devices by assuming that they can also have
less TFDs in their queues. This led to several problems:

1) the payload of the commands weren't unmapped properly
   which caused the SWIOTLB to complain at some point.
2) the hardware could get confused and we get hardware
   crashes.

The corresponding bugzilla entries are:

https://bugzilla.kernel.org/show_bug.cgi?id=198201
https://bugzilla.kernel.org/show_bug.cgi?id=198265

Cc: stable@vger.kernel.org # 4.14+
Fixes: 4ecab5616023 ("iwlwifi: pcie: support short Tx queues for A000 device family")
Reviewed-by: Sharon, Sara <sara.sharon@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 10 +++++++---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c  | 11 +++--------
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c       |  8 ++++----
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
index d749abeca3ae..403e65c309d0 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
@@ -670,11 +670,15 @@ static inline u8 iwl_pcie_get_cmd_index(struct iwl_txq *q, u32 index)
 	return index & (q->n_window - 1);
 }
 
-static inline void *iwl_pcie_get_tfd(struct iwl_trans_pcie *trans_pcie,
+static inline void *iwl_pcie_get_tfd(struct iwl_trans *trans,
 				     struct iwl_txq *txq, int idx)
 {
-	return txq->tfds + trans_pcie->tfd_size * iwl_pcie_get_cmd_index(txq,
-									 idx);
+	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+
+	if (trans->cfg->use_tfh)
+		idx = iwl_pcie_get_cmd_index(txq, idx);
+
+	return txq->tfds + trans_pcie->tfd_size * idx;
 }
 
 static inline void iwl_enable_rfkill_int(struct iwl_trans *trans)
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index 16b345f54ff0..6d0a907d5ba5 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -171,8 +171,6 @@ static void iwl_pcie_gen2_tfd_unmap(struct iwl_trans *trans,
 
 static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq)
 {
-	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
-
 	/* rd_ptr is bounded by TFD_QUEUE_SIZE_MAX and
 	 * idx is bounded by n_window
 	 */
@@ -181,7 +179,7 @@ static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq)
 	lockdep_assert_held(&txq->lock);
 
 	iwl_pcie_gen2_tfd_unmap(trans, &txq->entries[idx].meta,
-				iwl_pcie_get_tfd(trans_pcie, txq, idx));
+				iwl_pcie_get_tfd(trans, txq, idx));
 
 	/* free SKB */
 	if (txq->entries) {
@@ -364,11 +362,9 @@ struct iwl_tfh_tfd *iwl_pcie_gen2_build_tfd(struct iwl_trans *trans,
 					    struct sk_buff *skb,
 					    struct iwl_cmd_meta *out_meta)
 {
-	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	int idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr);
-	struct iwl_tfh_tfd *tfd =
-		iwl_pcie_get_tfd(trans_pcie, txq, idx);
+	struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, idx);
 	dma_addr_t tb_phys;
 	bool amsdu;
 	int i, len, tb1_len, tb2_len, hdr_len;
@@ -565,8 +561,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans,
 	u8 group_id = iwl_cmd_groupid(cmd->id);
 	const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD];
 	u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD];
-	struct iwl_tfh_tfd *tfd =
-		iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr);
+	struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr);
 
 	memset(tfd, 0, sizeof(*tfd));
 
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index fed6d842a5e1..3f85713c41dc 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -373,7 +373,7 @@ static void iwl_pcie_tfd_unmap(struct iwl_trans *trans,
 {
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
 	int i, num_tbs;
-	void *tfd = iwl_pcie_get_tfd(trans_pcie, txq, index);
+	void *tfd = iwl_pcie_get_tfd(trans, txq, index);
 
 	/* Sanity check on number of chunks */
 	num_tbs = iwl_pcie_tfd_get_num_tbs(trans, tfd);
@@ -2018,7 +2018,7 @@ static int iwl_fill_data_tbs(struct iwl_trans *trans, struct sk_buff *skb,
 	}
 
 	trace_iwlwifi_dev_tx(trans->dev, skb,
-			     iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr),
+			     iwl_pcie_get_tfd(trans, txq, txq->write_ptr),
 			     trans_pcie->tfd_size,
 			     &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len,
 			     hdr_len);
@@ -2092,7 +2092,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb,
 		IEEE80211_CCMP_HDR_LEN : 0;
 
 	trace_iwlwifi_dev_tx(trans->dev, skb,
-			     iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr),
+			     iwl_pcie_get_tfd(trans, txq, txq->write_ptr),
 			     trans_pcie->tfd_size,
 			     &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, 0);
 
@@ -2425,7 +2425,7 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb,
 	memcpy(&txq->first_tb_bufs[txq->write_ptr], &dev_cmd->hdr,
 	       IWL_FIRST_TB_SIZE);
 
-	tfd = iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr);
+	tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr);
 	/* Set up entry for this TFD in Tx byte-count array */
 	iwl_pcie_txq_update_byte_cnt_tbl(trans, txq, le16_to_cpu(tx_cmd->len),
 					 iwl_pcie_tfd_get_num_tbs(trans, tfd));

From b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 4 Jan 2018 14:37:05 +0000
Subject: [PATCH 664/808] x86/alternatives: Add missing '\n' at end of
 ALTERNATIVE inline asm

Where an ALTERNATIVE is used in the middle of an inline asm block, this
would otherwise lead to the following instruction being appended directly
to the trailing ".popsection", and a failed compile.

Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: ak@linux.intel.com
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk
---
 arch/x86/include/asm/alternative.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index dbfd0854651f..cf5961ca8677 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
-	".popsection"
+	".popsection\n"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
 	OLDINSTR_2(oldinstr, 1, 2)					\
@@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
 	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
-	".popsection"
+	".popsection\n"
 
 /*
  * Alternative instructions for different CPU types or capabilities.

From de791821c295cc61419a06fe5562288417d1bc58 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 5 Jan 2018 15:27:34 +0100
Subject: [PATCH 665/808] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN

Use the name associated with the particular attack which needs page table
isolation for mitigation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Jiri Koshina <jikos@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Lutomirski  <luto@amacapital.net>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Greg KH <gregkh@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos
---
 arch/x86/include/asm/cpufeatures.h | 2 +-
 arch/x86/kernel/cpu/common.c       | 2 +-
 arch/x86/mm/pti.c                  | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 07cdd1715705..21ac898df2d8 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -341,6 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
+#define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b1be494ab4e8..2d3bd2215e5b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
 	if (c->x86_vendor != X86_VENDOR_AMD)
-		setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
 	fpu__init_system(c);
 
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2da28ba97508..43d4a4a29037 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -56,13 +56,13 @@
 
 static void __init pti_print_if_insecure(const char *reason)
 {
-	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		pr_info("%s\n", reason);
 }
 
 static void __init pti_print_if_secure(const char *reason)
 {
-	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		pr_info("%s\n", reason);
 }
 
@@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void)
 	}
 
 autosel:
-	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		return;
 enable:
 	setup_force_cpu_cap(X86_FEATURE_PTI);

From fb51f1cd06f9ced7b7085a2a4636375d520431ca Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 3 Jan 2018 15:16:30 +0100
Subject: [PATCH 666/808] ALSA: pcm: Workaround for weird PulseAudio behavior
 on rewind error

The commit 9027c4639ef1 ("ALSA: pcm: Call ack() whenever appl_ptr is
updated") introduced the possible error code returned from the PCM
rewind ioctl.  Basically the change was for handling the indirect PCM
more correctly, but ironically, it caused rather a side-effect:
PulseAudio gets pissed off when receiving an error from rewind, throws
everything away and stops processing further, resulting in the
silence.

It's clearly a failure in the application side, so the best would be
to fix that bug in PA.  OTOH, PA is mostly the only user of the rewind
feature, so it's not good to slap the sole customer.

This patch tries to mitigate the situation: instead of returning an
error, now the rewind ioctl returns zero when the driver can't rewind.
It indicates that no rewind was performed, so the behavior is
consistent, at least.

Fixes: 9027c4639ef1 ("ALSA: pcm: Call ack() whenever appl_ptr is updated")
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/pcm_native.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index a4d92e46c459..f08772568c17 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2580,7 +2580,7 @@ static snd_pcm_sframes_t forward_appl_ptr(struct snd_pcm_substream *substream,
 	return ret < 0 ? ret : frames;
 }
 
-/* decrease the appl_ptr; returns the processed frames or a negative error */
+/* decrease the appl_ptr; returns the processed frames or zero for error */
 static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream,
 					 snd_pcm_uframes_t frames,
 					 snd_pcm_sframes_t avail)
@@ -2597,7 +2597,12 @@ static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream,
 	if (appl_ptr < 0)
 		appl_ptr += runtime->boundary;
 	ret = pcm_lib_apply_appl_ptr(substream, appl_ptr);
-	return ret < 0 ? ret : frames;
+	/* NOTE: we return zero for errors because PulseAudio gets depressed
+	 * upon receiving an error from rewind ioctl and stops processing
+	 * any longer.  Returning zero means that no rewind is done, so
+	 * it's not absolutely wrong to answer like that.
+	 */
+	return ret < 0 ? 0 : frames;
 }
 
 static snd_pcm_sframes_t snd_pcm_playback_rewind(struct snd_pcm_substream *substream,

From 9685347aa0a5c2869058ca6ab79fd8e93084a67f Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 5 Jan 2018 16:09:47 +0100
Subject: [PATCH 667/808] ALSA: aloop: Release cable upon open error path

The aloop runtime object and its assignment in the cable are left even
when opening a substream fails.  This doesn't mean any memory leak,
but it still keeps the invalid pointer that may be referred by the
another side of the cable spontaneously, which is a potential Oops
cause.

Clean up the cable assignment and the empty cable upon the error path
properly.

Fixes: 597603d615d2 ("ALSA: introduce the snd-aloop module for the PCM loopback")
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/drivers/aloop.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c
index afac886ffa28..8b6a39cb7f06 100644
--- a/sound/drivers/aloop.c
+++ b/sound/drivers/aloop.c
@@ -658,12 +658,31 @@ static int rule_channels(struct snd_pcm_hw_params *params,
 	return snd_interval_refine(hw_param_interval(params, rule->var), &t);
 }
 
+static void free_cable(struct snd_pcm_substream *substream)
+{
+	struct loopback *loopback = substream->private_data;
+	int dev = get_cable_index(substream);
+	struct loopback_cable *cable;
+
+	cable = loopback->cables[substream->number][dev];
+	if (!cable)
+		return;
+	if (cable->streams[!substream->stream]) {
+		/* other stream is still alive */
+		cable->streams[substream->stream] = NULL;
+	} else {
+		/* free the cable */
+		loopback->cables[substream->number][dev] = NULL;
+		kfree(cable);
+	}
+}
+
 static int loopback_open(struct snd_pcm_substream *substream)
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	struct loopback *loopback = substream->private_data;
 	struct loopback_pcm *dpcm;
-	struct loopback_cable *cable;
+	struct loopback_cable *cable = NULL;
 	int err = 0;
 	int dev = get_cable_index(substream);
 
@@ -681,7 +700,6 @@ static int loopback_open(struct snd_pcm_substream *substream)
 	if (!cable) {
 		cable = kzalloc(sizeof(*cable), GFP_KERNEL);
 		if (!cable) {
-			kfree(dpcm);
 			err = -ENOMEM;
 			goto unlock;
 		}
@@ -723,6 +741,10 @@ static int loopback_open(struct snd_pcm_substream *substream)
 	else
 		runtime->hw = cable->hw;
  unlock:
+	if (err < 0) {
+		free_cable(substream);
+		kfree(dpcm);
+	}
 	mutex_unlock(&loopback->cable_lock);
 	return err;
 }
@@ -731,20 +753,10 @@ static int loopback_close(struct snd_pcm_substream *substream)
 {
 	struct loopback *loopback = substream->private_data;
 	struct loopback_pcm *dpcm = substream->runtime->private_data;
-	struct loopback_cable *cable;
-	int dev = get_cable_index(substream);
 
 	loopback_timer_stop(dpcm);
 	mutex_lock(&loopback->cable_lock);
-	cable = loopback->cables[substream->number][dev];
-	if (cable->streams[!substream->stream]) {
-		/* other stream is still alive */
-		cable->streams[substream->stream] = NULL;
-	} else {
-		/* free the cable */
-		loopback->cables[substream->number][dev] = NULL;
-		kfree(cable);
-	}
+	free_cable(substream);
 	mutex_unlock(&loopback->cable_lock);
 	return 0;
 }

From b088b53e20c7d09b5ab84c5688e609f478e5c417 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 5 Jan 2018 16:15:33 +0100
Subject: [PATCH 668/808] ALSA: aloop: Fix inconsistent format due to
 incomplete rule

The extra hw constraint rule for the formats the aloop driver
introduced has a slight flaw, where it doesn't return a positive value
when the mask got changed.  It came from the fact that it's basically
a copy&paste from snd_hw_constraint_mask64().  The original code is
supposed to be a single-shot and it modifies the mask bits only once
and never after, while what we need for aloop is the dynamic hw rule
that limits the mask bits.

This difference results in the inconsistent state, as the hw_refine
doesn't apply the dependencies fully.  The worse and surprisingly
result is that it causes a crash in OSS emulation when multiple
full-duplex reads/writes are performed concurrently (I leave why it
triggers Oops to readers as a homework).

For fixing this, replace a few open-codes with the standard
snd_mask_*() macros.

Reported-by: syzbot+3902b5220e8ca27889ca@syzkaller.appspotmail.com
Fixes: b1c73fc8e697 ("ALSA: snd-aloop: Fix hw_params restrictions and checking")
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/drivers/aloop.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c
index 8b6a39cb7f06..006521db487d 100644
--- a/sound/drivers/aloop.c
+++ b/sound/drivers/aloop.c
@@ -39,6 +39,7 @@
 #include <sound/core.h>
 #include <sound/control.h>
 #include <sound/pcm.h>
+#include <sound/pcm_params.h>
 #include <sound/info.h>
 #include <sound/initval.h>
 
@@ -622,14 +623,12 @@ static int rule_format(struct snd_pcm_hw_params *params,
 {
 
 	struct snd_pcm_hardware *hw = rule->private;
-	struct snd_mask *maskp = hw_param_mask(params, rule->var);
+	struct snd_mask m;
 
-	maskp->bits[0] &= (u_int32_t)hw->formats;
-	maskp->bits[1] &= (u_int32_t)(hw->formats >> 32);
-	memset(maskp->bits + 2, 0, (SNDRV_MASK_MAX-64) / 8); /* clear rest */
-	if (! maskp->bits[0] && ! maskp->bits[1])
-		return -EINVAL;
-	return 0;
+	snd_mask_none(&m);
+	m.bits[0] = (u_int32_t)hw->formats;
+	m.bits[1] = (u_int32_t)(hw->formats >> 32);
+	return snd_mask_refine(hw_param_mask(params, rule->var), &m);
 }
 
 static int rule_rate(struct snd_pcm_hw_params *params,

From 898dfe4687f460ba337a01c11549f87269a13fa2 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 4 Jan 2018 17:38:54 +0100
Subject: [PATCH 669/808] ALSA: aloop: Fix racy hw constraints adjustment

The aloop driver tries to update the hw constraints of the connected
target on the cable of the opened PCM substream.  This is done by
adding the extra hw constraints rules referring to the substream
runtime->hw fields, while the other substream may update the runtime
hw of another side on the fly.

This is, however, racy and may result in the inconsistent values when
both PCM streams perform the prepare concurrently.  One of the reason
is that it overwrites the other's runtime->hw field; which is not only
racy but also broken when it's called before the open of another side
finishes.  And, since the reference to runtime->hw isn't protected,
the concurrent write may give the partial value update and become
inconsistent.

This patch is an attempt to fix and clean up:
- The prepare doesn't change the runtime->hw of other side any longer,
  but only update the cable->hw that is referred commonly.
- The extra rules refer to the loopback_pcm object instead of the
  runtime->hw.  The actual hw is deduced from cable->hw.
- The extra rules take the cable_lock to protect against the race.

Fixes: b1c73fc8e697 ("ALSA: snd-aloop: Fix hw_params restrictions and checking")
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/drivers/aloop.c | 51 ++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c
index 006521db487d..0333143a1fa7 100644
--- a/sound/drivers/aloop.c
+++ b/sound/drivers/aloop.c
@@ -306,19 +306,6 @@ static int loopback_trigger(struct snd_pcm_substream *substream, int cmd)
 	return 0;
 }
 
-static void params_change_substream(struct loopback_pcm *dpcm,
-				    struct snd_pcm_runtime *runtime)
-{
-	struct snd_pcm_runtime *dst_runtime;
-
-	if (dpcm == NULL || dpcm->substream == NULL)
-		return;
-	dst_runtime = dpcm->substream->runtime;
-	if (dst_runtime == NULL)
-		return;
-	dst_runtime->hw = dpcm->cable->hw;
-}
-
 static void params_change(struct snd_pcm_substream *substream)
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
@@ -330,10 +317,6 @@ static void params_change(struct snd_pcm_substream *substream)
 	cable->hw.rate_max = runtime->rate;
 	cable->hw.channels_min = runtime->channels;
 	cable->hw.channels_max = runtime->channels;
-	params_change_substream(cable->streams[SNDRV_PCM_STREAM_PLAYBACK],
-				runtime);
-	params_change_substream(cable->streams[SNDRV_PCM_STREAM_CAPTURE],
-				runtime);
 }
 
 static int loopback_prepare(struct snd_pcm_substream *substream)
@@ -621,24 +604,29 @@ static unsigned int get_cable_index(struct snd_pcm_substream *substream)
 static int rule_format(struct snd_pcm_hw_params *params,
 		       struct snd_pcm_hw_rule *rule)
 {
-
-	struct snd_pcm_hardware *hw = rule->private;
+	struct loopback_pcm *dpcm = rule->private;
+	struct loopback_cable *cable = dpcm->cable;
 	struct snd_mask m;
 
 	snd_mask_none(&m);
-	m.bits[0] = (u_int32_t)hw->formats;
-	m.bits[1] = (u_int32_t)(hw->formats >> 32);
+	mutex_lock(&dpcm->loopback->cable_lock);
+	m.bits[0] = (u_int32_t)cable->hw.formats;
+	m.bits[1] = (u_int32_t)(cable->hw.formats >> 32);
+	mutex_unlock(&dpcm->loopback->cable_lock);
 	return snd_mask_refine(hw_param_mask(params, rule->var), &m);
 }
 
 static int rule_rate(struct snd_pcm_hw_params *params,
 		     struct snd_pcm_hw_rule *rule)
 {
-	struct snd_pcm_hardware *hw = rule->private;
+	struct loopback_pcm *dpcm = rule->private;
+	struct loopback_cable *cable = dpcm->cable;
 	struct snd_interval t;
 
-        t.min = hw->rate_min;
-        t.max = hw->rate_max;
+	mutex_lock(&dpcm->loopback->cable_lock);
+	t.min = cable->hw.rate_min;
+	t.max = cable->hw.rate_max;
+	mutex_unlock(&dpcm->loopback->cable_lock);
         t.openmin = t.openmax = 0;
         t.integer = 0;
 	return snd_interval_refine(hw_param_interval(params, rule->var), &t);
@@ -647,11 +635,14 @@ static int rule_rate(struct snd_pcm_hw_params *params,
 static int rule_channels(struct snd_pcm_hw_params *params,
 			 struct snd_pcm_hw_rule *rule)
 {
-	struct snd_pcm_hardware *hw = rule->private;
+	struct loopback_pcm *dpcm = rule->private;
+	struct loopback_cable *cable = dpcm->cable;
 	struct snd_interval t;
 
-        t.min = hw->channels_min;
-        t.max = hw->channels_max;
+	mutex_lock(&dpcm->loopback->cable_lock);
+	t.min = cable->hw.channels_min;
+	t.max = cable->hw.channels_max;
+	mutex_unlock(&dpcm->loopback->cable_lock);
         t.openmin = t.openmax = 0;
         t.integer = 0;
 	return snd_interval_refine(hw_param_interval(params, rule->var), &t);
@@ -716,19 +707,19 @@ static int loopback_open(struct snd_pcm_substream *substream)
 	/* are cached -> they do not reflect the actual state */
 	err = snd_pcm_hw_rule_add(runtime, 0,
 				  SNDRV_PCM_HW_PARAM_FORMAT,
-				  rule_format, &runtime->hw,
+				  rule_format, dpcm,
 				  SNDRV_PCM_HW_PARAM_FORMAT, -1);
 	if (err < 0)
 		goto unlock;
 	err = snd_pcm_hw_rule_add(runtime, 0,
 				  SNDRV_PCM_HW_PARAM_RATE,
-				  rule_rate, &runtime->hw,
+				  rule_rate, dpcm,
 				  SNDRV_PCM_HW_PARAM_RATE, -1);
 	if (err < 0)
 		goto unlock;
 	err = snd_pcm_hw_rule_add(runtime, 0,
 				  SNDRV_PCM_HW_PARAM_CHANNELS,
-				  rule_channels, &runtime->hw,
+				  rule_channels, dpcm,
 				  SNDRV_PCM_HW_PARAM_CHANNELS, -1);
 	if (err < 0)
 		goto unlock;

From 0cb5b30698fdc8f6b4646012e3acb4ddce430788 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Wed, 3 Jan 2018 14:31:38 -0800
Subject: [PATCH 670/808] kvm: vmx: Scrub hardware GPRs at VM-exit

Guest GPR values are live in the hardware GPRs at VM-exit.  Do not
leave any guest values in hardware GPRs after the guest GPR values are
saved to the vcpu_vmx structure.

This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753.
Specifically, it defeats the Project Zero PoC for CVE 2017-5715.

Suggested-by: Eric Northup <digitaleric@google.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Eric Northup <digitaleric@google.com>
Reviewed-by: Benjamin Serebrin <serebrin@google.com>
Reviewed-by: Andrew Honig <ahonig@google.com>
[Paolo: Add AMD bits, Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 19 +++++++++++++++++++
 arch/x86/kvm/vmx.c | 14 +++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index eb714f1cdf7e..bb31c801f1fc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4985,6 +4985,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
+#endif
+		/*
+		* Clear host registers marked as clobbered to prevent
+		* speculative use.
+		*/
+		"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+		"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+		"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+		"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+		"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+		"xor %%r8, %%r8 \n\t"
+		"xor %%r9, %%r9 \n\t"
+		"xor %%r10, %%r10 \n\t"
+		"xor %%r11, %%r11 \n\t"
+		"xor %%r12, %%r12 \n\t"
+		"xor %%r13, %%r13 \n\t"
+		"xor %%r14, %%r14 \n\t"
+		"xor %%r15, %%r15 \n\t"
 #endif
 		"pop %%" _ASM_BP
 		:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eba631c4dbd..c1e7ed371259 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9415,6 +9415,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		/* Save guest registers, load host registers, keep flags */
 		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
 		"pop %0 \n\t"
+		"setbe %c[fail](%0)\n\t"
 		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
 		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
 		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
@@ -9431,12 +9432,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"mov %%r13, %c[r13](%0) \n\t"
 		"mov %%r14, %c[r14](%0) \n\t"
 		"mov %%r15, %c[r15](%0) \n\t"
+		"xor %%r8d,  %%r8d \n\t"
+		"xor %%r9d,  %%r9d \n\t"
+		"xor %%r10d, %%r10d \n\t"
+		"xor %%r11d, %%r11d \n\t"
+		"xor %%r12d, %%r12d \n\t"
+		"xor %%r13d, %%r13d \n\t"
+		"xor %%r14d, %%r14d \n\t"
+		"xor %%r15d, %%r15d \n\t"
 #endif
 		"mov %%cr2, %%" _ASM_AX "   \n\t"
 		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
 
+		"xor %%eax, %%eax \n\t"
+		"xor %%ebx, %%ebx \n\t"
+		"xor %%esi, %%esi \n\t"
+		"xor %%edi, %%edi \n\t"
 		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
-		"setbe %c[fail](%0) \n\t"
 		".pushsection .rodata \n\t"
 		".global vmx_return \n\t"
 		"vmx_return: " _ASM_PTR " 2b \n\t"

From 454be724f6f99cc7e7bbf15067128be9868186c6 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 30 Nov 2017 07:56:35 +0800
Subject: [PATCH 671/808] block: drain queue before waiting for q_usage_counter
 becoming zero

Now we track legacy requests with .q_usage_counter in commit 055f6e18e08f
("block: Make q_usage_counter also track legacy requests"), but that
commit never runs and drains legacy queue before waiting for this counter
becoming zero, then IO hang is caused in the test of pulling disk during IO.

This patch fixes the issue by draining requests before waiting for
q_usage_counter becoming zero, both Mauricio and chenxiang reported this
issue, and observed that it can be fixed by this patch.

Link: https://marc.info/?l=linux-block&m=151192424731797&w=2
Fixes: 055f6e18e08f("block: Make q_usage_counter also track legacy requests")
Cc: Wen Xiong <wenxiong@us.ibm.com>
Tested-by: "chenxiang (M)" <chenxiang66@hisilicon.com>
Tested-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 9 +++++++--
 block/blk-mq.c   | 2 ++
 block/blk.h      | 2 ++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b8881750a3ac..3ba4326a63b5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -562,6 +562,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 	}
 }
 
+void blk_drain_queue(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	__blk_drain_queue(q, true);
+	spin_unlock_irq(q->queue_lock);
+}
+
 /**
  * blk_queue_bypass_start - enter queue bypass mode
  * @q: queue of interest
@@ -689,8 +696,6 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_freeze_queue(q);
 	spin_lock_irq(lock);
-	if (!q->mq_ops)
-		__blk_drain_queue(q, true);
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
 	spin_unlock_irq(lock);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 11097477eeab..3d3797327491 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -161,6 +161,8 @@ void blk_freeze_queue(struct request_queue *q)
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
+	if (!q->mq_ops)
+		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 
diff --git a/block/blk.h b/block/blk.h
index 3f1446937aec..442098aa9463 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -330,4 +330,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 }
 #endif /* CONFIG_BOUNCE */
 
+extern void blk_drain_queue(struct request_queue *q);
+
 #endif /* BLK_INTERNAL_H */

From d1616f07e8f1a4a490d1791316d4a68906b284aa Mon Sep 17 00:00:00 2001
From: Fugang Duan <fugang.duan@nxp.com>
Date: Thu, 4 Jan 2018 10:47:20 +0800
Subject: [PATCH 672/808] net: fec: free/restore resource in related probe
 error pathes

Fixes in probe error path:
- Restore dev_id before failed_ioremap path.
  Fixes: ("net: fec: restore dev_id in the cases of probe error")
- Call of_node_put(phy_node) before failed_phy path.
  Fixes: ("net: fec: Support phys probed from devicetree and fixed-link")

Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 19f198e22e15..a74300a4459c 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,11 +3556,11 @@ failed_clk_ipg:
 failed_clk:
 	if (of_phy_is_fixed_link(np))
 		of_phy_deregister_fixed_link(np);
-failed_phy:
 	of_node_put(phy_node);
+failed_phy:
+	dev_id--;
 failed_ioremap:
 	free_netdev(ndev);
-	dev_id--;
 
 	return ret;
 }

From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Dec 2017 20:20:38 -0500
Subject: [PATCH 673/808] fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED
 mode of 'xt_bpf_info_v1'"

Descriptor table is a shared object; it's not a place where you can
stick temporary references to files, especially when we don't need
an opened file at all.

Cc: stable@vger.kernel.org # v4.14
Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'")
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/bpf.h    | 10 ++++++++++
 kernel/bpf/inode.c     | 40 +++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c   |  2 +-
 net/netfilter/xt_bpf.c | 14 ++------------
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..b63a592ad29d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -419,6 +419,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -506,6 +508,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 {
 	return 0;
 }
+
+static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
+				enum bpf_prog_type type)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
@@ -514,6 +522,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 	return bpf_prog_get_type_dev(ufd, type, false);
 }
 
+bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);
+
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
 	putname(pname);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+	struct bpf_prog *prog;
+	int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (inode->i_op == &bpf_map_iops)
+		return ERR_PTR(-EINVAL);
+	if (inode->i_op != &bpf_prog_iops)
+		return ERR_PTR(-EACCES);
+
+	prog = inode->i_private;
+
+	ret = security_bpf_prog(prog);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (!bpf_prog_get_ok(prog, &type, false))
+		return ERR_PTR(-EINVAL);
+
+	return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+	struct bpf_prog *prog;
+	struct path path;
+	int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ERR_PTR(ret);
+	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+	if (!IS_ERR(prog))
+		touch_atime(&path);
+	path_put(&path);
+	return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static bool bpf_prog_get_ok(struct bpf_prog *prog,
+bool bpf_prog_get_ok(struct bpf_prog *prog,
 			    enum bpf_prog_type *attach_type, bool attach_drv)
 {
 	/* not an attachment, just a refcount inc, always allow */
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 041da0d9c06f..fa2ca0a13619 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -52,18 +52,8 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 
 static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 {
-	mm_segment_t oldfs = get_fs();
-	int retval, fd;
-
-	set_fs(KERNEL_DS);
-	fd = bpf_obj_get_user(path, 0);
-	set_fs(oldfs);
-	if (fd < 0)
-		return fd;
-
-	retval = __bpf_mt_check_fd(fd, ret);
-	sys_close(fd);
-	return retval;
+	*ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
+	return PTR_ERR_OR_ZERO(*ret);
 }
 
 static int bpf_mt_check(const struct xt_mtchk_param *par)

From 9059a3493efea6492451430c7e2fa0af799a2abb Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Thu, 16 Nov 2017 20:06:39 -0500
Subject: [PATCH 674/808] kconfig: fix relational operators for bool and
 tristate symbols

Since commit 31847b67bec0 ("kconfig: allow use of relations other than
(in)equality") it is possible to use relational operators in Kconfig
statements. However, those operators give unexpected results when
applied to bool/tristate values:

	(n < y) = y (correct)
	(m < y) = y (correct)
	(n < m) = n (wrong)

This happens because relational operators process bool and tristate
symbols as strings and m sorts before n. It makes little sense to do a
lexicographical compare on bool and tristate values though.

Documentation/kbuild/kconfig-language.txt states that expression can have
a value of 'n', 'm' or 'y' (or 0, 1, 2 respectively for calculations).
Let's make it so for relational comparisons with bool/tristate
expressions as well and document them. If at least one symbol is an
actual string then the lexicographical compare works just as before.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Documentation/kbuild/kconfig-language.txt | 23 +++++++++++++++--------
 scripts/kconfig/expr.c                    |  5 ++++-
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index 262722d8867b..c4a293a03c33 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -200,10 +200,14 @@ module state. Dependency expressions have the following syntax:
 <expr> ::= <symbol>                             (1)
            <symbol> '=' <symbol>                (2)
            <symbol> '!=' <symbol>               (3)
-           '(' <expr> ')'                       (4)
-           '!' <expr>                           (5)
-           <expr> '&&' <expr>                   (6)
-           <expr> '||' <expr>                   (7)
+           <symbol1> '<' <symbol2>              (4)
+           <symbol1> '>' <symbol2>              (4)
+           <symbol1> '<=' <symbol2>             (4)
+           <symbol1> '>=' <symbol2>             (4)
+           '(' <expr> ')'                       (5)
+           '!' <expr>                           (6)
+           <expr> '&&' <expr>                   (7)
+           <expr> '||' <expr>                   (8)
 
 Expressions are listed in decreasing order of precedence. 
 
@@ -214,10 +218,13 @@ Expressions are listed in decreasing order of precedence.
     otherwise 'n'.
 (3) If the values of both symbols are equal, it returns 'n',
     otherwise 'y'.
-(4) Returns the value of the expression. Used to override precedence.
-(5) Returns the result of (2-/expr/).
-(6) Returns the result of min(/expr/, /expr/).
-(7) Returns the result of max(/expr/, /expr/).
+(4) If value of <symbol1> is respectively lower, greater, lower-or-equal,
+    or greater-or-equal than value of <symbol2>, it returns 'y',
+    otherwise 'n'.
+(5) Returns the value of the expression. Used to override precedence.
+(6) Returns the result of (2-/expr/).
+(7) Returns the result of min(/expr/, /expr/).
+(8) Returns the result of max(/expr/, /expr/).
 
 An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
 respectively for calculations). A menu entry becomes visible when its
diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c
index cbf4996dd9c1..8cee597d33a5 100644
--- a/scripts/kconfig/expr.c
+++ b/scripts/kconfig/expr.c
@@ -893,7 +893,10 @@ static enum string_value_kind expr_parse_string(const char *str,
 	switch (type) {
 	case S_BOOLEAN:
 	case S_TRISTATE:
-		return k_string;
+		val->s = !strcmp(str, "n") ? 0 :
+			 !strcmp(str, "m") ? 1 :
+			 !strcmp(str, "y") ? 2 : -1;
+		return k_signed;
 	case S_INT:
 		val->s = strtoll(str, &tail, 10);
 		kind = k_signed;

From 5133550296d43236439494aa955bfb765a89f615 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Thu, 4 Jan 2018 21:06:49 +0300
Subject: [PATCH 675/808] sh_eth: fix SH7757 GEther initialization

Renesas  SH7757 has 2 Fast and 2 Gigabit Ether controllers, while the
'sh_eth' driver can only reset and initialize TSU of the first controller
pair. Shimoda-san tried to solve that adding the 'needs_init' member to the
'struct sh_eth_plat_data', however the platform code still never sets this
flag. I think  that we can infer this information from the 'devno' variable
(set  to 'platform_device::id') and reset/init the Ether controller pair
only for an even 'devno'; therefore 'sh_eth_plat_data::needs_init' can be
removed...

Fixes: 150647fb2c31 ("net: sh_eth: change the condition of initialization")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++--
 include/linux/sh_eth.h                | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 1bdd67a8a869..f21c1db91c3f 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3254,8 +3254,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
 		ndev->features = NETIF_F_HW_VLAN_CTAG_FILTER;
 	}
 
-	/* initialize first or needed device */
-	if (!devno || pd->needs_init) {
+	/* Need to init only the first port of the two sharing a TSU */
+	if (devno % 2 == 0) {
 		if (mdp->cd->chip_reset)
 			mdp->cd->chip_reset(ndev);
 
diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h
index ff3642d267f7..94081e9a5010 100644
--- a/include/linux/sh_eth.h
+++ b/include/linux/sh_eth.h
@@ -17,7 +17,6 @@ struct sh_eth_plat_data {
 	unsigned char mac_addr[ETH_ALEN];
 	unsigned no_ether_link:1;
 	unsigned ether_link_active_low:1;
-	unsigned needs_init:1;
 };
 
 #endif

From 5b9f57cf47b87f07210875d6a24776b4496b818d Mon Sep 17 00:00:00 2001
From: John Johansen <john.johansen@canonical.com>
Date: Thu, 7 Dec 2017 00:28:27 -0800
Subject: [PATCH 676/808] apparmor: fix regression in mount mediation when
 feature set is pinned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the mount code was refactored for Labels it was not correctly
updated to check whether policy supported mediation of the mount
class.  This causes a regression when the kernel feature set is
reported as supporting mount and policy is pinned to a feature set
that does not support mount mediation.

BugLink: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=882697#41
Fixes: 2ea3ffb7782a ("apparmor: add mount mediation")
Reported-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Cc: Stable <stable@vger.kernel.org>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/mount.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c
index ed9b4d0f9f7e..8c558cbce930 100644
--- a/security/apparmor/mount.c
+++ b/security/apparmor/mount.c
@@ -329,6 +329,9 @@ static int match_mnt_path_str(struct aa_profile *profile,
 	AA_BUG(!mntpath);
 	AA_BUG(!buffer);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer,
 			     &mntpnt, &info, profile->disconnected);
 	if (error)
@@ -380,6 +383,9 @@ static int match_mnt(struct aa_profile *profile, const struct path *path,
 	AA_BUG(!profile);
 	AA_BUG(devpath && !devbuffer);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	if (devpath) {
 		error = aa_path_name(devpath, path_flags(profile, devpath),
 				     devbuffer, &devname, &info,
@@ -558,6 +564,9 @@ static int profile_umount(struct aa_profile *profile, struct path *path,
 	AA_BUG(!profile);
 	AA_BUG(!path);
 
+	if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
+		return 0;
+
 	error = aa_path_name(path, path_flags(profile, path), buffer, &name,
 			     &info, profile->disconnected);
 	if (error)
@@ -613,7 +622,8 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile,
 	AA_BUG(!new_path);
 	AA_BUG(!old_path);
 
-	if (profile_unconfined(profile))
+	if (profile_unconfined(profile) ||
+	    !PROFILE_MEDIATES(profile, AA_CLASS_MOUNT))
 		return aa_get_newest_label(&profile->label);
 
 	error = aa_path_name(old_path, path_flags(profile, old_path),

From 7729bebc619307a0233c86f8585a4bf3eadc7ce4 Mon Sep 17 00:00:00 2001
From: Valentin Ilie <valentin.ilie@gmail.com>
Date: Fri, 5 Jan 2018 23:12:59 +0000
Subject: [PATCH 677/808] ia64, sched/cputime: Fix build error if
 CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y

Remove the extra parenthesis.

This bug was introduced by:

  e2339a4caa5e: ("ia64: Convert vtime to use nsec units directly")

Signed-off-by: Valentin Ilie <valentin.ilie@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fenghua.yu@intel.com
Cc: linux-ia64@vger.kernel.org
Cc: tony.luck@intel.com
Link: http://lkml.kernel.org/r/1515193979-24873-1-git-send-email-valentin.ilie@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index c6ecb97151a2..9025699049ca 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk)
 	}
 
 	if (ti->softirq_time) {
-		delta = cycle_to_nsec(ti->softirq_time));
+		delta = cycle_to_nsec(ti->softirq_time);
 		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
 	}
 

From 310d82784fb4d60c80569f5ca9f53a7f3bf1d477 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 5 Jan 2018 21:55:38 +0100
Subject: [PATCH 678/808] parisc: qemu idle sleep support

Add qemu idle sleep support when running under qemu with SeaBIOS PDC
firmware.

Like the power architecture we use the "or" assembler instructions,
which translate to nops on real hardware, to indicate that qemu shall
idle sleep.

Signed-off-by: Helge Deller <deller@gmx.de>
Cc: Richard Henderson <rth@twiddle.net>
CC: stable@vger.kernel.org # v4.9+
---
 arch/parisc/kernel/process.c | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 30f92391a93e..cad3e8661cd6 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -39,6 +39,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/personality.h>
 #include <linux/ptrace.h>
@@ -183,6 +184,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r)
 	return 1;
 }
 
+/*
+ * Idle thread support
+ *
+ * Detect when running on QEMU with SeaBIOS PDC Firmware and let
+ * QEMU idle the host too.
+ */
+
+int running_on_qemu __read_mostly;
+
+void __cpuidle arch_cpu_idle_dead(void)
+{
+	/* nop on real hardware, qemu will offline CPU. */
+	asm volatile("or %%r31,%%r31,%%r31\n":::);
+}
+
+void __cpuidle arch_cpu_idle(void)
+{
+	local_irq_enable();
+
+	/* nop on real hardware, qemu will idle sleep. */
+	asm volatile("or %%r10,%%r10,%%r10\n":::);
+}
+
+static int __init parisc_idle_init(void)
+{
+	const char *marker;
+
+	/* check QEMU/SeaBIOS marker in PAGE0 */
+	marker = (char *) &PAGE0->pad0;
+	running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0);
+
+	if (!running_on_qemu)
+		cpu_idle_poll_ctrl(1);
+
+	return 0;
+}
+arch_initcall(parisc_idle_init);
+
 /*
  * Copy architecture-specific thread state
  */

From b94b7373317164402ff7728d10f7023127a02b60 Mon Sep 17 00:00:00 2001
From: Jia Zhang <qianyue.zj@alibaba-inc.com>
Date: Mon, 1 Jan 2018 10:04:47 +0800
Subject: [PATCH 679/808] x86/microcode/intel: Extend BDW late-loading with a
 revision check

Instead of blacklisting all model 79 CPUs when attempting a late
microcode loading, limit that only to CPUs with microcode revisions <
0x0b000021 because only on those late loading may cause a system hang.

For such processors either:

a) a BIOS update which might contain a newer microcode revision

or

b) the early microcode loading method

should be considered.

Processors with revisions 0x0b000021 or higher will not experience such
hangs.

For more details, see erratum BDF90 in document #334165 (Intel Xeon
Processor E7-8800/4800 v4 Product Family Specification Update) from
September 2017.

[ bp: Heavily massage commit message and pr_* statements. ]

Fixes: 723f2828a98c ("x86/microcode/intel: Disable late loading on model 79")
Signed-off-by: Jia Zhang <qianyue.zj@alibaba-inc.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Cc: <stable@vger.kernel.org> # v4.14
Link: http://lkml.kernel.org/r/1514772287-92959-1-git-send-email-qianyue.zj@alibaba-inc.com
---
 arch/x86/kernel/cpu/microcode/intel.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 8ccdca6d3f9e..d9e460fc7a3b 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -910,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
-		pr_err_once("late loading on model 79 is disabled.\n");
+	/*
+	 * Late loading on model 79 with microcode revision less than 0x0b000021
+	 * may result in a system hang. This behavior is documented in item
+	 * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
+	 */
+	if (c->x86 == 6 &&
+	    c->x86_model == INTEL_FAM6_BROADWELL_X &&
+	    c->x86_mask == 0x01 &&
+	    c->microcode < 0x0b000021) {
+		pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+		pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
 		return true;
 	}
 

From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 5 Jan 2018 16:26:00 -0800
Subject: [PATCH 680/808] loop: fix concurrent lo_open/lo_release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

范龙飞 reports that KASAN can report a use-after-free in __lock_acquire.
The reason is due to insufficient serialization in lo_release(), which
will continue to use the loop device even after it has decremented the
lo_refcnt to zero.

In the meantime, another process can come in, open the loop device
again as it is being shut down. Confusion ensues.

Reported-by: 范龙飞 <long7573@126.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bc8e61506968..d5fe720cf149 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1581,9 +1581,8 @@ out:
 	return err;
 }
 
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void __lo_release(struct loop_device *lo)
 {
-	struct loop_device *lo = disk->private_data;
 	int err;
 
 	if (atomic_dec_return(&lo->lo_refcnt))
@@ -1610,6 +1609,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	mutex_unlock(&lo->lo_ctl_mutex);
 }
 
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+	mutex_lock(&loop_index_mutex);
+	__lo_release(disk->private_data);
+	mutex_unlock(&loop_index_mutex);
+}
+
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,

From de53c3786a3ce162a1c815d0c04c766c23ec9c0a Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 5 Jan 2018 22:35:41 +0100
Subject: [PATCH 681/808] x86/pti: Unbreak EFI old_memmap

EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with swapper PGD that
has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't
execute it's code.

Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done
by the pgprot API).

_PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as
_set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on
it.

Tested-by: Dimitri Sivanich <sivanich@hpe.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm
---
 arch/x86/platform/efi/efi_64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39c4b35ac7a4..61975b6bcb1a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -134,7 +134,9 @@ pgd_t * __init efi_call_phys_prolog(void)
 				pud[j] = *pud_offset(p4d_k, vaddr);
 			}
 		}
+		pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
 	}
+
 out:
 	__flush_tlb_all();
 

From 01c9b17bf673b05bb401b76ec763e9730ccf1376 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 5 Jan 2018 09:44:36 -0800
Subject: [PATCH 682/808] x86/Documentation: Add PTI description

Add some details about how PTI works, what some of the downsides
are, and how to debug it when things go wrong.

Also document the kernel parameter: 'pti/nopti'.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
Cc: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Cc: Richard Fellner <richard.fellner@student.tugraz.at>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andi Lutomirsky <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com
---
 .../admin-guide/kernel-parameters.txt         |  21 +-
 Documentation/x86/pti.txt                     | 186 ++++++++++++++++++
 2 files changed, 200 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/x86/pti.txt

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 520fdec15bbb..905991745d26 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2685,8 +2685,6 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
-	nopti		[X86-64] Disable kernel page table isolation
-
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3255,11 +3253,20 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
-	pti=		[X86_64]
-			Control user/kernel address space isolation:
-			on - enable
-			off - disable
-			auto - default setting
+	pti=		[X86_64] Control Page Table Isolation of user and
+			kernel address spaces.  Disabling this feature
+			removes hardening, but improves performance of
+			system calls and interrupts.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable to issues that PTI mitigates
+
+			Not specifying this option is equivalent to pti=auto.
+
+	nopti		[X86_64]
+			Equivalent to pti=off
 
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt
new file mode 100644
index 000000000000..d11eff61fc9a
--- /dev/null
+++ b/Documentation/x86/pti.txt
@@ -0,0 +1,186 @@
+Overview
+========
+
+Page Table Isolation (pti, previously known as KAISER[1]) is a
+countermeasure against attacks on the shared user/kernel address
+space such as the "Meltdown" approach[2].
+
+To mitigate this class of attacks, we create an independent set of
+page tables for use only when running userspace applications.  When
+the kernel is entered via syscalls, interrupts or exceptions, the
+page tables are switched to the full "kernel" copy.  When the system
+switches back to user mode, the user copy is used again.
+
+The userspace page tables contain only a minimal amount of kernel
+data: only what is needed to enter/exit the kernel such as the
+entry/exit functions themselves and the interrupt descriptor table
+(IDT).  There are a few strictly unnecessary things that get mapped
+such as the first C function when entering an interrupt (see
+comments in pti.c).
+
+This approach helps to ensure that side-channel attacks leveraging
+the paging structures do not function when PTI is enabled.  It can be
+enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
+Once enabled at compile-time, it can be disabled at boot with the
+'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
+
+Page Table Management
+=====================
+
+When PTI is enabled, the kernel manages two sets of page tables.
+The first set is very similar to the single set which is present in
+kernels without PTI.  This includes a complete mapping of userspace
+that the kernel can use for things like copy_to_user().
+
+Although _complete_, the user portion of the kernel page tables is
+crippled by setting the NX bit in the top level.  This ensures
+that any missed kernel->user CR3 switch will immediately crash
+userspace upon executing its first instruction.
+
+The userspace page tables map only the kernel data needed to enter
+and exit the kernel.  This data is entirely contained in the 'struct
+cpu_entry_area' structure which is placed in the fixmap which gives
+each CPU's copy of the area a compile-time-fixed virtual address.
+
+For new userspace mappings, the kernel makes the entries in its
+page tables like normal.  The only difference is when the kernel
+makes entries in the top (PGD) level.  In addition to setting the
+entry in the main kernel PGD, a copy of the entry is made in the
+userspace page tables' PGD.
+
+This sharing at the PGD level also inherently shares all the lower
+layers of the page tables.  This leaves a single, shared set of
+userspace page tables to manage.  One PTE to lock, one set of
+accessed bits, dirty bits, etc...
+
+Overhead
+========
+
+Protection against side-channel attacks is important.  But,
+this protection comes at a cost:
+
+1. Increased Memory Use
+  a. Each process now needs an order-1 PGD instead of order-0.
+     (Consumes an additional 4k per process).
+  b. The 'cpu_entry_area' structure must be 2MB in size and 2MB
+     aligned so that it can be mapped by setting a single PMD
+     entry.  This consumes nearly 2MB of RAM once the kernel
+     is decompressed, but no space in the kernel image itself.
+
+2. Runtime Cost
+  a. CR3 manipulation to switch between the page table copies
+     must be done at interrupt, syscall, and exception entry
+     and exit (it can be skipped when the kernel is interrupted,
+     though.)  Moves to CR3 are on the order of a hundred
+     cycles, and are required at every entry and exit.
+  b. A "trampoline" must be used for SYSCALL entry.  This
+     trampoline depends on a smaller set of resources than the
+     non-PTI SYSCALL entry code, so requires mapping fewer
+     things into the userspace page tables.  The downside is
+     that stacks must be switched at entry time.
+  d. Global pages are disabled for all kernel structures not
+     mapped into both kernel and userspace page tables.  This
+     feature of the MMU allows different processes to share TLB
+     entries mapping the kernel.  Losing the feature means more
+     TLB misses after a context switch.  The actual loss of
+     performance is very small, however, never exceeding 1%.
+  d. Process Context IDentifiers (PCID) is a CPU feature that
+     allows us to skip flushing the entire TLB when switching page
+     tables by setting a special bit in CR3 when the page tables
+     are changed.  This makes switching the page tables (at context
+     switch, or kernel entry/exit) cheaper.  But, on systems with
+     PCID support, the context switch code must flush both the user
+     and kernel entries out of the TLB.  The user PCID TLB flush is
+     deferred until the exit to userspace, minimizing the cost.
+     See intel.com/sdm for the gory PCID/INVPCID details.
+  e. The userspace page tables must be populated for each new
+     process.  Even without PTI, the shared kernel mappings
+     are created by copying top-level (PGD) entries into each
+     new process.  But, with PTI, there are now *two* kernel
+     mappings: one in the kernel page tables that maps everything
+     and one for the entry/exit structures.  At fork(), we need to
+     copy both.
+  f. In addition to the fork()-time copying, there must also
+     be an update to the userspace PGD any time a set_pgd() is done
+     on a PGD used to map userspace.  This ensures that the kernel
+     and userspace copies always map the same userspace
+     memory.
+  g. On systems without PCID support, each CR3 write flushes
+     the entire TLB.  That means that each syscall, interrupt
+     or exception flushes the TLB.
+  h. INVPCID is a TLB-flushing instruction which allows flushing
+     of TLB entries for non-current PCIDs.  Some systems support
+     PCIDs, but do not support INVPCID.  On these systems, addresses
+     can only be flushed from the TLB for the current PCID.  When
+     flushing a kernel address, we need to flush all PCIDs, so a
+     single kernel address flush will require a TLB-flushing CR3
+     write upon the next use of every PCID.
+
+Possible Future Work
+====================
+1. We can be more careful about not actually writing to CR3
+   unless its value is actually changed.
+2. Allow PTI to be enabled/disabled at runtime in addition to the
+   boot-time switching.
+
+Testing
+========
+
+To test stability of PTI, the following test procedure is recommended,
+ideally doing all of these in parallel:
+
+1. Set CONFIG_DEBUG_ENTRY=y
+2. Run several copies of all of the tools/testing/selftests/x86/ tests
+   (excluding MPX and protection_keys) in a loop on multiple CPUs for
+   several minutes.  These tests frequently uncover corner cases in the
+   kernel entry code.  In general, old kernels might cause these tests
+   themselves to crash, but they should never crash the kernel.
+3. Run the 'perf' tool in a mode (top or record) that generates many
+   frequent performance monitoring non-maskable interrupts (see "NMI"
+   in /proc/interrupts).  This exercises the NMI entry/exit code which
+   is known to trigger bugs in code paths that did not expect to be
+   interrupted, including nested NMIs.  Using "-c" boosts the rate of
+   NMIs, and using two -c with separate counters encourages nested NMIs
+   and less deterministic behavior.
+
+	while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done
+
+4. Launch a KVM virtual machine.
+5. Run 32-bit binaries on systems supporting the SYSCALL instruction.
+   This has been a lightly-tested code path and needs extra scrutiny.
+
+Debugging
+=========
+
+Bugs in PTI cause a few different signatures of crashes
+that are worth noting here.
+
+ * Failures of the selftests/x86 code.  Usually a bug in one of the
+   more obscure corners of entry_64.S
+ * Crashes in early boot, especially around CPU bringup.  Bugs
+   in the trampoline code or mappings cause these.
+ * Crashes at the first interrupt.  Caused by bugs in entry_64.S,
+   like screwing up a page table switch.  Also caused by
+   incorrectly mapping the IRQ handler entry code.
+ * Crashes at the first NMI.  The NMI code is separate from main
+   interrupt handlers and can have bugs that do not affect
+   normal interrupts.  Also caused by incorrectly mapping NMI
+   code.  NMIs that interrupt the entry code must be very
+   careful and can be the cause of crashes that show up when
+   running perf.
+ * Kernel crashes at the first exit to userspace.  entry_64.S
+   bugs, or failing to map some of the exit code.
+ * Crashes at first interrupt that interrupts userspace. The paths
+   in entry_64.S that return to userspace are sometimes separate
+   from the ones that return to the kernel.
+ * Double faults: overflowing the kernel stack because of page
+   faults upon page faults.  Caused by touching non-pti-mapped
+   data in the entry code, or forgetting to switch to kernel
+   CR3 before calling into C functions which are not pti-mapped.
+ * Userspace segfaults early in boot, sometimes manifesting
+   as mount(8) failing to mount the rootfs.  These have
+   tended to be TLB invalidation issues.  Usually invalidating
+   the wrong PCID, or otherwise missing an invalidation.
+
+1. https://gruss.cc/files/kaiser.pdf
+2. https://meltdownattack.com/meltdown.pdf

From 99c6fa2511d8a683e61468be91b83f85452115fa Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 6 Jan 2018 11:49:23 +0000
Subject: [PATCH 683/808] x86/cpufeatures: Add X86_BUG_SPECTRE_V[12]

Add the bug bits for spectre v1/2 and force them unconditionally for all
cpus.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/cpu/common.c       | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 21ac898df2d8..1641c2f96363 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -342,5 +342,7 @@
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
 #define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2d3bd2215e5b..372ba3fb400f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->x86_vendor != X86_VENDOR_AMD)
 		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

From fee4380f368e84ed216b62ccd2fbc4126f2bf40b Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Mon, 18 Dec 2017 11:32:45 +0100
Subject: [PATCH 684/808] mtd: nand: pxa3xx: Fix READOOB implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the current driver, OOB bytes are accessed in raw mode, and when a
page access is done with NDCR_SPARE_EN set and NDCR_ECC_EN cleared, the
driver must read the whole spare area (64 bytes in case of a 2k page,
16 bytes for a 512 page). The driver was only reading the free OOB
bytes, which was leaving some unread data in the FIFO and was somehow
leading to a timeout.

We could patch the driver to read ->spare_size + ->ecc_size instead of
just ->spare_size when READOOB is requested, but we'd better make
in-band and OOB accesses consistent.
Since the driver is always accessing in-band data in non-raw mode (with
the ECC engine enabled), we should also access OOB data in this mode.
That's particularly useful when using the BCH engine because in this
mode the free OOB bytes are also ECC protected.

Fixes: 43bcfd2bb24a ("mtd: nand: pxa3xx: Add driver-specific ECC BCH support")
Cc: stable@vger.kernel.org
Reported-by: Sean Nyekjær <sean.nyekjaer@prevas.dk>
Tested-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Tested-by: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/pxa3xx_nand.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c
index 90b9a9ccbe60..9285f60e5783 100644
--- a/drivers/mtd/nand/pxa3xx_nand.c
+++ b/drivers/mtd/nand/pxa3xx_nand.c
@@ -963,6 +963,7 @@ static void prepare_start_command(struct pxa3xx_nand_info *info, int command)
 
 	switch (command) {
 	case NAND_CMD_READ0:
+	case NAND_CMD_READOOB:
 	case NAND_CMD_PAGEPROG:
 		info->use_ecc = 1;
 		break;

From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 4 Jan 2018 20:02:09 -0800
Subject: [PATCH 685/808] bpf: sockmap missing NULL psock check

Add psock NULL check to handle a racing sock event that can get the
sk_callback_lock before this case but after xchg happens causing the
refcnt to hit zero and sock user data (psock) to be null and queued
for garbage collection.

Also add a comment in the code because this is a bit subtle and
not obvious in my opinion.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
 
 		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
-		smap_list_remove(psock, &stab->sock_map[i]);
-		smap_release_sock(psock, sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_release_sock(psock, sock);
+		}
 		write_unlock_bh(&sock->sk_callback_lock);
 	}
 	rcu_read_unlock();

From 2b36047e7889b7efee22c11e17f035f721855731 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 5 Jan 2018 15:02:00 -0800
Subject: [PATCH 686/808] selftests/bpf: fix test_align

since commit 82abbf8d2fc4 the verifier rejects the bit-wise
arithmetic on pointers earlier.
The test 'dubious pointer arithmetic' now has less output to match on.
Adjust it.

Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers")
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_align.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 8591c89c0828..471bbbdb94db 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -474,27 +474,7 @@ static struct bpf_align_test tests[] = {
 		.result = REJECT,
 		.matches = {
 			{4, "R5=pkt(id=0,off=0,r=0,imm=0)"},
-			/* ptr & 0x40 == either 0 or 0x40 */
-			{5, "R5=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"},
-			/* ptr << 2 == unknown, (4n) */
-			{7, "R5=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
-			/* (4n) + 14 == (4n+2).  We blow our bounds, because
-			 * the add could overflow.
-			 */
-			{8, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
-			/* Checked s>=0 */
-			{10, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-			/* packet pointer + nonnegative (4n+2) */
-			{12, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-			{14, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-			/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
-			 * We checked the bounds, but it might have been able
-			 * to overflow if the packet pointer started in the
-			 * upper half of the address space.
-			 * So we did not get a 'range' on R6, and the access
-			 * attempt will fail.
-			 */
-			{16, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+			/* R5 bitwise operator &= on pointer prohibited */
 		}
 	},
 	{

From 7b6af2c53192f1766892ef40c8f48a413509ed72 Mon Sep 17 00:00:00 2001
From: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Date: Wed, 3 Jan 2018 21:13:45 +0100
Subject: [PATCH 687/808] leds: core: Fix regression caused by commit
 2b83ff96f51d

Commit 2b83ff96f51d ("led: core: Fix brightness setting when setting delay_off=0")
replaced del_timer_sync(&led_cdev->blink_timer) with led_stop_software_blink()
in led_blink_set(), which additionally clears LED_BLINK_SW flag as well as
zeroes blink_delay_on and blink_delay_off properties of the struct led_classdev.

Cleansing of the latter ones wasn't required to fix the original issue but
wasn't considered harmful. It nonetheless turned out to be so in case when
pointer to one or both props is passed to led_blink_set() like in the
ledtrig-timer.c. In such cases zeroes are passed later in delay_on and/or
delay_off arguments to led_blink_setup(), which results either in stopping
the software blinking or setting blinking frequency always to 1Hz.

Avoid using led_stop_software_blink() and add a single call required
to clear LED_BLINK_SW flag, which was the only needed modification to
fix the original issue.

Fixes 2b83ff96f51d ("led: core: Fix brightness setting when setting delay_off=0")
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index f3654fd2eaf3..ede4fa0ac2cc 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -186,8 +186,9 @@ void led_blink_set(struct led_classdev *led_cdev,
 		   unsigned long *delay_on,
 		   unsigned long *delay_off)
 {
-	led_stop_software_blink(led_cdev);
+	del_timer_sync(&led_cdev->blink_timer);
 
+	clear_bit(LED_BLINK_SW, &led_cdev->work_flags);
 	clear_bit(LED_BLINK_ONESHOT, &led_cdev->work_flags);
 	clear_bit(LED_BLINK_ONESHOT_STOP, &led_cdev->work_flags);
 

From b2cd1df66037e7c4697c7e40496bf7e4a5e16a2d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 7 Jan 2018 14:22:41 -0800
Subject: [PATCH 688/808] Linux 4.15-rc7

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index eb1f5973813e..eb59638035dd 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From 33c57c0d3c67f51f491a9d27108f7e97adc03d96 Mon Sep 17 00:00:00 2001
From: Karsten Merker <merker@debian.org>
Date: Thu, 4 Jan 2018 23:37:02 +0100
Subject: [PATCH 689/808] RISC-V: Add a basic defconfig

This patch provides a basic defconfig for the RISC-V
architecture that enables enough kernel features to run a
basic Linux distribution on qemu's "virt" board for native
software development. Features include:

- serial console
- virtio block and network device support
- VFAT and ext2/3/4 filesystem support
- NFS client and NFS rootfs support
- an assortment of other kernel features required for
  running systemd

It also enables a number of drivers for physical hardware
that target the "SiFive U500" SoC and the corresponding
development platform.  These include:

- PCIe host controller support for the FPGA-based U500
  development platform (PCIE_XILINX)
- USB host controller support (OHCI/EHCI/XHCI)
- USB HID (keyboard/mouse) support
- USB mass storage support (bulk and UAS)
- SATA support (AHCI)
- ethernet drivers (MACB for a SoC-internal MAC block, microsemi
  ethernet phy, E1000E and R8169 for PCIe-connected external devices)
- DRM and framebuffer console support for PCIe-connected
  Radeon graphics chips

Signed-off-by: Karsten Merker <merker@debian.org>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/configs/defconfig | 75 ++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index e69de29bb2d1..47dacf06c679 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -0,0 +1,75 @@
+CONFIG_SMP=y
+CONFIG_PCI=y
+CONFIG_PCIE_XILINX=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CFS_BANDWIDTH=y
+CONFIG_CGROUP_BPF=y
+CONFIG_NAMESPACES=y
+CONFIG_USER_NS=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_EXPERT=y
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_NETLINK_DIAG=y
+CONFIG_DEVTMPFS=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_SATA_AHCI_PLATFORM=y
+CONFIG_NETDEVICES=y
+CONFIG_VIRTIO_NET=y
+CONFIG_MACB=y
+CONFIG_E1000E=y
+CONFIG_R8169=y
+CONFIG_MICROSEMI_PHY=y
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+# CONFIG_PTP_1588_CLOCK is not set
+CONFIG_DRM=y
+CONFIG_DRM_RADEON=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_USB=y
+CONFIG_USB_XHCI_HCD=y
+CONFIG_USB_XHCI_PLATFORM=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_HCD_PLATFORM=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_OHCI_HCD_PLATFORM=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_UAS=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_RAS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_V4_1=y
+CONFIG_NFS_V4_2=y
+CONFIG_ROOT_NFS=y
+# CONFIG_RCU_TRACE is not set
+CONFIG_CRYPTO_USER_API_HASH=y

From 9e49a4ed072ab67b17238c5a45d7cba7f848659e Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Tue, 26 Dec 2017 19:11:22 -0800
Subject: [PATCH 690/808] RISC-V: Make __NR_riscv_flush_icache visible to
 userspace

We were hoping to avoid making this visible to userspace, but it looks
like we're going to have to because QEMU's user-mode emulation doesn't
want to emulate a vDSO.  Having vDSO-only system calls was a bit
unothodox anyway, so I think in this case it's OK to just make the
actual system call number public.

This patch simply moves the definition of __NR_riscv_flush_icache
availiable to userspace, which results in the deletion of the now empty
vdso-syscalls.h.

Changes since v1:

* I've moved the definition into uapi/asm/syscalls.h rathen than
  uapi/asm/unistd.h.  This allows me to keep asm/unistd.h, so we can
  keep the syscall table macros sane.
* As a side effect of the above, this no longer disables all system
  calls on RISC-V.  Whoops!

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/unistd.h        |  1 +
 arch/riscv/include/asm/vdso-syscalls.h | 28 --------------------------
 arch/riscv/include/uapi/asm/syscalls.h | 26 ++++++++++++++++++++++++
 arch/riscv/kernel/syscall_table.c      |  1 -
 arch/riscv/kernel/vdso/flush_icache.S  |  1 -
 5 files changed, 27 insertions(+), 30 deletions(-)
 delete mode 100644 arch/riscv/include/asm/vdso-syscalls.h
 create mode 100644 arch/riscv/include/uapi/asm/syscalls.h

diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h
index 9f250ed007cd..2f704a5c4196 100644
--- a/arch/riscv/include/asm/unistd.h
+++ b/arch/riscv/include/asm/unistd.h
@@ -14,3 +14,4 @@
 #define __ARCH_HAVE_MMU
 #define __ARCH_WANT_SYS_CLONE
 #include <uapi/asm/unistd.h>
+#include <uapi/asm/syscalls.h>
diff --git a/arch/riscv/include/asm/vdso-syscalls.h b/arch/riscv/include/asm/vdso-syscalls.h
deleted file mode 100644
index a2ccf1894929..000000000000
--- a/arch/riscv/include/asm/vdso-syscalls.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2017 SiFive
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _ASM_RISCV_VDSO_SYSCALLS_H
-#define _ASM_RISCV_VDSO_SYSCALLS_H
-
-#ifdef CONFIG_SMP
-
-/* These syscalls are only used by the vDSO and are not in the uapi. */
-#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
-__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
-
-#endif
-
-#endif /* _ASM_RISCV_VDSO_H */
diff --git a/arch/riscv/include/uapi/asm/syscalls.h b/arch/riscv/include/uapi/asm/syscalls.h
new file mode 100644
index 000000000000..818655b0d535
--- /dev/null
+++ b/arch/riscv/include/uapi/asm/syscalls.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 SiFive
+ */
+
+#ifndef _ASM__UAPI__SYSCALLS_H
+#define _ASM__UAPI__SYSCALLS_H
+
+/*
+ * Allows the instruction cache to be flushed from userspace.  Despite RISC-V
+ * having a direct 'fence.i' instruction available to userspace (which we
+ * can't trap!), that's not actually viable when running on Linux because the
+ * kernel might schedule a process on another hart.  There is no way for
+ * userspace to handle this without invoking the kernel (as it doesn't know the
+ * thread->hart mappings), so we've defined a RISC-V specific system call to
+ * flush the instruction cache.
+ *
+ * __NR_riscv_flush_icache is defined to flush the instruction cache over an
+ * address range, with the flush applying to either all threads or just the
+ * caller.  We don't currently do anything with the address range, that's just
+ * in there for forwards compatibility.
+ */
+#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
+__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
+
+#endif
diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c
index a5bd6401f95e..ade52b903a43 100644
--- a/arch/riscv/kernel/syscall_table.c
+++ b/arch/riscv/kernel/syscall_table.c
@@ -23,5 +23,4 @@
 void *sys_call_table[__NR_syscalls] = {
 	[0 ... __NR_syscalls - 1] = sys_ni_syscall,
 #include <asm/unistd.h>
-#include <asm/vdso-syscalls.h>
 };
diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S
index b0fbad74e873..023e4d4aef58 100644
--- a/arch/riscv/kernel/vdso/flush_icache.S
+++ b/arch/riscv/kernel/vdso/flush_icache.S
@@ -13,7 +13,6 @@
 
 #include <linux/linkage.h>
 #include <asm/unistd.h>
-#include <asm/vdso-syscalls.h>
 
 	.text
 /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */

From c163fb38ca34694b0cce99bb5604257bc29bf200 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 4 Jan 2018 18:35:02 +0100
Subject: [PATCH 691/808] riscv: remove CONFIG_MMU ifdefs

The RISC-V port doesn't suport a nommu mode, so there is no reason
to provide some code only under a CONFIG_MMU ifdef.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/io.h       |  4 ----
 arch/riscv/include/asm/pgtable.h  |  4 ----
 arch/riscv/include/asm/tlbflush.h |  4 ----
 arch/riscv/include/asm/uaccess.h  | 12 ------------
 4 files changed, 24 deletions(-)

diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h
index a82ce599b639..b269451e7e85 100644
--- a/arch/riscv/include/asm/io.h
+++ b/arch/riscv/include/asm/io.h
@@ -21,8 +21,6 @@
 
 #include <linux/types.h>
 
-#ifdef CONFIG_MMU
-
 extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);
 
 /*
@@ -36,8 +34,6 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);
 
 extern void iounmap(volatile void __iomem *addr);
 
-#endif /* CONFIG_MMU */
-
 /* Generic IO read/write.  These perform native-endian accesses. */
 #define __raw_writeb __raw_writeb
 static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 2cbd92ed1629..16301966d65b 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -20,8 +20,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_MMU
-
 /* Page Upper Directory not used in RISC-V */
 #include <asm-generic/pgtable-nopud.h>
 #include <asm/page.h>
@@ -413,8 +411,6 @@ static inline void pgtable_cache_init(void)
 	/* No page table caches to initialize */
 }
 
-#endif /* CONFIG_MMU */
-
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 715b0f10af58..7b9c24ebdf52 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -15,8 +15,6 @@
 #ifndef _ASM_RISCV_TLBFLUSH_H
 #define _ASM_RISCV_TLBFLUSH_H
 
-#ifdef CONFIG_MMU
-
 #include <linux/mm_types.h>
 
 /*
@@ -64,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 	flush_tlb_all();
 }
 
-#endif /* CONFIG_MMU */
-
 #endif /* _ASM_RISCV_TLBFLUSH_H */
diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 27b90d64814b..14b0b22fb578 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state);
  * call.
  */
 
-#ifdef CONFIG_MMU
 #define __get_user_asm(insn, x, ptr, err)			\
 do {								\
 	uintptr_t __tmp;					\
@@ -153,13 +152,11 @@ do {								\
 	__disable_user_access();				\
 	(x) = __x;						\
 } while (0)
-#endif /* CONFIG_MMU */
 
 #ifdef CONFIG_64BIT
 #define __get_user_8(x, ptr, err) \
 	__get_user_asm("ld", x, ptr, err)
 #else /* !CONFIG_64BIT */
-#ifdef CONFIG_MMU
 #define __get_user_8(x, ptr, err)				\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
@@ -193,7 +190,6 @@ do {								\
 	(x) = (__typeof__(x))((__typeof__((x)-(x)))(		\
 		(((u64)__hi << 32) | __lo)));			\
 } while (0)
-#endif /* CONFIG_MMU */
 #endif /* CONFIG_64BIT */
 
 
@@ -267,8 +263,6 @@ do {								\
 		((x) = 0, -EFAULT);				\
 })
 
-
-#ifdef CONFIG_MMU
 #define __put_user_asm(insn, x, ptr, err)			\
 do {								\
 	uintptr_t __tmp;					\
@@ -292,14 +286,11 @@ do {								\
 		: "rJ" (__x), "i" (-EFAULT));			\
 	__disable_user_access();				\
 } while (0)
-#endif /* CONFIG_MMU */
-
 
 #ifdef CONFIG_64BIT
 #define __put_user_8(x, ptr, err) \
 	__put_user_asm("sd", x, ptr, err)
 #else /* !CONFIG_64BIT */
-#ifdef CONFIG_MMU
 #define __put_user_8(x, ptr, err)				\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
@@ -329,7 +320,6 @@ do {								\
 		: "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT));	\
 	__disable_user_access();				\
 } while (0)
-#endif /* CONFIG_MMU */
 #endif /* CONFIG_64BIT */
 
 
@@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
  * will set "err" to -EFAULT, while successful accesses return the previous
  * value.
  */
-#ifdef CONFIG_MMU
 #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb)	\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
@@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
 	(err) = __err;						\
 	__ret;							\
 })
-#endif /* CONFIG_MMU */
 
 #endif /* _ASM_RISCV_UACCESS_H */

From 1125203c13b9da32125e171b4bd75e93d4918ddd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 4 Jan 2018 18:35:03 +0100
Subject: [PATCH 692/808] riscv: rename SR_* constants to match the spec

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/csr.h      |  8 ++++----
 arch/riscv/include/asm/irqflags.h | 10 +++++-----
 arch/riscv/include/asm/ptrace.h   |  2 +-
 arch/riscv/kernel/entry.S         |  8 ++++----
 arch/riscv/kernel/process.c       |  4 ++--
 arch/riscv/mm/fault.c             |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 0d64bc9f4f91..3c7a2c97e377 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -17,10 +17,10 @@
 #include <linux/const.h>
 
 /* Status register flags */
-#define SR_IE   _AC(0x00000002, UL) /* Interrupt Enable */
-#define SR_PIE  _AC(0x00000020, UL) /* Previous IE */
-#define SR_PS   _AC(0x00000100, UL) /* Previously Supervisor */
-#define SR_SUM  _AC(0x00040000, UL) /* Supervisor may access User Memory */
+#define SR_SIE	_AC(0x00000002, UL) /* Supervisor Interrupt Enable */
+#define SR_SPIE	_AC(0x00000020, UL) /* Previous Supervisor IE */
+#define SR_SPP	_AC(0x00000100, UL) /* Previously Supervisor */
+#define SR_SUM	_AC(0x00040000, UL) /* Supervisor may access User Memory */
 
 #define SR_FS           _AC(0x00006000, UL) /* Floating-point Status */
 #define SR_FS_OFF       _AC(0x00000000, UL)
diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h
index 6fdc860d7f84..07a3c6d5706f 100644
--- a/arch/riscv/include/asm/irqflags.h
+++ b/arch/riscv/include/asm/irqflags.h
@@ -27,25 +27,25 @@ static inline unsigned long arch_local_save_flags(void)
 /* unconditionally enable interrupts */
 static inline void arch_local_irq_enable(void)
 {
-	csr_set(sstatus, SR_IE);
+	csr_set(sstatus, SR_SIE);
 }
 
 /* unconditionally disable interrupts */
 static inline void arch_local_irq_disable(void)
 {
-	csr_clear(sstatus, SR_IE);
+	csr_clear(sstatus, SR_SIE);
 }
 
 /* get status and disable interrupts */
 static inline unsigned long arch_local_irq_save(void)
 {
-	return csr_read_clear(sstatus, SR_IE);
+	return csr_read_clear(sstatus, SR_SIE);
 }
 
 /* test flags */
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
-	return !(flags & SR_IE);
+	return !(flags & SR_SIE);
 }
 
 /* test hardware interrupt enable bit */
@@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void)
 /* set interrupt enabled status */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
-	csr_set(sstatus, flags & SR_IE);
+	csr_set(sstatus, flags & SR_SIE);
 }
 
 #endif /* _ASM_RISCV_IRQFLAGS_H */
diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h
index 93b8956e25e4..2c5df945d43c 100644
--- a/arch/riscv/include/asm/ptrace.h
+++ b/arch/riscv/include/asm/ptrace.h
@@ -66,7 +66,7 @@ struct pt_regs {
 #define REG_FMT "%08lx"
 #endif
 
-#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0)
+#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0)
 
 
 /* Helpers for working with the instruction pointer */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 20ee86f782a9..7404ec222406 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -196,7 +196,7 @@ handle_syscall:
 	addi s2, s2, 0x4
 	REG_S s2, PT_SEPC(sp)
 	/* System calls run with interrupts enabled */
-	csrs sstatus, SR_IE
+	csrs sstatus, SR_SIE
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
 	andi t0, t0, _TIF_SYSCALL_TRACE
@@ -224,8 +224,8 @@ ret_from_syscall:
 
 ret_from_exception:
 	REG_L s0, PT_SSTATUS(sp)
-	csrc sstatus, SR_IE
-	andi s0, s0, SR_PS
+	csrc sstatus, SR_SIE
+	andi s0, s0, SR_SPP
 	bnez s0, restore_all
 
 resume_userspace:
@@ -255,7 +255,7 @@ work_pending:
 	bnez s1, work_resched
 work_notifysig:
 	/* Handle pending signals and notify-resume requests */
-	csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */
+	csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */
 	move a0, sp /* pt_regs */
 	move a1, s0 /* current_thread_info->flags */
 	tail do_notify_resume
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 0d90dcc1fbd3..d74d4adf2d54 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs)
 void start_thread(struct pt_regs *regs, unsigned long pc,
 	unsigned long sp)
 {
-	regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL;
+	regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL;
 	regs->sepc = pc;
 	regs->sp = sp;
 	set_fs(USER_DS);
@@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 		const register unsigned long gp __asm__ ("gp");
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childregs->gp = gp;
-		childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */
+		childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */
 
 		p->thread.ra = (unsigned long)ret_from_kernel_thread;
 		p->thread.s[0] = usp; /* fn */
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index df2ca3c65048..0713f3c67ab4 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
 		goto vmalloc_fault;
 
 	/* Enable interrupts if they were enabled in the parent context. */
-	if (likely(regs->sstatus & SR_PIE))
+	if (likely(regs->sstatus & SR_SPIE))
 		local_irq_enable();
 
 	/*

From e2d5915293ffdff977ddcfc12b817b08c53ffa7a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 8 Jan 2018 14:54:32 +1100
Subject: [PATCH 693/808] powerpc/pseries: Make RAS IRQ explicitly dependent on
 DLPAR WQ

The hotplug code uses its own workqueue to handle IRQ requests
(pseries_hp_wq), however that workqueue is initialized after
init_ras_IRQ(). That can lead to a kernel panic if any hotplug
interrupts fire after init_ras_IRQ() but before pseries_hp_wq is
initialised. eg:

  UDP-Lite hash table entries: 2048 (order: 0, 65536 bytes)
  NET: Registered protocol family 1
  Unpacking initramfs...
  (qemu) object_add memory-backend-ram,id=mem1,size=10G
  (qemu) device_add pc-dimm,id=dimm1,memdev=mem1
  Unable to handle kernel paging request for data at address 0xf94d03007c421378
  Faulting instruction address: 0xc00000000012d744
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc2-ziviani+ #26
  task:         (ptrval) task.stack:         (ptrval)
  NIP:  c00000000012d744 LR: c00000000012d744 CTR: 0000000000000000
  REGS:         (ptrval) TRAP: 0380   Not tainted  (4.15.0-rc2-ziviani+)
  MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 28088042  XER: 20040000
  CFAR: c00000000012d3c4 SOFTE: 0
  ...
  NIP [c00000000012d744] __queue_work+0xd4/0x5c0
  LR [c00000000012d744] __queue_work+0xd4/0x5c0
  Call Trace:
  [c0000000fffefb90] [c00000000012d744] __queue_work+0xd4/0x5c0 (unreliable)
  [c0000000fffefc70] [c00000000012dce4] queue_work_on+0xb4/0xf0

This commit makes the RAS IRQ registration explicitly dependent on the
creation of the pseries_hp_wq.

Reported-by: Min Deng <mdeng@redhat.com>
Reported-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Tested-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c   | 21 ++++++++++++++++++---
 arch/powerpc/platforms/pseries/pseries.h |  2 ++
 arch/powerpc/platforms/pseries/ras.c     |  3 ++-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 6e35780c5962..a0b20c03f078 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,
 
 static CLASS_ATTR_RW(dlpar);
 
-static int __init pseries_dlpar_init(void)
+int __init dlpar_workqueue_init(void)
 {
+	if (pseries_hp_wq)
+		return 0;
+
 	pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
-					WQ_UNBOUND, 1);
+			WQ_UNBOUND, 1);
+
+	return pseries_hp_wq ? 0 : -ENOMEM;
+}
+
+static int __init dlpar_sysfs_init(void)
+{
+	int rc;
+
+	rc = dlpar_workqueue_init();
+	if (rc)
+		return rc;
+
 	return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
 }
-machine_device_initcall(pseries, pseries_dlpar_init);
+machine_device_initcall(pseries, dlpar_sysfs_init);
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 4470a3194311..1ae1d9f4dbe9 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void)
 	return CMO_PageSize;
 }
 
+int dlpar_workqueue_init(void);
+
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 4923ffe230cf..81d8614e7379 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void)
 	/* Hotplug Events */
 	np = of_find_node_by_path("/event-sources/hot-plug-events");
 	if (np != NULL) {
-		request_event_sources_irqs(np, ras_hotplug_interrupt,
+		if (dlpar_workqueue_init() == 0)
+			request_event_sources_irqs(np, ras_hotplug_interrupt,
 					   "RAS_HOTPLUG");
 		of_node_put(np);
 	}

From 65e7439204b57b7a7f6e4694f9e2a9adde5e77ed Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@intel.com>
Date: Thu, 21 Dec 2017 10:29:32 +0800
Subject: [PATCH 694/808] drm/i915/gvt: Fix stack-out-of-bounds bug in cmd
 parser

for_each_set_bit() only accepts variable of type unsigned long, and we can
not cast it from smaller types.

[   16.499365] ==================================================================
[   16.506655] BUG: KASAN: stack-out-of-bounds in find_first_bit+0x1d/0x70
[   16.513313] Read of size 8 at addr ffff8803616cf510 by task systemd-udevd/180
[   16.521998] CPU: 0 PID: 180 Comm: systemd-udevd Tainted: G     U     O     4.15.0-rc3+ #14
[   16.530317] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 01/26/2016
[   16.537760] Call Trace:
[   16.540230]  dump_stack+0x7c/0xbb
[   16.543569]  print_address_description+0x6b/0x290
[   16.548306]  kasan_report+0x28a/0x370
[   16.551993]  ? find_first_bit+0x1d/0x70
[   16.555858]  find_first_bit+0x1d/0x70
[   16.559625]  intel_gvt_init_cmd_parser+0x127/0x3c0 [i915]
[   16.565060]  ? __lock_is_held+0x8f/0xf0
[   16.568990]  ? intel_gvt_clean_cmd_parser+0x10/0x10 [i915]
[   16.574514]  ? __hrtimer_init+0x5d/0xb0
[   16.578445]  intel_gvt_init_device+0x2c3/0x690 [i915]
[   16.583537]  ? unregister_module_notifier+0x20/0x20
[   16.588515]  intel_gvt_init+0x89/0x100 [i915]
[   16.592962]  i915_driver_load+0x1992/0x1c70 [i915]
[   16.597846]  ? __i915_printk+0x210/0x210 [i915]
[   16.602410]  ? wait_for_completion+0x280/0x280
[   16.606883]  ? lock_downgrade+0x2c0/0x2c0
[   16.610923]  ? __pm_runtime_resume+0x46/0x90
[   16.615238]  ? acpi_dev_found+0x76/0x80
[   16.619162]  ? i915_pci_remove+0x30/0x30 [i915]
[   16.623733]  local_pci_probe+0x74/0xe0
[   16.627518]  pci_device_probe+0x208/0x310
[   16.631561]  ? pci_device_remove+0x100/0x100
[   16.635871]  ? __list_add_valid+0x29/0xa0
[   16.639919]  driver_probe_device+0x40b/0x6b0
[   16.644223]  ? driver_probe_device+0x6b0/0x6b0
[   16.648696]  __driver_attach+0x11d/0x130
[   16.652649]  bus_for_each_dev+0xe7/0x160
[   16.656600]  ? subsys_dev_iter_exit+0x10/0x10
[   16.660987]  ? __list_add_valid+0x29/0xa0
[   16.665028]  bus_add_driver+0x31d/0x3a0
[   16.668893]  driver_register+0xc6/0x170
[   16.672758]  ? 0xffffffffc0ad8000
[   16.676108]  do_one_initcall+0x9c/0x206
[   16.679984]  ? initcall_blacklisted+0x150/0x150
[   16.684545]  ? do_init_module+0x35/0x33b
[   16.688494]  ? kasan_unpoison_shadow+0x31/0x40
[   16.692968]  ? kasan_kmalloc+0xa6/0xd0
[   16.696743]  ? do_init_module+0x35/0x33b
[   16.700694]  ? kasan_unpoison_shadow+0x31/0x40
[   16.705168]  ? __asan_register_globals+0x82/0xa0
[   16.709819]  do_init_module+0xe7/0x33b
[   16.713597]  load_module+0x4481/0x4ce0
[   16.717397]  ? module_frob_arch_sections+0x20/0x20
[   16.722228]  ? vfs_read+0x13b/0x190
[   16.725742]  ? kernel_read+0x74/0xa0
[   16.729351]  ? get_user_arg_ptr.isra.17+0x70/0x70
[   16.734099]  ? SYSC_finit_module+0x175/0x1b0
[   16.738399]  SYSC_finit_module+0x175/0x1b0
[   16.742524]  ? SYSC_init_module+0x1e0/0x1e0
[   16.746741]  ? __fget+0x157/0x240
[   16.750090]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[   16.754747]  entry_SYSCALL_64_fastpath+0x23/0x9a
[   16.759397] RIP: 0033:0x7f8fbc837499
[   16.762996] RSP: 002b:00007ffead76c138 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
[   16.770618] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 00007f8fbc837499
[   16.777800] RDX: 0000000000000000 RSI: 000056484e67b080 RDI: 0000000000000012
[   16.784979] RBP: 00007ffead76b140 R08: 0000000000000000 R09: 0000000000000021
[   16.792164] R10: 0000000000000012 R11: 0000000000000246 R12: 000056484e67b460
[   16.799345] R13: 00007ffead76b120 R14: 0000000000000005 R15: 0000000000000000
[   16.808052] The buggy address belongs to the page:
[   16.812876] page:00000000dc4b8c1e count:0 mapcount:0 mapping:          (null) index:0x0
[   16.820934] flags: 0x17ffffc0000000()
[   16.824621] raw: 0017ffffc0000000 0000000000000000 0000000000000000 00000000ffffffff
[   16.832416] raw: ffffea000d85b3e0 ffffea000d85b3e0 0000000000000000 0000000000000000
[   16.840208] page dumped because: kasan: bad access detected
[   16.847318] Memory state around the buggy address:
[   16.852143]  ffff8803616cf400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   16.859427]  ffff8803616cf480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1
[   16.866708] >ffff8803616cf500: f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00 00 00
[   16.873988]                          ^
[   16.877770]  ffff8803616cf580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   16.885042]  ffff8803616cf600: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
[   16.892312] ==================================================================

Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/cmd_parser.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 701a3c6f1669..9d12090939e3 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -2777,12 +2777,12 @@ int intel_gvt_scan_and_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
 }
 
 static struct cmd_info *find_cmd_entry_any_ring(struct intel_gvt *gvt,
-		unsigned int opcode, int rings)
+		unsigned int opcode, unsigned long rings)
 {
 	struct cmd_info *info = NULL;
 	unsigned int ring;
 
-	for_each_set_bit(ring, (unsigned long *)&rings, I915_NUM_ENGINES) {
+	for_each_set_bit(ring, &rings, I915_NUM_ENGINES) {
 		info = find_cmd_entry(gvt, opcode, ring);
 		if (info)
 			break;

From 6b018235b4daabae96d855219fae59c3fb8be417 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne@redhat.com>
Date: Fri, 5 Jan 2018 12:44:06 -0500
Subject: [PATCH 695/808] nvme-fabrics: initialize default host->id in
 nvmf_host_default()

The field was uninitialized before use.

Signed-off-by: Ewan D. Milne <emilne@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 76b4fe6816a0..894c2ccb3891 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -74,6 +74,7 @@ static struct nvmf_host *nvmf_host_default(void)
 		return NULL;
 
 	kref_init(&host->ref);
+	uuid_gen(&host->id);
 	snprintf(host->nqn, NVMF_NQN_SIZE,
 		"nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id);
 

From 87590ce6e373d1a5401f6539f0c59ef92dd924a9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 7 Jan 2018 22:48:00 +0100
Subject: [PATCH 696/808] sysfs/cpu: Add vulnerability folder

As the meltdown/spectre problem affects several CPU architectures, it makes
sense to have common way to express whether a system is affected by a
particular vulnerability or not. If affected the way to express the
mitigation should be common as well.

Create /sys/devices/system/cpu/vulnerabilities folder and files for
meltdown, spectre_v1 and spectre_v2.

Allow architectures to override the show function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lkml.kernel.org/r/20180107214913.096657732@linutronix.de
---
 .../ABI/testing/sysfs-devices-system-cpu      | 16 +++++++
 drivers/base/Kconfig                          |  3 ++
 drivers/base/cpu.c                            | 48 +++++++++++++++++++
 include/linux/cpu.h                           |  7 +++
 4 files changed, 74 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index f3d5817c4ef0..bd3a88e16d8b 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -373,3 +373,19 @@ Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	information about CPUs heterogeneity.
 
 		cpu_capacity: capacity of cpu#.
+
+What:		/sys/devices/system/cpu/vulnerabilities
+		/sys/devices/system/cpu/vulnerabilities/meltdown
+		/sys/devices/system/cpu/vulnerabilities/spectre_v1
+		/sys/devices/system/cpu/vulnerabilities/spectre_v2
+Date:		Januar 2018
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Information about CPU vulnerabilities
+
+		The files are named after the code names of CPU
+		vulnerabilities. The output of those files reflects the
+		state of the CPUs in the system. Possible output values:
+
+		"Not affected"	  CPU is not affected by the vulnerability
+		"Vulnerable"	  CPU is affected and no mitigation in effect
+		"Mitigation: $M"  CPU is affetcted and mitigation $M is in effect
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 2f6614c9a229..37a71fd9043f 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES
 config GENERIC_CPU_AUTOPROBE
 	bool
 
+config GENERIC_CPU_VULNERABILITIES
+	bool
+
 config SOC_BUS
 	bool
 	select GLOB
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..825964efda1d 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -501,10 +501,58 @@ static void __init cpu_dev_register_generic(void)
 #endif
 }
 
+#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES
+
+ssize_t __weak cpu_show_meltdown(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_spectre_v1(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_spectre_v2(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
+static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+
+static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+	&dev_attr_meltdown.attr,
+	&dev_attr_spectre_v1.attr,
+	&dev_attr_spectre_v2.attr,
+	NULL
+};
+
+static const struct attribute_group cpu_root_vulnerabilities_group = {
+	.name  = "vulnerabilities",
+	.attrs = cpu_root_vulnerabilities_attrs,
+};
+
+static void __init cpu_register_vulnerabilities(void)
+{
+	if (sysfs_create_group(&cpu_subsys.dev_root->kobj,
+			       &cpu_root_vulnerabilities_group))
+		pr_err("Unable to register CPU vulnerabilities\n");
+}
+
+#else
+static inline void cpu_register_vulnerabilities(void) { }
+#endif
+
 void __init cpu_dev_init(void)
 {
 	if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups))
 		panic("Failed to register CPU subsystem");
 
 	cpu_dev_register_generic();
+	cpu_register_vulnerabilities();
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 938ea8ae0ba4..c816e6f2730c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -47,6 +47,13 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
 extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
 extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
+extern ssize_t cpu_show_meltdown(struct device *dev,
+				 struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spectre_v1(struct device *dev,
+				   struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spectre_v2(struct device *dev,
+				   struct device_attribute *attr, char *buf);
+
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
 				 const struct attribute_group **groups,

From 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 7 Jan 2018 22:48:01 +0100
Subject: [PATCH 697/808] x86/cpu: Implement CPU vulnerabilites sysfs functions

Implement the CPU vulnerabilty show functions for meltdown, spectre_v1 and
spectre_v2.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de
---
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd5199de231e..e23d21ac745a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,6 +89,7 @@ config X86
 	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select GENERIC_CMOS_UPDATE
 	select GENERIC_CPU_AUTOPROBE
+	select GENERIC_CPU_VULNERABILITIES
 	select GENERIC_EARLY_IOREMAP
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IOMAP
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ba0b2424c9b0..76ad6cb44b40 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -10,6 +10,7 @@
  */
 #include <linux/init.h>
 #include <linux/utsname.h>
+#include <linux/cpu.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/processor-flags.h>
@@ -60,3 +61,31 @@ void __init check_bugs(void)
 		set_memory_4k((unsigned long)__va(0), 1);
 #endif
 }
+
+#ifdef CONFIG_SYSFS
+ssize_t cpu_show_meltdown(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		return sprintf(buf, "Not affected\n");
+	if (boot_cpu_has(X86_FEATURE_PTI))
+		return sprintf(buf, "Mitigation: PTI\n");
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+		return sprintf(buf, "Not affected\n");
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		return sprintf(buf, "Not affected\n");
+	return sprintf(buf, "Vulnerable\n");
+}
+#endif

From 29159a4ed7044c52e3e2cf1a9fb55cec4745c60b Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 8 Jan 2018 13:58:31 +0100
Subject: [PATCH 698/808] ALSA: pcm: Abort properly at pending signal in OSS
 read/write loops

The loops for read and write in PCM OSS emulation have no proper check
of pending signals, and they keep processing even after user tries to
break.  This results in a very long delay, often seen as RCU stall
when a huge unprocessed bytes remain queued.  The bug could be easily
triggered by syzkaller.

As a simple workaround, this patch adds the proper check of pending
signals and aborts the loop appropriately.

Reported-by: syzbot+993cb4cfcbbff3947c21@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/oss/pcm_oss.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index ceaa51f76591..e317964bd2ea 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1381,6 +1381,10 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha
 			    tmp != runtime->oss.period_bytes)
 				break;
 		}
+		if (signal_pending(current)) {
+			tmp = -ERESTARTSYS;
+			goto err;
+		}
 	}
 	mutex_unlock(&runtime->oss.params_lock);
 	return xfer;
@@ -1466,6 +1470,10 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use
 			bytes -= tmp;
 			xfer += tmp;
 		}
+		if (signal_pending(current)) {
+			tmp = -ERESTARTSYS;
+			goto err;
+		}
 	}
 	mutex_unlock(&runtime->oss.params_lock);
 	return xfer;

From 0dd6d272d39c7c1fe2f4253197b505f2b66538ee Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nick.desaulniers@gmail.com>
Date: Sat, 23 Dec 2017 21:50:13 -0500
Subject: [PATCH 699/808] x86/xen/time: fix section mismatch for
 xen_init_time_ops()

The header declares this function as __init but is defined in __ref
section.

Signed-off-by: Nick Desaulniers <nick.desaulniers@gmail.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/xen-ops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f96dbedb33d4..1a7a9469e5a7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -71,7 +71,7 @@ u64 xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
 void xen_save_time_memory_area(void);
 void xen_restore_time_memory_area(void);
-void __init xen_init_time_ops(void);
+void __ref xen_init_time_ops(void);
 void __init xen_hvm_init_time_ops(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);

From 66a640e7823da803fdb68d5d88f7a8fbd11c29e6 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nick.desaulniers@gmail.com>
Date: Sat, 6 Jan 2018 13:39:48 -0800
Subject: [PATCH 700/808] x86: xen: remove the use of VLAIS

Variable Length Arrays In Structs (VLAIS) is not supported by Clang, and
frowned upon by others.

https://lkml.org/lkml/2013/9/23/500

Here, the VLAIS was used because the size of the bitmap returned from
xen_mc_entry() depended on possibly (based on kernel configuration)
runtime sized data. Rather than declaring args as a VLAIS then calling
sizeof on *args, we calculate the appropriate sizeof args manually.
Further, we can get rid of the #ifdef's and rely on num_possible_cpus()
(thanks to a helpful checkpatch warning from an earlier version of this
patch).

Suggested-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Nick Desaulniers <nick.desaulniers@gmail.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/mmu_pv.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 7118f776cd49..aa701d2a5023 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1339,20 +1339,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 {
 	struct {
 		struct mmuext_op op;
-#ifdef CONFIG_SMP
-		DECLARE_BITMAP(mask, num_processors);
-#else
 		DECLARE_BITMAP(mask, NR_CPUS);
-#endif
 	} *args;
 	struct multicall_space mcs;
+	const size_t mc_entry_size = sizeof(args->op) +
+		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
 	trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
 
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
 
-	mcs = xen_mc_entry(sizeof(*args));
+	mcs = xen_mc_entry(mc_entry_size);
 	args = mcs.args;
 	args->op.arg2.vcpumask = to_cpumask(args->mask);
 

From 900498a34a3ac9c611e9b425094c8106bdd7dc1c Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 8 Jan 2018 14:03:53 +0100
Subject: [PATCH 701/808] ALSA: pcm: Allow aborting mutex lock at OSS
 read/write loops

PCM OSS read/write loops keep taking the mutex lock for the whole
read/write, and this might take very long when the exceptionally high
amount of data is given.  Also, since it invokes with mutex_lock(),
the concurrent read/write becomes unbreakable.

This patch tries to address these issues by replacing mutex_lock()
with mutex_lock_interruptible(), and also splits / re-takes the lock
at each read/write period chunk, so that it can switch the context
more finely if requested.

Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/oss/pcm_oss.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index e317964bd2ea..c2db7e905f7d 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1334,8 +1334,11 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha
 
 	if ((tmp = snd_pcm_oss_make_ready(substream)) < 0)
 		return tmp;
-	mutex_lock(&runtime->oss.params_lock);
 	while (bytes > 0) {
+		if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
+			tmp = -ERESTARTSYS;
+			break;
+		}
 		if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) {
 			tmp = bytes;
 			if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes)
@@ -1379,18 +1382,18 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha
 			xfer += tmp;
 			if ((substream->f_flags & O_NONBLOCK) != 0 &&
 			    tmp != runtime->oss.period_bytes)
-				break;
+				tmp = -EAGAIN;
 		}
+ err:
+		mutex_unlock(&runtime->oss.params_lock);
+		if (tmp < 0)
+			break;
 		if (signal_pending(current)) {
 			tmp = -ERESTARTSYS;
-			goto err;
+			break;
 		}
+		tmp = 0;
 	}
-	mutex_unlock(&runtime->oss.params_lock);
-	return xfer;
-
- err:
-	mutex_unlock(&runtime->oss.params_lock);
 	return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp;
 }
 
@@ -1438,8 +1441,11 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use
 
 	if ((tmp = snd_pcm_oss_make_ready(substream)) < 0)
 		return tmp;
-	mutex_lock(&runtime->oss.params_lock);
 	while (bytes > 0) {
+		if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
+			tmp = -ERESTARTSYS;
+			break;
+		}
 		if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) {
 			if (runtime->oss.buffer_used == 0) {
 				tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1);
@@ -1470,16 +1476,16 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use
 			bytes -= tmp;
 			xfer += tmp;
 		}
+ err:
+		mutex_unlock(&runtime->oss.params_lock);
+		if (tmp < 0)
+			break;
 		if (signal_pending(current)) {
 			tmp = -ERESTARTSYS;
-			goto err;
+			break;
 		}
+		tmp = 0;
 	}
-	mutex_unlock(&runtime->oss.params_lock);
-	return xfer;
-
- err:
-	mutex_unlock(&runtime->oss.params_lock);
 	return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp;
 }
 

From dba04eb76df982703fefc021a4d278347b6176a9 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 8 Jan 2018 16:27:31 +0100
Subject: [PATCH 702/808] locking/Documentation: Remove stale
 crossrelease_fullstack parameter

The cross-release lockdep functionality has been removed in:

   e966eaeeb623: ("locking/lockdep: Remove the cross-release locking checks")

... leaving the kernel parameter docs behind. The code handling
the parameter does not exist so this is a plain documentation change.

Signed-off-by: David Sterba <dsterba@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Cc: linux-doc@vger.kernel.org
Link: http://lkml.kernel.org/r/20180108152731.27613-1-dsterba@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index af7104aaffd9..a626465dd877 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -713,9 +713,6 @@
 			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.
 
-	crossrelease_fullstack
-			[KNL] Allow to record full stack trace in cross-release
-
 	cryptomgr.notests
                         [KNL] Disable crypto self-tests
 

From 262b6b30087246abf09d6275eb0c0dc421bcbe38 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Sat, 6 Jan 2018 18:41:14 +0100
Subject: [PATCH 703/808] x86/tboot: Unbreak tboot with PTI enabled

This is another case similar to what EFI does: create a new set of
page tables, map some code at a low address, and jump to it.  PTI
mistakes this low address for userspace and mistakenly marks it
non-executable in an effort to make it unusable for userspace.

Undo the poison to allow execution.

Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig")
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Jeff Law <law@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: David" <dwmw@amazon.co.uk>
Cc: Nick Clifton <nickc@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com
---
 arch/x86/kernel/tboot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a4eb27918ceb..75869a4b6c41 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
 	if (!p4d)
 		return -1;
+	pgd->pgd &= ~_PAGE_NX;
 	pud = pud_alloc(&tboot_mm, p4d, vaddr);
 	if (!pud)
 		return -1;

From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 8 Jan 2018 17:27:19 +0100
Subject: [PATCH 704/808] locking/lockdep: Remove cross-release leftovers

There's two cross-release leftover facilities:

 - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently)
 - the complete_release_commit() callback (NOP as well)

Remove them.

Cc: David Sterba <dsterba@suse.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/completion.h | 1 -
 include/linux/irqflags.h   | 4 ----
 include/linux/lockdep.h    | 2 --
 kernel/sched/completion.c  | 5 -----
 4 files changed, 12 deletions(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 94a59ba7d422..519e94915d18 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -32,7 +32,6 @@ struct completion {
 #define init_completion(x) __init_completion(x)
 static inline void complete_acquire(struct completion *x) {}
 static inline void complete_release(struct completion *x) {}
-static inline void complete_release_commit(struct completion *x) {}
 
 #define COMPLETION_INITIALIZER(work) \
 	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 46cb57d5eb13..1b3996ff3f16 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -27,22 +27,18 @@
 # define trace_hardirq_enter()			\
 do {						\
 	current->hardirq_context++;		\
-	crossrelease_hist_start(XHLOCK_HARD);	\
 } while (0)
 # define trace_hardirq_exit()			\
 do {						\
 	current->hardirq_context--;		\
-	crossrelease_hist_end(XHLOCK_HARD);	\
 } while (0)
 # define lockdep_softirq_enter()		\
 do {						\
 	current->softirq_context++;		\
-	crossrelease_hist_start(XHLOCK_SOFT);	\
 } while (0)
 # define lockdep_softirq_exit()			\
 do {						\
 	current->softirq_context--;		\
-	crossrelease_hist_end(XHLOCK_SOFT);	\
 } while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 2e75dc34bff5..3251d9c0d313 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -475,8 +475,6 @@ enum xhlock_context_t {
 #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
 	{ .name = (_name), .key = (void *)(_key), }
 
-static inline void crossrelease_hist_start(enum xhlock_context_t c) {}
-static inline void crossrelease_hist_end(enum xhlock_context_t c) {}
 static inline void lockdep_invariant_state(bool force) {}
 static inline void lockdep_init_task(struct task_struct *task) {}
 static inline void lockdep_free_task(struct task_struct *task) {}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 
-	/*
-	 * Perform commit of crossrelease here.
-	 */
-	complete_release_commit(x);
-
 	if (x->done != UINT_MAX)
 		x->done++;
 	__wake_up_locked(&x->wait, TASK_NORMAL, 1);

From 8d56eff266f3e41a6c39926269c4c3f58f881a8e Mon Sep 17 00:00:00 2001
From: Jike Song <albcamus@gmail.com>
Date: Tue, 9 Jan 2018 00:03:41 +0800
Subject: [PATCH 705/808] x86/mm/pti: Remove dead logic in
 pti_user_pagetable_walk*()

The following code contains dead logic:

 162 if (pgd_none(*pgd)) {
 163         unsigned long new_p4d_page = __get_free_page(gfp);
 164         if (!new_p4d_page)
 165                 return NULL;
 166
 167         if (pgd_none(*pgd)) {
 168                 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
 169                 new_p4d_page = 0;
 170         }
 171         if (new_p4d_page)
 172                 free_page(new_p4d_page);
 173 }

There can't be any difference between two pgd_none(*pgd) at L162 and L167,
so it's always false at L171.

Dave Hansen explained:

 Yes, the double-test was part of an optimization where we attempted to
 avoid using a global spinlock in the fork() path.  We would check for
 unallocated mid-level page tables without the lock.  The lock was only
 taken when we needed to *make* an entry to avoid collisions.

 Now that it is all single-threaded, there is no chance of a collision,
 no need for a lock, and no need for the re-check.

As all these functions are only called during init, mark them __init as
well.

Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs")
Signed-off-by: Jike Song <albcamus@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Jiri Koshina <jikos@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Kees Cook <keescook@google.com>
Cc: Andi Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg KH <gregkh@linux-foundation.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paul Turner <pjt@google.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com
---
 arch/x86/mm/pti.c | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 43d4a4a29037..ce38f165489b 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
  *
  * Returns a pointer to a P4D on success, or NULL on failure.
  */
-static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
 {
 	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
 		if (!new_p4d_page)
 			return NULL;
 
-		if (pgd_none(*pgd)) {
-			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
-			new_p4d_page = 0;
-		}
-		if (new_p4d_page)
-			free_page(new_p4d_page);
+		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
 	}
 	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 
@@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
  *
  * Returns a pointer to a PMD on success, or NULL on failure.
  */
-static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
@@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 		if (!new_pud_page)
 			return NULL;
 
-		if (p4d_none(*p4d)) {
-			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
-			new_pud_page = 0;
-		}
-		if (new_pud_page)
-			free_page(new_pud_page);
+		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
 	}
 
 	pud = pud_offset(p4d, address);
@@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 		if (!new_pmd_page)
 			return NULL;
 
-		if (pud_none(*pud)) {
-			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
-			new_pmd_page = 0;
-		}
-		if (new_pmd_page)
-			free_page(new_pmd_page);
+		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
 	}
 
 	return pmd_offset(pud, address);
@@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 		if (!new_pte_page)
 			return NULL;
 
-		if (pmd_none(*pmd)) {
-			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
-			new_pte_page = 0;
-		}
-		if (new_pte_page)
-			free_page(new_pte_page);
+		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
 	}
 
 	pte = pte_offset_kernel(pmd, address);

From 98b8e4e5c17bf87c1b18ed929472051dab39878c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 3 Jan 2018 12:49:29 +0100
Subject: [PATCH 706/808] platform/x86: wmi: Call acpi_wmi_init() later

Calling acpi_wmi_init() at the subsys_initcall() level causes ordering
issues to appear on some systems and they are difficult to reproduce,
because there is no guaranteed ordering between subsys_initcall()
calls, so they may occur in different orders on different systems.

In particular, commit 86d9f48534e8 (mm/slab: fix kmemcg cache
creation delayed issue) exposed one of these issues where genl_init()
and acpi_wmi_init() are both called at the same initcall level, but
the former must run before the latter so as to avoid a NULL pointer
dereference.

For this reason, move the acpi_wmi_init() invocation to the
initcall_sync level which should still be early enough for things
to work correctly in the WMI land.

Link: https://marc.info/?t=151274596700002&r=1&w=2
Reported-by: Jonathan McDowell <noodles@earth.li>
Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Tested-by: Jonathan McDowell <noodles@earth.li>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 791449a2370f..daa68acbc900 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -1458,5 +1458,5 @@ static void __exit acpi_wmi_exit(void)
 	class_unregister(&wmi_bus_class);
 }
 
-subsys_initcall(acpi_wmi_init);
+subsys_initcall_sync(acpi_wmi_init);
 module_exit(acpi_wmi_exit);

From 9d0513d82f1a8fe17b41f113ac5922fa57dbaf5c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 28 Dec 2017 14:25:23 +0200
Subject: [PATCH 707/808] x86/platform/intel-mid: Revert "Make 'bt_sfi_data'
 const"

So one of the constification patches unearthed a type casting fragility
of the underlying code:

  276c87054751 ("x86/platform/intel-mid: Make 'bt_sfi_data' const")

converted the struct to be const while it is also used as a temporary
container for important data that is used to fill 'parent' and 'name'
fields in struct platform_device_info.

The compiler doesn't notice this due to an explicit type cast that loses
the const - which fragility will be fixed separately.

This type cast turned a seemingly trivial const propagation patch into a
hard to debug data corruptor and crasher bug.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Bhumika Goyal <bhumirks@gmail.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: julia.lawall@lip6.fr
Cc: platform-driver-x86@vger.kernel.org
Link: http://lkml.kernel.org/r/20171228122523.21802-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/intel-mid/device_libs/platform_bt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
index dc036e511f48..5a0483e7bf66 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
 	return 0;
 }
 
-static const struct bt_sfi_data tng_bt_sfi_data __initdata = {
+static struct bt_sfi_data tng_bt_sfi_data __initdata = {
 	.setup	= tng_bt_sfi_setup,
 };
 

From 414a2dc138838642d28938506e31ad461648b898 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 2 Jan 2018 12:13:10 +0100
Subject: [PATCH 708/808] sched/isolation: Make CONFIG_CPU_ISOLATION=y depend
 on SMP or COMPILE_TEST

On uniprocessor systems, critical and non-critical tasks cannot be
isolated, as there is only a single CPU core.  Hence enabling CPU
isolation by default on such systems does not make much sense.

Instead of changing the default for !SMP, fix this by making the feature
depend on SMP, with an override for compile-testing.  Note that its sole
selector (NO_HZ_FULL) already depends on SMP.

This decreases kernel size for a default uniprocessor kernel by ca. 1 KiB.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Nicolas Pitre <nico@linaro.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 2c43838c99d9d23f ("sched/isolation: Enable CONFIG_CPU_ISOLATION=y by default")
Link: http://lkml.kernel.org/r/1514891590-20782-1-git-send-email-geert@linux-m68k.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/init/Kconfig b/init/Kconfig
index 690a381adee0..c1221332e128 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,6 +461,7 @@ endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
 	bool "CPU isolation"
+	depends on SMP || COMPILE_TEST
 	default y
 	help
 	  Make sure that CPUs running critical tasks are not disturbed by

From f328299e54a94998b31baf788d2b33d8122a4acb Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 29 Dec 2017 13:53:03 -0600
Subject: [PATCH 709/808] locking/refcounts: Remove stale comment from the
 ARCH_HAS_REFCOUNT Kconfig entry

ARCH_HAS_REFCOUNT is no longer marked as broken ('if BROKEN'), so remove
the stale comment regarding it being broken.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171229195303.17781-1-ebiggers3@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4fc98c50378..ff4e9cd99854 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,7 +55,6 @@ config X86
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
-	# Causing hangs/crashes, see the commit that added this change for details.
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY

From 7deea450eb912f269d999de62c8ab922d1461748 Mon Sep 17 00:00:00 2001
From: Sunil Challa <sunilkumar.challa@broadcom.com>
Date: Thu, 4 Jan 2018 18:46:54 -0500
Subject: [PATCH 710/808] bnxt_en: Fix population of flow_type in
 bnxt_hwrm_cfa_flow_alloc()

flow_type in HWRM_FLOW_ALLOC is not being populated correctly due to
incorrect passing of pointer and size of l3_mask argument of is_wildcard().
Fixed this.

Fixes: db1d36a27324 ("bnxt_en: add TC flower offload flow_alloc/free FW cmds")
Signed-off-by: Sunil Challa <sunilkumar.challa@broadcom.com>
Reviewed-by: Sathya Perla <sathya.perla@broadcom.com>
Reviewed-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 3d201d7324bd..d8fee26cd45e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -421,7 +421,7 @@ static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow,
 	}
 
 	/* If all IP and L4 fields are wildcarded then this is an L2 flow */
-	if (is_wildcard(&l3_mask, sizeof(l3_mask)) &&
+	if (is_wildcard(l3_mask, sizeof(*l3_mask)) &&
 	    is_wildcard(&flow->l4_mask, sizeof(flow->l4_mask))) {
 		flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2;
 	} else {

From 78f300049335ae81a5cc6b4b232481dc5e1f9d41 Mon Sep 17 00:00:00 2001
From: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Date: Thu, 4 Jan 2018 18:46:55 -0500
Subject: [PATCH 711/808] bnxt_en: Fix the 'Invalid VF' id check in
 bnxt_vf_ndo_prep routine.

In bnxt_vf_ndo_prep (which is called by bnxt_get_vf_config ndo), there is a
check for "Invalid VF id". Currently, the check is done against max_vfs.
However, the user doesn't always create max_vfs. So, the check should be
against the created number of VFs. The number of bnxt_vf_info structures
that are allocated in bnxt_alloc_vf_resources routine is the "number of
requested VFs". So, if an "invalid VF id" falls between the requested
number of VFs and the max_vfs, the driver will be dereferencing an invalid
pointer.

Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")
Signed-off-by: Venkat Devvuru <venkatkumar.duvvuru@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 5ee18660bc33..c9617675f934 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -70,7 +70,7 @@ static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id)
 		netdev_err(bp->dev, "vf ndo called though sriov is disabled\n");
 		return -EINVAL;
 	}
-	if (vf_id >= bp->pf.max_vfs) {
+	if (vf_id >= bp->pf.active_vfs) {
 		netdev_err(bp->dev, "Invalid VF id %d\n", vf_id);
 		return -EINVAL;
 	}

From b707fda2df4070785d0fa8a278aa13944c5f51f8 Mon Sep 17 00:00:00 2001
From: Eduardo Otubo <otubo@redhat.com>
Date: Fri, 5 Jan 2018 09:42:16 +0100
Subject: [PATCH 712/808] xen-netfront: enable device after manual module load

When loading the module after unloading it, the network interface would
not be enabled and thus wouldn't have a backend counterpart and unable
to be used by the guest.

The guest would face errors like:

  [root@guest ~]# ethtool -i eth0
  Cannot get driver information: No such device

  [root@guest ~]# ifconfig eth0
  eth0: error fetching interface information: Device not found

This patch initializes the state of the netfront device whenever it is
loaded manually, this state would communicate the netback to create its
device and establish the connection between them.

Signed-off-by: Eduardo Otubo <otubo@redhat.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netfront.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index c5a34671abda..9bd7ddeeb6a5 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1326,6 +1326,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 
 	netif_carrier_off(netdev);
 
+	xenbus_switch_state(dev, XenbusStateInitialising);
 	return netdev;
 
  exit:

From cc35c3d1edf7a8373a1a5daa80a912dec96a9cd5 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 5 Jan 2018 11:17:17 -0200
Subject: [PATCH 713/808] sctp: do not retransmit upon FragNeeded if PMTU
 discovery is disabled

Currently, if PMTU discovery is disabled on a given transport, but the
configured value is higher than the actual PMTU, it is likely that we
will get some icmp Frag Needed. The issue is, if PMTU discovery is
disabled, we won't update the information and will issue a
retransmission immediately, which may very well trigger another ICMP,
and another retransmission, leading to a loop.

The fix is to simply not trigger immediate retransmissions if PMTU
discovery is disabled on the given transport.

Changes from v2:
- updated stale comment, noticed by Xin Long

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 621b5ca3fd1c..9320661cc41d 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -399,20 +399,20 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 		return;
 	}
 
-	if (t->param_flags & SPP_PMTUD_ENABLE) {
-		/* Update transports view of the MTU */
-		sctp_transport_update_pmtu(t, pmtu);
+	if (!(t->param_flags & SPP_PMTUD_ENABLE))
+		/* We can't allow retransmitting in such case, as the
+		 * retransmission would be sized just as before, and thus we
+		 * would get another icmp, and retransmit again.
+		 */
+		return;
 
-		/* Update association pmtu. */
-		sctp_assoc_sync_pmtu(asoc);
-	}
+	/* Update transports view of the MTU */
+	sctp_transport_update_pmtu(t, pmtu);
 
-	/* Retransmit with the new pmtu setting.
-	 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
-	 * Needed will never be sent, but if a message was sent before
-	 * PMTU discovery was disabled that was larger than the PMTU, it
-	 * would not be fragmented, so it must be re-transmitted fragmented.
-	 */
+	/* Update association pmtu. */
+	sctp_assoc_sync_pmtu(asoc);
+
+	/* Retransmit with the new pmtu setting. */
 	sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
 }
 

From b6c5734db07079c9410147b32407f2366d584e6c Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 5 Jan 2018 11:17:18 -0200
Subject: [PATCH 714/808] sctp: fix the handling of ICMP Frag Needed for too
 small MTUs

syzbot reported a hang involving SCTP, on which it kept flooding dmesg
with the message:
[  246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too
low, using default minimum of 512

That happened because whenever SCTP hits an ICMP Frag Needed, it tries
to adjust to the new MTU and triggers an immediate retransmission. But
it didn't consider the fact that MTUs smaller than the SCTP minimum MTU
allowed (512) would not cause the PMTU to change, and issued the
retransmission anyway (thus leading to another ICMP Frag Needed, and so
on).

As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU
are higher than that, sctp_transport_update_pmtu() is changed to
re-fetch the PMTU that got set after our request, and with that, detect
if there was an actual change or not.

The fix, thus, skips the immediate retransmission if the received ICMP
resulted in no change, in the hope that SCTP will select another path.

Note: The value being used for the minimum MTU (512,
SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576,
SCTP_MIN_PMTU), but such change belongs to another patch.

Changes from v1:
- do not disable PMTU discovery, in the light of commit
06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small")
and as suggested by Xin Long.
- changed the way to break the rtx loop by detecting if the icmp
  resulted in a change or not
Changes from v2:
none

See-also: https://lkml.org/lkml/2017/12/22/811
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  2 +-
 net/sctp/input.c           |  8 ++++++--
 net/sctp/transport.c       | 29 +++++++++++++++++++----------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f8f93da5dc2..9a5ccf03a59b 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -966,7 +966,7 @@ void sctp_transport_burst_limited(struct sctp_transport *);
 void sctp_transport_burst_reset(struct sctp_transport *);
 unsigned long sctp_transport_timeout(struct sctp_transport *);
 void sctp_transport_reset(struct sctp_transport *t);
-void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
+bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
 void sctp_transport_dst_release(struct sctp_transport *t);
 void sctp_transport_dst_confirm(struct sctp_transport *t);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 9320661cc41d..141c9c466ec1 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -406,8 +406,12 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 		 */
 		return;
 
-	/* Update transports view of the MTU */
-	sctp_transport_update_pmtu(t, pmtu);
+	/* Update transports view of the MTU. Return if no update was needed.
+	 * If an update wasn't needed/possible, it also doesn't make sense to
+	 * try to retransmit now.
+	 */
+	if (!sctp_transport_update_pmtu(t, pmtu))
+		return;
 
 	/* Update association pmtu. */
 	sctp_assoc_sync_pmtu(asoc);
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 1e5a22430cf5..47f82bd794d9 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -248,28 +248,37 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
-void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
+bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 {
 	struct dst_entry *dst = sctp_transport_dst_check(t);
+	bool change = true;
 
 	if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
-		pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
-			__func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
-		/* Use default minimum segment size and disable
-		 * pmtu discovery on this transport.
-		 */
-		t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
-	} else {
-		t->pathmtu = pmtu;
+		pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n",
+				    __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
+		/* Use default minimum segment instead */
+		pmtu = SCTP_DEFAULT_MINSEGMENT;
 	}
+	pmtu = SCTP_TRUNC4(pmtu);
 
 	if (dst) {
 		dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
 		dst = sctp_transport_dst_check(t);
 	}
 
-	if (!dst)
+	if (!dst) {
 		t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
+		dst = t->dst;
+	}
+
+	if (dst) {
+		/* Re-fetch, as under layers may have a higher minimum size */
+		pmtu = SCTP_TRUNC4(dst_mtu(dst));
+		change = t->pathmtu != pmtu;
+	}
+	t->pathmtu = pmtu;
+
+	return change;
 }
 
 /* Caches the dst entry and source address for a transport's destination

From 46cd75036415d94e9cf451e6606a099945d54cc6 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Fri, 5 Jan 2018 11:23:45 -0600
Subject: [PATCH 715/808] phylink: mark expected switch fall-throughs in
 phylink_mii_ioctl

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Addresses-Coverity-ID: 1463447 ("Missing break in switch")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 150cd95a6e1e..249ce5cbea22 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1296,6 +1296,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 		switch (cmd) {
 		case SIOCGMIIPHY:
 			mii->phy_id = pl->phydev->mdio.addr;
+			/* fall through */
 
 		case SIOCGMIIREG:
 			ret = phylink_phy_read(pl, mii->phy_id, mii->reg_num);
@@ -1318,6 +1319,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd)
 		switch (cmd) {
 		case SIOCGMIIPHY:
 			mii->phy_id = 0;
+			/* fall through */
 
 		case SIOCGMIIREG:
 			ret = phylink_mii_read(pl, mii->phy_id, mii->reg_num);

From 56c0290202ab94a2f2780c449395d4ae8495fab4 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 6 Jan 2018 09:00:09 +0100
Subject: [PATCH 716/808] mdio-sun4i: Fix a memory leak

If the probing of the regulator is deferred, the memory allocated by
'mdiobus_alloc_size()' will be leaking.
It should be freed before the next call to 'sun4i_mdio_probe()' which will
reallocate it.

Fixes: 4bdcb1dd9feb ("net: Add MDIO bus driver for the Allwinner EMAC")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-sun4i.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/mdio-sun4i.c b/drivers/net/phy/mdio-sun4i.c
index 135296508a7e..6425ce04d3f9 100644
--- a/drivers/net/phy/mdio-sun4i.c
+++ b/drivers/net/phy/mdio-sun4i.c
@@ -118,8 +118,10 @@ static int sun4i_mdio_probe(struct platform_device *pdev)
 
 	data->regulator = devm_regulator_get(&pdev->dev, "phy");
 	if (IS_ERR(data->regulator)) {
-		if (PTR_ERR(data->regulator) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(data->regulator) == -EPROBE_DEFER) {
+			ret = -EPROBE_DEFER;
+			goto err_out_free_mdiobus;
+		}
 
 		dev_info(&pdev->dev, "no regulator found\n");
 		data->regulator = NULL;

From 50f3d740d376f664f6accc7e86c9afd8f1c7e1e4 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sun, 7 Jan 2018 00:26:47 +0300
Subject: [PATCH 717/808] sh_eth: fix TXALCR1 offsets

The  TXALCR1 offsets are incorrect in the register offset tables, most
probably due to copy&paste error.  Luckily, the driver never uses this
register. :-)

Fixes: 4a55530f38e4 ("net: sh_eth: modify the definitions of register")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index f21c1db91c3f..b9e2846589f8 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -147,7 +147,7 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = {
 	[FWNLCR0]	= 0x0090,
 	[FWALCR0]	= 0x0094,
 	[TXNLCR1]	= 0x00a0,
-	[TXALCR1]	= 0x00a0,
+	[TXALCR1]	= 0x00a4,
 	[RXNLCR1]	= 0x00a8,
 	[RXALCR1]	= 0x00ac,
 	[FWNLCR1]	= 0x00b0,
@@ -399,7 +399,7 @@ static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = {
 	[FWNLCR0]	= 0x0090,
 	[FWALCR0]	= 0x0094,
 	[TXNLCR1]	= 0x00a0,
-	[TXALCR1]	= 0x00a0,
+	[TXALCR1]	= 0x00a4,
 	[RXNLCR1]	= 0x00a8,
 	[RXALCR1]	= 0x00ac,
 	[FWNLCR1]	= 0x00b0,

From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sun, 7 Jan 2018 17:33:02 -0800
Subject: [PATCH 718/808] bpf: prevent out-of-bounds speculation

Under speculation, CPUs may mis-predict branches in bounds checks. Thus,
memory accesses under a bounds check may be speculated even if the
bounds check fails, providing a primitive for building a side channel.

To avoid leaking kernel data round up array-based maps and mask the index
after bounds check, so speculated load with out of bounds index will load
either valid value from the array or zero from the padded area.

Unconditionally mask index for all array types even when max_entries
are not rounded to power of 2 for root user.
When map is created by unpriv user generate a sequence of bpf insns
that includes AND operation to make sure that JITed code includes
the same 'index & index_mask' operation.

If prog_array map is created by unpriv user replace
  bpf_tail_call(ctx, map, index);
with
  if (index >= max_entries) {
    index &= map->index_mask;
    bpf_tail_call(ctx, map, index);
  }
(along with roundup to power 2) to prevent out-of-bounds speculation.
There is secondary redundant 'if (index >= max_entries)' in the interpreter
and in all JITs, but they can be optimized later if necessary.

Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array)
cannot be used by unpriv, so no changes there.

That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on
all architectures with and without JIT.

v2->v3:
Daniel noticed that attack potentially can be crafted via syscall commands
without loading the program, so add masking to those paths as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/arraymap.c | 47 +++++++++++++++++++++++++++++++++----------
 kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..1b985ca4ffbe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,6 +52,7 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
+	bool unpriv_array;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
@@ -221,6 +222,7 @@ struct bpf_prog_aux {
 struct bpf_array {
 	struct bpf_map map;
 	u32 elem_size;
+	u32 index_mask;
 	/* 'ownership' of prog_array is claimed by the first program that
 	 * is going to use this map or by the first program which FD is stored
 	 * in the map to make sure that all callers and callees have the same
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..aaa319848e7d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
+	u32 elem_size, index_mask, max_entries;
+	bool unpriv = !capable(CAP_SYS_ADMIN);
 	struct bpf_array *array;
 	u64 array_size;
-	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	elem_size = round_up(attr->value_size, 8);
 
+	max_entries = attr->max_entries;
+	index_mask = roundup_pow_of_two(max_entries) - 1;
+
+	if (unpriv)
+		/* round up array size to nearest power of 2,
+		 * since cpu will speculate within index_mask limits
+		 */
+		max_entries = index_mask + 1;
+
 	array_size = sizeof(*array);
 	if (percpu)
-		array_size += (u64) attr->max_entries * sizeof(void *);
+		array_size += (u64) max_entries * sizeof(void *);
 	else
-		array_size += (u64) attr->max_entries * elem_size;
+		array_size += (u64) max_entries * elem_size;
 
 	/* make sure there is no u32 overflow later in round_up() */
 	if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array = bpf_map_area_alloc(array_size, numa_node);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
+	array->index_mask = index_mask;
+	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
 	array->map.map_type = attr->map_type;
@@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
-	return array->value + array->elem_size * index;
+	return array->value + array->elem_size * (index & array->index_mask);
 }
 
 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
 static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_insn *insn = insn_buf;
 	u32 elem_size = round_up(map->value_size, 8);
 	const int ret = BPF_REG_0;
@@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+	if (map->unpriv_array) {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+	} else {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+	}
 
 	if (is_power_of_2(elem_size)) {
 		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
 	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
-	return this_cpu_ptr(array->pptrs[index]);
+	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
 }
 
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
 	 */
 	size = round_up(map->value_size, 8);
 	rcu_read_lock();
-	pptr = array->pptrs[index];
+	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
 		off += size;
@@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -EEXIST;
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
-		memcpy(this_cpu_ptr(array->pptrs[index]),
+		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value + array->elem_size * index,
+		memcpy(array->value +
+		       array->elem_size * (index & array->index_mask),
 		       value, map->value_size);
 	return 0;
 }
@@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 	 */
 	size = round_up(map->value_size, 8);
 	rcu_read_lock();
-	pptr = array->pptrs[index];
+	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
 		off += size;
@@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
 static u32 array_of_map_gen_lookup(struct bpf_map *map,
 				   struct bpf_insn *insn_buf)
 {
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 elem_size = round_up(map->value_size, 8);
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
@@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+	if (map->unpriv_array) {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+	} else {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+	}
 	if (is_power_of_2(elem_size))
 		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
 	else
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04b24876cd23..b414d6b2d470 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
+	if (func_id == BPF_FUNC_tail_call) {
+		if (meta.map_ptr == NULL) {
+			verbose(env, "verifier bug\n");
+			return -EINVAL;
+		}
+		env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+	}
 	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
@@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 */
 			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+			/* instead of changing every JIT dealing with tail_call
+			 * emit two extra insns:
+			 * if (index >= max_entries) goto out;
+			 * index &= array->index_mask;
+			 * to avoid out-of-bounds cpu speculation
+			 */
+			map_ptr = env->insn_aux_data[i + delta].map_ptr;
+			if (map_ptr == BPF_MAP_PTR_POISON) {
+				verbose(env, "tail_call obusing map_ptr\n");
+				return -EINVAL;
+			}
+			if (!map_ptr->unpriv_array)
+				continue;
+			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+						  map_ptr->max_entries, 2);
+			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+						    container_of(map_ptr,
+								 struct bpf_array,
+								 map)->index_mask);
+			insn_buf[2] = *insn;
+			cnt = 3;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
 			continue;
 		}
 

From e4d0e84e490790798691aaa0f2e598637f1867ec Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 8 Jan 2018 16:09:21 -0600
Subject: [PATCH 719/808] x86/cpu/AMD: Make LFENCE a serializing instruction

To aid in speculation control, make LFENCE a serializing instruction
since it has less overhead than MFENCE.  This is done by setting bit 1
of MSR 0xc0011029 (DE_CFG).  Some families that support LFENCE do not
have this MSR.  For these families, the LFENCE instruction is already
serializing.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/include/asm/msr-index.h |  2 ++
 arch/x86/kernel/cpu/amd.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ab022618a50a..1e7d710fef43 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -352,6 +352,8 @@
 #define FAM10H_MMIO_CONF_BASE_MASK	0xfffffffULL
 #define FAM10H_MMIO_CONF_BASE_SHIFT	20
 #define MSR_FAM10H_NODE_ID		0xc001100c
+#define MSR_F10H_DECFG			0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT	1
 
 /* K8 MSRs */
 #define MSR_K8_TOP_MEM1			0xc001001a
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bcb75dc97d44..5b438d81beb2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -829,6 +829,16 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has(c, X86_FEATURE_XMM2)) {
+		/*
+		 * A serializing LFENCE has less overhead than MFENCE, so
+		 * use it for execution serialization.  On families which
+		 * don't have that MSR, LFENCE is already serializing.
+		 * msr_set_bit() uses the safe accessors, too, even if the MSR
+		 * is not present.
+		 */
+		msr_set_bit(MSR_F10H_DECFG,
+			    MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
 		/* MFENCE stops RDTSC speculation */
 		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
 	}

From 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 8 Jan 2018 16:09:32 -0600
Subject: [PATCH 720/808] x86/cpu/AMD: Use LFENCE_RDTSC in preference to
 MFENCE_RDTSC

With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference
to MFENCE_RDTSC.  However, since the kernel could be running under a
hypervisor that does not support writing that MSR, read the MSR back and
verify that the bit has been set successfully.  If the MSR can be read
and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the
MFENCE_RDTSC feature.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kernel/cpu/amd.c        | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1e7d710fef43..fa11fb1fa570 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -354,6 +354,7 @@
 #define MSR_FAM10H_NODE_ID		0xc001100c
 #define MSR_F10H_DECFG			0xc0011029
 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT	1
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE		BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
 
 /* K8 MSRs */
 #define MSR_K8_TOP_MEM1			0xc001001a
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5b438d81beb2..ea831c858195 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -829,6 +829,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has(c, X86_FEATURE_XMM2)) {
+		unsigned long long val;
+		int ret;
+
 		/*
 		 * A serializing LFENCE has less overhead than MFENCE, so
 		 * use it for execution serialization.  On families which
@@ -839,8 +842,19 @@ static void init_amd(struct cpuinfo_x86 *c)
 		msr_set_bit(MSR_F10H_DECFG,
 			    MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
 
-		/* MFENCE stops RDTSC speculation */
-		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+		/*
+		 * Verify that the MSR write was successful (could be running
+		 * under a hypervisor) and only then assume that LFENCE is
+		 * serializing.
+		 */
+		ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
+		if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+			/* A serializing LFENCE stops RDTSC speculation */
+			set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+		} else {
+			/* MFENCE stops RDTSC speculation */
+			set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+		}
 	}
 
 	/*

From 1b5c7ef3d0d0610bda9b63263f7c5b7178d11015 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@gmail.com>
Date: Sat, 6 Jan 2018 10:59:41 -0500
Subject: [PATCH 721/808] drm/nouveau/disp/gf119: add missing drive vfunc ptr

Fixes broken dp on GF119:

  Call Trace:
   ? nvkm_dp_train_drive+0x183/0x2c0 [nouveau]
   nvkm_dp_acquire+0x4f3/0xcd0 [nouveau]
   nv50_disp_super_2_2+0x5d/0x470 [nouveau]
   ? nvkm_devinit_pll_set+0xf/0x20 [nouveau]
   gf119_disp_super+0x19c/0x2f0 [nouveau]
   process_one_work+0x193/0x3c0
   worker_thread+0x35/0x3b0
   kthread+0x125/0x140
   ? process_one_work+0x3c0/0x3c0
   ? kthread_park+0x60/0x60
   ret_from_fork+0x25/0x30
  Code:  Bad RIP value.
  RIP:           (null) RSP: ffffb1e243e4bc38
  CR2: 0000000000000000

Fixes: af85389c614a drm/nouveau/disp: shuffle functions around
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103421
Signed-off-by: Rob Clark <robdclark@gmail.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c
index a2978a37b4f3..700fc754f28a 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c
@@ -174,6 +174,7 @@ gf119_sor = {
 		.links = gf119_sor_dp_links,
 		.power = g94_sor_dp_power,
 		.pattern = gf119_sor_dp_pattern,
+		.drive = gf119_sor_dp_drive,
 		.vcpi = gf119_sor_dp_vcpi,
 		.audio = gf119_sor_dp_audio,
 		.audio_sym = gf119_sor_dp_audio_sym,

From aa1f10e85b0ab53dee85d8e293c8159d18d293a8 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 29 Dec 2017 00:22:54 +0100
Subject: [PATCH 722/808] mux: core: fix double get_device()

class_find_device already does a get_device on the returned device.
So the device returned by of_find_mux_chip_by_node is already referenced
and we should not reference it again (and unref it on error).

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Peter Rosin <peda@axentia.se>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mux/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/mux/core.c b/drivers/mux/core.c
index 2260063b0ea8..6e5cf9d9cd99 100644
--- a/drivers/mux/core.c
+++ b/drivers/mux/core.c
@@ -413,6 +413,7 @@ static int of_dev_node_match(struct device *dev, const void *data)
 	return dev->of_node == data;
 }
 
+/* Note this function returns a reference to the mux_chip dev. */
 static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np)
 {
 	struct device *dev;
@@ -466,6 +467,7 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name)
 	    (!args.args_count && (mux_chip->controllers > 1))) {
 		dev_err(dev, "%pOF: wrong #mux-control-cells for %pOF\n",
 			np, args.np);
+		put_device(&mux_chip->dev);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -476,10 +478,10 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name)
 	if (controller >= mux_chip->controllers) {
 		dev_err(dev, "%pOF: bad mux controller %u specified in %pOF\n",
 			np, controller, args.np);
+		put_device(&mux_chip->dev);
 		return ERR_PTR(-EINVAL);
 	}
 
-	get_device(&mux_chip->dev);
 	return &mux_chip->mux[controller];
 }
 EXPORT_SYMBOL_GPL(mux_control_get);

From 443064cb0b1fb4569fe0a71209da7625129fb760 Mon Sep 17 00:00:00 2001
From: Viktor Slavkovic <viktors@google.com>
Date: Mon, 8 Jan 2018 10:43:03 -0800
Subject: [PATCH 723/808] staging: android: ashmem: fix a race condition in
 ASHMEM_SET_SIZE ioctl

A lock-unlock is missing in ASHMEM_SET_SIZE ioctl which can result in a
race condition when mmap is called. After the !asma->file check, before
setting asma->size, asma->file can be set in mmap. That would result in
having different asma->size than the mapped memory size. Combined with
ASHMEM_UNPIN ioctl and shrinker invocation, this can result in memory
corruption.

Signed-off-by: Viktor Slavkovic <viktors@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/android/ashmem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 0f695df14c9d..372ce9913e6d 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -765,10 +765,12 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		break;
 	case ASHMEM_SET_SIZE:
 		ret = -EINVAL;
+		mutex_lock(&ashmem_mutex);
 		if (!asma->file) {
 			ret = 0;
 			asma->size = (size_t)arg;
 		}
+		mutex_unlock(&ashmem_mutex);
 		break;
 	case ASHMEM_GET_SIZE:
 		ret = asma->size;

From 98648ae6ef6bdcdcb88c46cad963906ab452e96d Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Tue, 9 Jan 2018 15:33:42 +0100
Subject: [PATCH 724/808] drm/vmwgfx: Don't cache framebuffer maps

Buffer objects need to be either pinned or reserved while a map is active,
that's not the case here, so avoid caching the framebuffer map.
This will cause increasing mapping activity mainly when we don't do
page flipping.

This fixes occasional garbage filled screens when the framebuffer has been
evicted after the map.

Since in-kernel mapping of whole buffer objects is error-prone on 32-bit
architectures and also quite inefficient, we will revisit this later.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
Cc: <stable@vger.kernel.org>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.c  |  6 ----
 drivers/gpu/drm/vmwgfx/vmwgfx_kms.h  |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 41 ++++++++--------------------
 3 files changed, 13 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index 0545740b3724..641294aef165 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -697,7 +697,6 @@ vmw_du_plane_duplicate_state(struct drm_plane *plane)
 	vps->pinned = 0;
 
 	/* Mapping is managed by prepare_fb/cleanup_fb */
-	memset(&vps->guest_map, 0, sizeof(vps->guest_map));
 	memset(&vps->host_map, 0, sizeof(vps->host_map));
 	vps->cpp = 0;
 
@@ -760,11 +759,6 @@ vmw_du_plane_destroy_state(struct drm_plane *plane,
 
 
 	/* Should have been freed by cleanup_fb */
-	if (vps->guest_map.virtual) {
-		DRM_ERROR("Guest mapping not freed\n");
-		ttm_bo_kunmap(&vps->guest_map);
-	}
-
 	if (vps->host_map.virtual) {
 		DRM_ERROR("Host mapping not freed\n");
 		ttm_bo_kunmap(&vps->host_map);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h
index ff9c8389ff21..cd9da2dd79af 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h
@@ -175,7 +175,7 @@ struct vmw_plane_state {
 	int pinned;
 
 	/* For CPU Blit */
-	struct ttm_bo_kmap_obj host_map, guest_map;
+	struct ttm_bo_kmap_obj host_map;
 	unsigned int cpp;
 };
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c
index 90b5437fd787..b68d74888ab1 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c
@@ -114,7 +114,7 @@ struct vmw_screen_target_display_unit {
 	bool defined;
 
 	/* For CPU Blit */
-	struct ttm_bo_kmap_obj host_map, guest_map;
+	struct ttm_bo_kmap_obj host_map;
 	unsigned int cpp;
 };
 
@@ -695,7 +695,8 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty)
 	s32 src_pitch, dst_pitch;
 	u8 *src, *dst;
 	bool not_used;
-
+	struct ttm_bo_kmap_obj guest_map;
+	int ret;
 
 	if (!dirty->num_hits)
 		return;
@@ -706,6 +707,13 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty)
 	if (width == 0 || height == 0)
 		return;
 
+	ret = ttm_bo_kmap(&ddirty->buf->base, 0, ddirty->buf->base.num_pages,
+			  &guest_map);
+	if (ret) {
+		DRM_ERROR("Failed mapping framebuffer for blit: %d\n",
+			  ret);
+		goto out_cleanup;
+	}
 
 	/* Assume we are blitting from Host (display_srf) to Guest (dmabuf) */
 	src_pitch = stdu->display_srf->base_size.width * stdu->cpp;
@@ -713,7 +721,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty)
 	src += ddirty->top * src_pitch + ddirty->left * stdu->cpp;
 
 	dst_pitch = ddirty->pitch;
-	dst = ttm_kmap_obj_virtual(&stdu->guest_map, &not_used);
+	dst = ttm_kmap_obj_virtual(&guest_map, &not_used);
 	dst += ddirty->fb_top * dst_pitch + ddirty->fb_left * stdu->cpp;
 
 
@@ -772,6 +780,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty)
 		vmw_fifo_commit(dev_priv, sizeof(*cmd));
 	}
 
+	ttm_bo_kunmap(&guest_map);
 out_cleanup:
 	ddirty->left = ddirty->top = ddirty->fb_left = ddirty->fb_top = S32_MAX;
 	ddirty->right = ddirty->bottom = S32_MIN;
@@ -1109,9 +1118,6 @@ vmw_stdu_primary_plane_cleanup_fb(struct drm_plane *plane,
 {
 	struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state);
 
-	if (vps->guest_map.virtual)
-		ttm_bo_kunmap(&vps->guest_map);
-
 	if (vps->host_map.virtual)
 		ttm_bo_kunmap(&vps->host_map);
 
@@ -1277,33 +1283,11 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane,
 	 */
 	if (vps->content_fb_type == SEPARATE_DMA &&
 	    !(dev_priv->capabilities & SVGA_CAP_3D)) {
-
-		struct vmw_framebuffer_dmabuf *new_vfbd;
-
-		new_vfbd = vmw_framebuffer_to_vfbd(new_fb);
-
-		ret = ttm_bo_reserve(&new_vfbd->buffer->base, false, false,
-				     NULL);
-		if (ret)
-			goto out_srf_unpin;
-
-		ret = ttm_bo_kmap(&new_vfbd->buffer->base, 0,
-				  new_vfbd->buffer->base.num_pages,
-				  &vps->guest_map);
-
-		ttm_bo_unreserve(&new_vfbd->buffer->base);
-
-		if (ret) {
-			DRM_ERROR("Failed to map content buffer to CPU\n");
-			goto out_srf_unpin;
-		}
-
 		ret = ttm_bo_kmap(&vps->surf->res.backup->base, 0,
 				  vps->surf->res.backup->base.num_pages,
 				  &vps->host_map);
 		if (ret) {
 			DRM_ERROR("Failed to map display buffer to CPU\n");
-			ttm_bo_kunmap(&vps->guest_map);
 			goto out_srf_unpin;
 		}
 
@@ -1350,7 +1334,6 @@ vmw_stdu_primary_plane_atomic_update(struct drm_plane *plane,
 	stdu->display_srf = vps->surf;
 	stdu->content_fb_type = vps->content_fb_type;
 	stdu->cpp = vps->cpp;
-	memcpy(&stdu->guest_map, &vps->guest_map, sizeof(vps->guest_map));
 	memcpy(&stdu->host_map, &vps->host_map, sizeof(vps->host_map));
 
 	if (!stdu->defined)

From 191eccb1580939fb0d47deb405b82a85b0379070 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Tue, 9 Jan 2018 03:52:05 +1100
Subject: [PATCH 725/808] powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags
 & wrapper

A new hypervisor call has been defined to communicate various
characteristics of the CPU to guests. Add definitions for the hcall
number, flags and a wrapper function.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/hvcall.h         | 17 +++++++++++++++++
 arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index a409177be8bd..f0461618bf7b 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -241,6 +241,7 @@
 #define H_GET_HCA_INFO          0x1B8
 #define H_GET_PERF_COUNT        0x1BC
 #define H_MANAGE_TRACE          0x1C0
+#define H_GET_CPU_CHARACTERISTICS 0x1C8
 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4
 #define H_QUERY_INT_STATE       0x1E4
 #define H_POLL_PENDING		0x1D8
@@ -330,6 +331,17 @@
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 /* >= 0 values are CPU number */
 
+/* H_GET_CPU_CHARACTERISTICS return values */
+#define H_CPU_CHAR_SPEC_BAR_ORI31	(1ull << 63) // IBM bit 0
+#define H_CPU_CHAR_BCCTRL_SERIALISED	(1ull << 62) // IBM bit 1
+#define H_CPU_CHAR_L1D_FLUSH_ORI30	(1ull << 61) // IBM bit 2
+#define H_CPU_CHAR_L1D_FLUSH_TRIG2	(1ull << 60) // IBM bit 3
+#define H_CPU_CHAR_L1D_THREAD_PRIV	(1ull << 59) // IBM bit 4
+
+#define H_CPU_BEHAV_FAVOUR_SECURITY	(1ull << 63) // IBM bit 0
+#define H_CPU_BEHAV_L1D_FLUSH_PR	(1ull << 62) // IBM bit 1
+#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ull << 61) // IBM bit 2
+
 /* Flag values used in H_REGISTER_PROC_TBL hcall */
 #define PROC_TABLE_OP_MASK	0x18
 #define PROC_TABLE_DEREG	0x10
@@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc)
 	}
 }
 
+struct h_cpu_char_result {
+	u64 character;
+	u64 behaviour;
+};
+
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 7f01b22fa6cb..55eddf50d149 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu)
 	return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
 }
 
+static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf);
+	if (rc == H_SUCCESS) {
+		p->character = retbuf[0];
+		p->behaviour = retbuf[1];
+	}
+
+	return rc;
+}
+
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */

From 46eb14a6e1585d99c1b9f58d0e7389082a5f466b Mon Sep 17 00:00:00 2001
From: Pete Zaitcev <zaitcev@redhat.com>
Date: Mon, 8 Jan 2018 15:46:41 -0600
Subject: [PATCH 726/808] USB: fix usbmon BUG trigger

Automated tests triggered this by opening usbmon and accessing the
mmap while simultaneously resizing the buffers. This bug was with
us since 2006, because typically applications only size the buffers
once and thus avoid racing. Reported by Kirill A. Shutemov.

Reported-by: <syzbot+f9831b881b3e849829fc@syzkaller.appspotmail.com>
Signed-off-by: Pete Zaitcev <zaitcev@redhat.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/mon/mon_bin.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index f6ae753ab99b..f932f40302df 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -1004,7 +1004,9 @@ static long mon_bin_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		break;
 
 	case MON_IOCQ_RING_SIZE:
+		mutex_lock(&rp->fetch_lock);
 		ret = rp->b_size;
+		mutex_unlock(&rp->fetch_lock);
 		break;
 
 	case MON_IOCT_RING_SIZE:
@@ -1231,12 +1233,16 @@ static int mon_bin_vma_fault(struct vm_fault *vmf)
 	unsigned long offset, chunk_idx;
 	struct page *pageptr;
 
+	mutex_lock(&rp->fetch_lock);
 	offset = vmf->pgoff << PAGE_SHIFT;
-	if (offset >= rp->b_size)
+	if (offset >= rp->b_size) {
+		mutex_unlock(&rp->fetch_lock);
 		return VM_FAULT_SIGBUS;
+	}
 	chunk_idx = offset / CHUNK_SIZE;
 	pageptr = rp->b_vec[chunk_idx].pg;
 	get_page(pageptr);
+	mutex_unlock(&rp->fetch_lock);
 	vmf->page = pageptr;
 	return 0;
 }

From 7ae2c3c280db183ca9ada2675c34ec2f7378abfa Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 3 Jan 2018 12:51:51 -0500
Subject: [PATCH 727/808] USB: UDC core: fix double-free in
 usb_add_gadget_udc_release

The error-handling pathways in usb_add_gadget_udc_release() are messed
up.  Aside from the uninformative statement labels, they can deallocate
the udc structure after calling put_device(), which is a double-free.
This was observed by KASAN in automatic testing.

This patch cleans up the routine.  It preserves the requirement that
when any failure occurs, we call put_device(&gadget->dev).

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
CC: <stable@vger.kernel.org>
Reviewed-by: Peter Chen <peter.chen@nxp.com>
Acked-by: Felipe Balbi <felipe.balbi@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/udc/core.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 93eff7dec2f5..1b3efb14aec7 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1147,11 +1147,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
 
 	udc = kzalloc(sizeof(*udc), GFP_KERNEL);
 	if (!udc)
-		goto err1;
-
-	ret = device_add(&gadget->dev);
-	if (ret)
-		goto err2;
+		goto err_put_gadget;
 
 	device_initialize(&udc->dev);
 	udc->dev.release = usb_udc_release;
@@ -1160,7 +1156,11 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
 	udc->dev.parent = parent;
 	ret = dev_set_name(&udc->dev, "%s", kobject_name(&parent->kobj));
 	if (ret)
-		goto err3;
+		goto err_put_udc;
+
+	ret = device_add(&gadget->dev);
+	if (ret)
+		goto err_put_udc;
 
 	udc->gadget = gadget;
 	gadget->udc = udc;
@@ -1170,7 +1170,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
 
 	ret = device_add(&udc->dev);
 	if (ret)
-		goto err4;
+		goto err_unlist_udc;
 
 	usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED);
 	udc->vbus = true;
@@ -1178,27 +1178,25 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
 	/* pick up one of pending gadget drivers */
 	ret = check_pending_gadget_drivers(udc);
 	if (ret)
-		goto err5;
+		goto err_del_udc;
 
 	mutex_unlock(&udc_lock);
 
 	return 0;
 
-err5:
+ err_del_udc:
 	device_del(&udc->dev);
 
-err4:
+ err_unlist_udc:
 	list_del(&udc->list);
 	mutex_unlock(&udc_lock);
 
-err3:
-	put_device(&udc->dev);
 	device_del(&gadget->dev);
 
-err2:
-	kfree(udc);
+ err_put_udc:
+	put_device(&udc->dev);
 
-err1:
+ err_put_gadget:
 	put_device(&gadget->dev);
 	return ret;
 }

From 9ecccfaa7cb5249bd31bdceb93fcf5bedb8a24d8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 9 Jan 2018 15:02:51 +0000
Subject: [PATCH 728/808] sysfs/cpu: Fix typos in vulnerability documentation

Fixes: 87590ce6e ("sysfs/cpu: Add vulnerability folder")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bd3a88e16d8b..258902db14bf 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -378,7 +378,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/meltdown
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
-Date:		Januar 2018
+Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
 
@@ -388,4 +388,4 @@ Description:	Information about CPU vulnerabilities
 
 		"Not affected"	  CPU is not affected by the vulnerability
 		"Vulnerable"	  CPU is affected and no mitigation in effect
-		"Mitigation: $M"  CPU is affetcted and mitigation $M is in effect
+		"Mitigation: $M"  CPU is affected and mitigation $M is in effect

From 50e51c13b3822d14ff6df4279423e4b7b2269bc3 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 729/808] powerpc/64: Add macros for annotating the destination
 of rfid/hrfid

The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is
used for switching from the kernel to userspace, and from the
hypervisor to the guest kernel. However it can and is also used for
other transitions, eg. from real mode kernel code to virtual mode
kernel code, and it's not always clear from the code what the
destination context is.

To make it clearer when reading the code, add macros which encode the
expected destination context.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64e.h |  6 +++++
 arch/powerpc/include/asm/exception-64s.h | 29 ++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index a703452d67b6..555e22d5e07f 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -209,5 +209,11 @@ exc_##label##_book3e:
 	ori	r3,r3,vector_offset@l;		\
 	mtspr	SPRN_IVOR##vector_number,r3;
 
+#define RFI_TO_KERNEL							\
+	rfi
+
+#define RFI_TO_USER							\
+	rfi
+
 #endif /* _ASM_POWERPC_EXCEPTION_64E_H */
 
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index b27205297e1d..1af427a3c74f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -74,6 +74,35 @@
  */
 #define EX_R3		EX_DAR
 
+/* Macros for annotating the expected destination of (h)rfid */
+
+#define RFI_TO_KERNEL							\
+	rfid
+
+#define RFI_TO_USER							\
+	rfid
+
+#define RFI_TO_USER_OR_KERNEL						\
+	rfid
+
+#define RFI_TO_GUEST							\
+	rfid
+
+#define HRFI_TO_KERNEL							\
+	hrfid
+
+#define HRFI_TO_USER							\
+	hrfid
+
+#define HRFI_TO_USER_OR_KERNEL						\
+	hrfid
+
+#define HRFI_TO_GUEST							\
+	hrfid
+
+#define HRFI_TO_UNKNOWN							\
+	hrfid
+
 #ifdef CONFIG_RELOCATABLE
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\
 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\

From 222f20f140623ef6033491d0103ee0875fe87d35 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 730/808] powerpc/64s: Simple RFI macro conversions

This commit does simple conversions of rfi/rfid to the new macros that
include the expected destination context. By simple we mean cases
where there is a single well known destination context, and it's
simply a matter of substituting the instruction for the appropriate
macro.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h |  4 ++--
 arch/powerpc/kernel/entry_64.S           | 14 +++++++++-----
 arch/powerpc/kernel/exceptions-64s.S     | 24 ++++++++++++------------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  9 ++++-----
 arch/powerpc/kvm/book3s_rmhandlers.S     |  7 +++++--
 arch/powerpc/kvm/book3s_segment.S        |  4 ++--
 6 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 1af427a3c74f..dfc56daed98b 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -247,7 +247,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
+	h##RFI_TO_KERNEL;						\
 	b	.	/* prevent speculative execution */
 #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
@@ -261,7 +261,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
+	h##RFI_TO_KERNEL;						\
 	b	.	/* prevent speculative execution */
 
 #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 3320bcac7192..e68faa4d1b13 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -37,6 +37,11 @@
 #include <asm/tm.h>
 #include <asm/ppc-opcode.h>
 #include <asm/export.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/exception-64s.h>
+#else
+#include <asm/exception-64e.h>
+#endif
 
 /*
  * System calls.
@@ -397,8 +402,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mtmsrd	r10, 1
 	mtspr	SPRN_SRR0, r11
 	mtspr	SPRN_SRR1, r12
-
-	rfid
+	RFI_TO_USER
 	b	.	/* prevent speculative execution */
 #endif
 _ASM_NOKPROBE_SYMBOL(system_call_common);
@@ -1073,7 +1077,7 @@ __enter_rtas:
 	
 	mtspr	SPRN_SRR0,r5
 	mtspr	SPRN_SRR1,r6
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 
 rtas_return_loc:
@@ -1098,7 +1102,7 @@ rtas_return_loc:
 
 	mtspr	SPRN_SRR0,r3
 	mtspr	SPRN_SRR1,r4
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 _ASM_NOKPROBE_SYMBOL(__enter_rtas)
 _ASM_NOKPROBE_SYMBOL(rtas_return_loc)
@@ -1171,7 +1175,7 @@ _GLOBAL(enter_prom)
 	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
 	andc	r11,r11,r12
 	mtsrr1	r11
-	rfid
+	RFI_TO_KERNEL
 #endif /* CONFIG_PPC_BOOK3E */
 
 1:	/* Return from OF */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e441b469dc8f..5502b0147c4e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -256,7 +256,7 @@ BEGIN_FTR_SECTION
 	LOAD_HANDLER(r12, machine_check_handle_early)
 1:	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r11
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 2:
 	/* Stack overflow. Stay on emergency stack and panic.
@@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	li	r3,MSR_ME
 	andc	r10,r10,r3		/* Turn off MSR_ME */
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 2:
 	/*
@@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 */
 	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
-	rfid
+	RFI_TO_USER_OR_KERNEL
 9:
 	/* Deliver the machine check to host kernel in V mode. */
 	MACHINE_CHECK_HANDLER_WINDUP
@@ -651,7 +651,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	mtspr	SPRN_SRR0,r10
 	ld	r10,PACAKMSR(r13)
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 
 8:	std     r3,PACA_EXSLB+EX_DAR(r13)
@@ -662,7 +662,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	mtspr	SPRN_SRR0,r10
 	ld	r10,PACAKMSR(r13)
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 
 EXC_COMMON_BEGIN(unrecov_slb)
@@ -901,7 +901,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 	mtspr	SPRN_SRR0,r10 ; 				\
 	ld	r10,PACAKMSR(r13) ;				\
 	mtspr	SPRN_SRR1,r10 ; 				\
-	rfid ; 							\
+	RFI_TO_KERNEL ;						\
 	b	. ;	/* prevent speculative execution */
 
 #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
@@ -917,7 +917,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	xori	r12,r12,MSR_LE ;				\
 	mtspr	SPRN_SRR1,r12 ;					\
 	mr	r13,r9 ;					\
-	rfid ;		/* return to userspace */		\
+	RFI_TO_USER ;	/* return to userspace */		\
 	b	. ;	/* prevent speculative execution */
 #else
 #define SYSCALL_FASTENDIAN_TEST
@@ -1063,7 +1063,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 	mtcr	r11
 	REST_GPR(11, r1)
 	ld	r1,GPR1(r1)
-	hrfid
+	HRFI_TO_USER_OR_KERNEL
 
 1:	mtcr	r11
 	REST_GPR(11, r1)
@@ -1314,7 +1314,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	ld	r11,PACA_EXGEN+EX_R11(r13)
 	ld	r12,PACA_EXGEN+EX_R12(r13)
 	ld	r13,PACA_EXGEN+EX_R13(r13)
-	HRFID
+	HRFI_TO_UNKNOWN
 	b	.
 #endif
 
@@ -1418,7 +1418,7 @@ masked_##_H##interrupt:					\
 	ld	r10,PACA_EXGEN+EX_R10(r13);		\
 	ld	r11,PACA_EXGEN+EX_R11(r13);		\
 	/* returns to kernel where r13 must be set up, so don't restore it */ \
-	##_H##rfid;					\
+	##_H##RFI_TO_KERNEL;				\
 	b	.;					\
 	MASKED_DEC_HANDLER(_H)
 
@@ -1441,7 +1441,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
 	addi	r13, r13, 4
 	mtspr	SPRN_SRR0, r13
 	GET_SCRATCH0(r13)
-	rfid
+	RFI_TO_KERNEL
 	b	.
 
 TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
@@ -1453,7 +1453,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
 	addi	r13, r13, 4
 	mtspr	SPRN_HSRR0, r13
 	GET_SCRATCH0(r13)
-	hrfid
+	HRFI_TO_KERNEL
 	b	.
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2659844784b8..9c61f736c75b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
 	mtmsrd	r0,1		/* clear RI in MSR */
 	mtsrr0	r5
 	mtsrr1	r6
-	RFI
+	RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
 BEGIN_FTR_SECTION
@@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtmsrd	r6, 1			/* Clear RI in MSR */
 	mtsrr0	r8
 	mtsrr1	r7
-	RFI
+	RFI_TO_KERNEL
 
 	/* Virtual-mode return */
 .Lvirt_return:
@@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 	ld	r0, VCPU_GPR(R0)(r4)
 	ld	r4, VCPU_GPR(R4)(r4)
-
-	hrfid
+	HRFI_TO_GUEST
 	b	.
 
 secondary_too_late:
@@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r4, PACAKMSR(r13)
 	mtspr	SPRN_SRR0, r3
 	mtspr	SPRN_SRR1, r4
-	rfid
+	RFI_TO_KERNEL
 9:	addi	r3, r1, STACK_FRAME_OVERHEAD
 	bl	kvmppc_bad_interrupt
 	b	9b
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 42a4b237df5f..34a5adeff084 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -46,6 +46,9 @@
 
 #define FUNC(name)		name
 
+#define RFI_TO_KERNEL	RFI
+#define RFI_TO_GUEST	RFI
+
 .macro INTERRUPT_TRAMPOLINE intno
 
 .global kvmppc_trampoline_\intno
@@ -141,7 +144,7 @@ kvmppc_handler_skip_ins:
 	GET_SCRATCH0(r13)
 
 	/* And get back into the code */
-	RFI
+	RFI_TO_KERNEL
 #endif
 
 /*
@@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline)
 	ori	r5, r5, MSR_EE
 	mtsrr0	r7
 	mtsrr1	r6
-	RFI
+	RFI_TO_KERNEL
 
 #include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 2a2b96d53999..93a180ceefad 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -156,7 +156,7 @@ no_dcbz32_on:
 	PPC_LL	r9, SVCPU_R9(r3)
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
-	RFI
+	RFI_TO_GUEST
 kvmppc_handler_trampoline_enter_end:
 
 
@@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
 	beqa	BOOK3S_INTERRUPT_DOORBELL
 
-	RFI
+	RFI_TO_KERNEL
 kvmppc_handler_trampoline_exit_end:

From b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 731/808] powerpc/64: Convert the syscall exit path to use
 RFI_TO_USER/KERNEL

In the syscall exit path we may be returning to user or kernel
context. We already have a test for that, because we conditionally
restore r13. So use that existing test and branch, and bifurcate the
return based on that.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index e68faa4d1b13..724733b74744 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -267,13 +267,23 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
+	ld	r2,GPR2(r1)
+	ld	r1,GPR1(r1)
+	mtlr	r4
+	mtcr	r5
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r8
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+
+	/* exit to kernel */
 1:	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)
 	mtlr	r4
 	mtcr	r5
 	mtspr	SPRN_SRR0,r7
 	mtspr	SPRN_SRR1,r8
-	RFI
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 
 .Lsyscall_error:

From a08f828cf47e6c605af21d2cdec68f84e799c318 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 732/808] powerpc/64: Convert fast_exception_return to use
 RFI_TO_USER/KERNEL

Similar to the syscall return path, in fast_exception_return we may be
returning to user or kernel context. We already have a test for that,
because we conditionally restore r13. So use that existing test and
branch, and bifurcate the return based on that.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 724733b74744..2748584b767d 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -892,7 +892,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
 	REST_GPR(13, r1)
-1:
+
 	mtspr	SPRN_SRR1,r3
 
 	ld	r2,_CCR(r1)
@@ -905,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ld	r3,GPR3(r1)
 	ld	r4,GPR4(r1)
 	ld	r1,GPR1(r1)
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
 
-	rfid
+1:	mtspr	SPRN_SRR1,r3
+
+	ld	r2,_CCR(r1)
+	mtcrf	0xFF,r2
+	ld	r2,_NIP(r1)
+	mtspr	SPRN_SRR0,r2
+
+	ld	r0,GPR0(r1)
+	ld	r2,GPR2(r1)
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r1,GPR1(r1)
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 
 #endif /* CONFIG_PPC_BOOK3E */

From c7305645eb0c1621351cfc104038831ae87c0053 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 733/808] powerpc/64s: Convert slb_miss_common to use
 RFI_TO_USER/KERNEL

In the SLB miss handler we may be returning to user or kernel. We need
to add a check early on and save the result in the cr4 register, and
then we bifurcate the return path based on that.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 29 +++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5502b0147c4e..ed356194f09c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common)
 	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
 	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
 
+	andi.	r9,r11,MSR_PR	// Check for exception from userspace
+	cmpdi	cr4,r9,MSR_PR	// And save the result in CR4 for later
+
 	/*
 	 * Test MSR_RI before calling slb_allocate_realmode, because the
 	 * MSR in r11 gets clobbered. However we still want to allocate
@@ -624,9 +627,12 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 
 	/* All done -- return from exception. */
 
+	bne	cr4,1f		/* returning to kernel */
+
 .machine	push
 .machine	"power4"
 	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
 	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
 	mtcrf	0x02,r9		/* I/D indication is in cr6 */
 	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
@@ -640,8 +646,29 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	ld	r11,PACA_EXSLB+EX_R11(r13)
 	ld	r12,PACA_EXSLB+EX_R12(r13)
 	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
+	RFI_TO_USER
 	b	.	/* prevent speculative execution */
+1:
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
+	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
+	mtcrf	0x02,r9		/* I/D indication is in cr6 */
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+	RESTORE_CTR(r9, PACA_EXSLB)
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	mr	r3,r12
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+
 
 2:	std     r3,PACA_EXSLB+EX_DAR(r13)
 	mr	r3,r12

From 928afc85270753657b5543e052cc270c279a3fe9 Mon Sep 17 00:00:00 2001
From: Icenowy Zheng <icenowy@aosc.io>
Date: Sat, 6 Jan 2018 00:56:44 +0800
Subject: [PATCH 734/808] uas: ignore UAS for Norelsys NS1068(X) chips

The UAS mode of Norelsys NS1068(X) is reported to fail to work on
several platforms with the following error message:

xhci-hcd xhci-hcd.0.auto: ERROR Transfer event for unknown stream ring slot 1 ep 8
xhci-hcd xhci-hcd.0.auto: @00000000bf04a400 00000000 00000000 1b000000 01098001

And when trying to mount a partition on the disk the disk will
disconnect from the USB controller, then after re-connecting the device
will be offlined and not working at all.

Falling back to USB mass storage can solve this problem, so ignore UAS
function of this chip.

Cc: stable@vger.kernel.org
Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/storage/unusual_uas.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h
index e6127fb21c12..a7d08ae0adad 100644
--- a/drivers/usb/storage/unusual_uas.h
+++ b/drivers/usb/storage/unusual_uas.h
@@ -143,6 +143,13 @@ UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_ATA_1X),
 
+/* Reported-by: Icenowy Zheng <icenowy@aosc.io> */
+UNUSUAL_DEV(0x2537, 0x1068, 0x0000, 0x9999,
+		"Norelsys",
+		"NS1068X",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_IGNORE_UAS),
+
 /* Reported-by: Takeo Nakayama <javhera@gmx.com> */
 UNUSUAL_DEV(0x357d, 0x7788, 0x0000, 0x9999,
 		"JMicron",

From b8fd0823e0770c2d5fdbd865bccf0d5e058e5287 Mon Sep 17 00:00:00 2001
From: Andrii Vladyka <tulup@mail.ru>
Date: Thu, 4 Jan 2018 13:09:17 +0200
Subject: [PATCH 735/808] net: core: fix module type in sock_diag_bind

Use AF_INET6 instead of AF_INET in IPv6-related code path

Signed-off-by: Andrii Vladyka <tulup@mail.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_diag.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 217f4e3b82f6..146b50e30659 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -288,7 +288,7 @@ static int sock_diag_bind(struct net *net, int group)
 	case SKNLGRP_INET6_UDP_DESTROY:
 		if (!sock_diag_handlers[AF_INET6])
 			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-				       NETLINK_SOCK_DIAG, AF_INET);
+				       NETLINK_SOCK_DIAG, AF_INET6);
 		break;
 	}
 	return 0;

From edd8ca8015800b354453b891d38960f3a474b7e4 Mon Sep 17 00:00:00 2001
From: Florian Margaine <florian@platform.sh>
Date: Wed, 13 Dec 2017 16:43:59 +0100
Subject: [PATCH 736/808] rbd: reacquire lock should update lock owner client
 id

Otherwise, future operations on this RBD using exclusive-lock are
going to require the lock from a non-existent client id.

Cc: stable@vger.kernel.org
Fixes: 14bb211d324d ("rbd: support updating the lock cookie without releasing the lock")
Link: http://tracker.ceph.com/issues/19929
Signed-off-by: Florian Margaine <florian@platform.sh>
[idryomov@gmail.com: rbd_set_owner_cid() call, __rbd_lock() helper]
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 38fc5f397fde..aacae6f7163e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3047,13 +3047,21 @@ static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
 	mutex_unlock(&rbd_dev->watch_mutex);
 }
 
+static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
+{
+	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
+
+	strcpy(rbd_dev->lock_cookie, cookie);
+	rbd_set_owner_cid(rbd_dev, &cid);
+	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
+}
+
 /*
  * lock_rwsem must be held for write
  */
 static int rbd_lock(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
 	char cookie[32];
 	int ret;
 
@@ -3068,9 +3076,7 @@ static int rbd_lock(struct rbd_device *rbd_dev)
 		return ret;
 
 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
-	strcpy(rbd_dev->lock_cookie, cookie);
-	rbd_set_owner_cid(rbd_dev, &cid);
-	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
+	__rbd_lock(rbd_dev, cookie);
 	return 0;
 }
 
@@ -3856,7 +3862,7 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
 			queue_delayed_work(rbd_dev->task_wq,
 					   &rbd_dev->lock_dwork, 0);
 	} else {
-		strcpy(rbd_dev->lock_cookie, cookie);
+		__rbd_lock(rbd_dev, cookie);
 	}
 }
 

From 21acdf45f4958135940f0b4767185cf911d4b010 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 21 Dec 2017 15:35:11 +0100
Subject: [PATCH 737/808] rbd: set max_segments to USHRT_MAX

Commit d3834fefcfe5 ("rbd: bump queue_max_segments") bumped
max_segments (unsigned short) to max_hw_sectors (unsigned int).
max_hw_sectors is set to the number of 512-byte sectors in an object
and overflows unsigned short for 32M (largest possible) objects, making
the block layer resort to handing us single segment (i.e. single page
or even smaller) bios in that case.

Cc: stable@vger.kernel.org
Fixes: d3834fefcfe5 ("rbd: bump queue_max_segments")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index aacae6f7163e..cc93522a6d41 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4387,7 +4387,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	segment_size = rbd_obj_bytes(&rbd_dev->header);
 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
 	q->limits.max_sectors = queue_max_hw_sectors(q);
-	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
+	blk_queue_max_segments(q, USHRT_MAX);
 	blk_queue_max_segment_size(q, segment_size);
 	blk_queue_io_min(q, segment_size);
 	blk_queue_io_opt(q, segment_size);

From 3dc2fa47549aca71773afdd12a78d31802bb22b4 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Date: Mon, 8 Jan 2018 19:43:00 +0800
Subject: [PATCH 738/808] net: caif: use strlcpy() instead of strncpy()

gcc-8 reports

net/caif/caif_dev.c: In function 'caif_enroll_dev':
./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may
be truncated copying 15 bytes from a string of length 15
[-Wstringop-truncation]

net/caif/cfctrl.c: In function 'cfctrl_linkup_request':
./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may
be truncated copying 15 bytes from a string of length 15
[-Wstringop-truncation]

net/caif/cfcnfg.c: In function 'caif_connect_client':
./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may
be truncated copying 15 bytes from a string of length 15
[-Wstringop-truncation]

The compiler require that the input param 'len' of strncpy() should be
greater than the length of the src string, so that '\0' is copied as
well. We can just use strlcpy() to avoid this warning.

Signed-off-by: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_dev.c |  5 ++---
 net/caif/cfcnfg.c   | 10 ++++------
 net/caif/cfctrl.c   |  4 ++--
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 2d38b6e34203..e0adcd123f48 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -334,9 +334,8 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
 	mutex_lock(&caifdevs->lock);
 	list_add_rcu(&caifd->list, &caifdevs->list);
 
-	strncpy(caifd->layer.name, dev->name,
-		sizeof(caifd->layer.name) - 1);
-	caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0;
+	strlcpy(caifd->layer.name, dev->name,
+		sizeof(caifd->layer.name));
 	caifd->layer.transmit = transmit;
 	cfcnfg_add_phy_layer(cfg,
 				dev,
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 273cb07f57d8..8f00bea093b9 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -268,17 +268,15 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
 	case CAIFPROTO_RFM:
 		l->linktype = CFCTRL_SRV_RFM;
 		l->u.datagram.connid = s->sockaddr.u.rfm.connection_id;
-		strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
-			sizeof(l->u.rfm.volume)-1);
-		l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0;
+		strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
+			sizeof(l->u.rfm.volume));
 		break;
 	case CAIFPROTO_UTIL:
 		l->linktype = CFCTRL_SRV_UTIL;
 		l->endpoint = 0x00;
 		l->chtype = 0x00;
-		strncpy(l->u.utility.name, s->sockaddr.u.util.service,
-			sizeof(l->u.utility.name)-1);
-		l->u.utility.name[sizeof(l->u.utility.name)-1] = 0;
+		strlcpy(l->u.utility.name, s->sockaddr.u.util.service,
+			sizeof(l->u.utility.name));
 		caif_assert(sizeof(l->u.utility.name) > 10);
 		l->u.utility.paramlen = s->param.size;
 		if (l->u.utility.paramlen > sizeof(l->u.utility.params))
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index f5afda1abc76..655ed7032150 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -258,8 +258,8 @@ int cfctrl_linkup_request(struct cflayer *layer,
 		tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
 		cfpkt_add_body(pkt, &tmp16, 2);
 		memset(utility_name, 0, sizeof(utility_name));
-		strncpy(utility_name, param->u.utility.name,
-			UTILITY_NAME_LENGTH - 1);
+		strlcpy(utility_name, param->u.utility.name,
+			UTILITY_NAME_LENGTH);
 		cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
 		tmp8 = param->u.utility.paramlen;
 		cfpkt_add_body(pkt, &tmp8, 1);

From 20b50d79974ea3192e8c3ab7faf4e536e5f14d8f Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nstange@suse.de>
Date: Mon, 8 Jan 2018 15:54:44 +0100
Subject: [PATCH 739/808] net: ipv4: emulate READ_ONCE() on ->hdrincl bit-field
 in raw_sendmsg()

Commit 8f659a03a0ba ("net: ipv4: fix for a race condition in
raw_sendmsg") fixed the issue of possibly inconsistent ->hdrincl handling
due to concurrent updates by reading this bit-field member into a local
variable and using the thus stabilized value in subsequent tests.

However, aforementioned commit also adds the (correct) comment that

  /* hdrincl should be READ_ONCE(inet->hdrincl)
   * but READ_ONCE() doesn't work with bit fields
   */

because as it stands, the compiler is free to shortcut or even eliminate
the local variable at its will.

Note that I have not seen anything like this happening in reality and thus,
the concern is a theoretical one.

However, in order to be on the safe side, emulate a READ_ONCE() on the
bit-field by doing it on the local 'hdrincl' variable itself:

	int hdrincl = inet->hdrincl;
	hdrincl = READ_ONCE(hdrincl);

This breaks the chain in the sense that the compiler is not allowed
to replace subsequent reads from hdrincl with reloads from inet->hdrincl.

Fixes: 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg")
Signed-off-by: Nicolai Stange <nstange@suse.de>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 125c1eab3eaa..5e570aa9e43b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -520,9 +520,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		goto out;
 
 	/* hdrincl should be READ_ONCE(inet->hdrincl)
-	 * but READ_ONCE() doesn't work with bit fields
+	 * but READ_ONCE() doesn't work with bit fields.
+	 * Doing this indirectly yields the same result.
 	 */
 	hdrincl = inet->hdrincl;
+	hdrincl = READ_ONCE(hdrincl);
 	/*
 	 *	Check the flags.
 	 */

From 2fdd18118dad86bf5e7880d8d02ea27be23e3671 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Mon, 8 Jan 2018 08:50:17 +0200
Subject: [PATCH 740/808] docs-rst: networking: wire up msg_zerocopy

Fix the following 'make htmldocs' complaint:

Documentation/networking/msg_zerocopy.rst:: WARNING: document isn't included in any toctree.

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 66e620866245..7d4b15977d61 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -9,6 +9,7 @@ Contents:
    batman-adv
    kapi
    z8530book
+   msg_zerocopy
 
 .. only::  subproject
 
@@ -16,4 +17,3 @@ Contents:
    =======
 
    * :ref:`genindex`
-

From 195e2addbce09e5afbc766efc1e6567c9ce840d3 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 6 Jan 2018 21:53:26 +0300
Subject: [PATCH 741/808] SolutionEngine771x: fix Ether platform data

The 'sh_eth' driver's probe() method would fail  on the SolutionEngine7710
board and crash on SolutionEngine7712 board  as the platform code is
hopelessly behind the driver's platform data --  it passes the PHY address
instead of 'struct sh_eth_plat_data *'; pass the latter to the driver in
order to fix the bug...

Fixes: 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sh/boards/mach-se/770x/setup.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c
index 77c35350ee77..b7fa7a87e946 100644
--- a/arch/sh/boards/mach-se/770x/setup.c
+++ b/arch/sh/boards/mach-se/770x/setup.c
@@ -9,6 +9,7 @@
  */
 #include <linux/init.h>
 #include <linux/platform_device.h>
+#include <linux/sh_eth.h>
 #include <mach-se/mach/se.h>
 #include <mach-se/mach/mrshpc.h>
 #include <asm/machvec.h>
@@ -115,6 +116,11 @@ static struct platform_device heartbeat_device = {
 #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\
 	defined(CONFIG_CPU_SUBTYPE_SH7712)
 /* SH771X Ethernet driver */
+static struct sh_eth_plat_data sh_eth_plat = {
+	.phy = PHY_ID,
+	.phy_interface = PHY_INTERFACE_MODE_MII,
+};
+
 static struct resource sh_eth0_resources[] = {
 	[0] = {
 		.start = SH_ETH0_BASE,
@@ -132,7 +138,7 @@ static struct platform_device sh_eth0_device = {
 	.name = "sh771x-ether",
 	.id = 0,
 	.dev = {
-		.platform_data = PHY_ID,
+		.platform_data = &sh_eth_plat,
 	},
 	.num_resources = ARRAY_SIZE(sh_eth0_resources),
 	.resource = sh_eth0_resources,
@@ -155,7 +161,7 @@ static struct platform_device sh_eth1_device = {
 	.name = "sh771x-ether",
 	.id = 1,
 	.dev = {
-		.platform_data = PHY_ID,
+		.platform_data = &sh_eth_plat,
 	},
 	.num_resources = ARRAY_SIZE(sh_eth1_resources),
 	.resource = sh_eth1_resources,

From f9a531d6731d74f1e24298d9641c2dc1fef2631b Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 6 Jan 2018 21:53:27 +0300
Subject: [PATCH 742/808] SolutionEngine771x: add Ether TSU resource

After the  Ether platform data is fixed, the driver probe() method would
still fail since the 'struct sh_eth_cpu_data' corresponding  to SH771x
indicates the presence of TSU but the memory resource for it is absent.
Add the missing TSU resource  to both Ether devices and fix the harmless
off-by-one error in the main memory resources, while at it...

Fixes: 4986b996882d ("net: sh_eth: remove the SH_TSU_ADDR")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sh/boards/mach-se/770x/setup.c | 14 ++++++++++++--
 arch/sh/include/mach-se/mach/se.h   |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c
index b7fa7a87e946..412326d59e6f 100644
--- a/arch/sh/boards/mach-se/770x/setup.c
+++ b/arch/sh/boards/mach-se/770x/setup.c
@@ -124,10 +124,15 @@ static struct sh_eth_plat_data sh_eth_plat = {
 static struct resource sh_eth0_resources[] = {
 	[0] = {
 		.start = SH_ETH0_BASE,
-		.end = SH_ETH0_BASE + 0x1B8,
+		.end = SH_ETH0_BASE + 0x1B8 - 1,
 		.flags = IORESOURCE_MEM,
 	},
 	[1] = {
+		.start = SH_TSU_BASE,
+		.end = SH_TSU_BASE + 0x200 - 1,
+		.flags = IORESOURCE_MEM,
+	},
+	[2] = {
 		.start = SH_ETH0_IRQ,
 		.end = SH_ETH0_IRQ,
 		.flags = IORESOURCE_IRQ,
@@ -147,10 +152,15 @@ static struct platform_device sh_eth0_device = {
 static struct resource sh_eth1_resources[] = {
 	[0] = {
 		.start = SH_ETH1_BASE,
-		.end = SH_ETH1_BASE + 0x1B8,
+		.end = SH_ETH1_BASE + 0x1B8 - 1,
 		.flags = IORESOURCE_MEM,
 	},
 	[1] = {
+		.start = SH_TSU_BASE,
+		.end = SH_TSU_BASE + 0x200 - 1,
+		.flags = IORESOURCE_MEM,
+	},
+	[2] = {
 		.start = SH_ETH1_IRQ,
 		.end = SH_ETH1_IRQ,
 		.flags = IORESOURCE_IRQ,
diff --git a/arch/sh/include/mach-se/mach/se.h b/arch/sh/include/mach-se/mach/se.h
index 4246ef9b07a3..aa83fe1ff0b1 100644
--- a/arch/sh/include/mach-se/mach/se.h
+++ b/arch/sh/include/mach-se/mach/se.h
@@ -100,6 +100,7 @@
 /* Base address */
 #define SH_ETH0_BASE 0xA7000000
 #define SH_ETH1_BASE 0xA7000400
+#define SH_TSU_BASE  0xA7000800
 /* PHY ID */
 #if defined(CONFIG_CPU_SUBTYPE_SH7710)
 # define PHY_ID 0x00

From 4512c43eac7e007d982e7ea45152ea6f3f4d1921 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Jan 2018 10:34:00 -0800
Subject: [PATCH 743/808] ipv6: remove null_entry before adding default route

In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.

In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.

syzkaller reported the following issue which is fixed by this commit:

WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
 #0:  ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
 #0:  ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
 #1:  (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
 #1:  (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
 #2:  (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
 #3:  (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
 #3:  (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:53
 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
 fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
 fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
 fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
 fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
 fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
 __fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
 fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
 fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
 fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
 call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
 expire_timers kernel/time/timer.c:1357 [inline]
 __run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:540 [inline]
 smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
 </IRQ>

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d11a5578e4f8..9dcc3924a975 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -640,6 +640,11 @@ static struct fib6_node *fib6_add_1(struct net *net,
 			if (!(fn->fn_flags & RTN_RTINFO)) {
 				RCU_INIT_POINTER(fn->leaf, NULL);
 				rt6_release(leaf);
+			/* remove null_entry in the root node */
+			} else if (fn->fn_flags & RTN_TL_ROOT &&
+				   rcu_access_pointer(fn->leaf) ==
+				   net->ipv6.ip6_null_entry) {
+				RCU_INIT_POINTER(fn->leaf, NULL);
 			}
 
 			return fn;
@@ -1270,13 +1275,17 @@ out:
 	return err;
 
 failure:
-	/* fn->leaf could be NULL if fn is an intermediate node and we
-	 * failed to add the new route to it in both subtree creation
-	 * failure and fib6_add_rt2node() failure case.
-	 * In both cases, fib6_repair_tree() should be called to fix
-	 * fn->leaf.
+	/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
+	 * 1. fn is an intermediate node and we failed to add the new
+	 * route to it in both subtree creation failure and fib6_add_rt2node()
+	 * failure case.
+	 * 2. fn is the root node in the table and we fail to add the first
+	 * default route to it.
 	 */
-	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
+	if (fn &&
+	    (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
+	     (fn->fn_flags & RTN_TL_ROOT &&
+	      !rcu_access_pointer(fn->leaf))))
 		fib6_repair_tree(info->nl_net, table, fn);
 	/* Always release dst as dst->__refcnt is guaranteed
 	 * to be taken before entering this function
@@ -1531,6 +1540,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 	struct fib6_walker *w;
 	int iter = 0;
 
+	/* Set fn->leaf to null_entry for root node. */
+	if (fn->fn_flags & RTN_TL_ROOT) {
+		rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry);
+		return fn;
+	}
+
 	for (;;) {
 		struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
 					    lockdep_is_held(&table->tb6_lock));
@@ -1685,10 +1700,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	}
 	read_unlock(&net->ipv6.fib6_walker_lock);
 
-	/* If it was last route, expunge its radix tree node */
+	/* If it was last route, call fib6_repair_tree() to:
+	 * 1. For root node, put back null_entry as how the table was created.
+	 * 2. For other nodes, expunge its radix tree node.
+	 */
 	if (!rcu_access_pointer(fn->leaf)) {
-		fn->fn_flags &= ~RTN_RTINFO;
-		net->ipv6.rt6_stats->fib_route_nodes--;
+		if (!(fn->fn_flags & RTN_TL_ROOT)) {
+			fn->fn_flags &= ~RTN_RTINFO;
+			net->ipv6.rt6_stats->fib_route_nodes--;
+		}
 		fn = fib6_repair_tree(net, table, fn);
 	}
 

From be95a845cc4402272994ce290e3ad928aff06cb9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Jan 2018 13:17:44 +0100
Subject: [PATCH 744/808] bpf: avoid false sharing of map refcount with
 max_entries

In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds
speculation") also change the layout of struct bpf_map such that
false sharing of fast-path members like max_entries is avoided
when the maps reference counter is altered. Therefore enforce
them to be placed into separate cachelines.

pahole dump after change:

  struct bpf_map {
        const struct bpf_map_ops  * ops;                 /*     0     8 */
        struct bpf_map *           inner_map_meta;       /*     8     8 */
        void *                     security;             /*    16     8 */
        enum bpf_map_type          map_type;             /*    24     4 */
        u32                        key_size;             /*    28     4 */
        u32                        value_size;           /*    32     4 */
        u32                        max_entries;          /*    36     4 */
        u32                        map_flags;            /*    40     4 */
        u32                        pages;                /*    44     4 */
        u32                        id;                   /*    48     4 */
        int                        numa_node;            /*    52     4 */
        bool                       unpriv_array;         /*    56     1 */

        /* XXX 7 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        struct user_struct *       user;                 /*    64     8 */
        atomic_t                   refcnt;               /*    72     4 */
        atomic_t                   usercnt;              /*    76     4 */
        struct work_struct         work;                 /*    80    32 */
        char                       name[16];             /*   112    16 */
        /* --- cacheline 2 boundary (128 bytes) --- */

        /* size: 128, cachelines: 2, members: 17 */
        /* sum members: 121, holes: 1, sum holes: 7 */
  };

Now all entries in the first cacheline are read only throughout
the life time of the map, set up once during map creation. Overall
struct size and number of cachelines doesn't change from the
reordering. struct bpf_map is usually first member and embedded
in map structs in specific map implementations, so also avoid those
members to sit at the end where it could potentially share the
cacheline with first map values e.g. in the array since remote
CPUs could trigger map updates just as well for those (easily
dirtying members like max_entries intentionally as well) while
having subsequent values in cache.

Quoting from Google's Project Zero blog [1]:

  Additionally, at least on the Intel machine on which this was
  tested, bouncing modified cache lines between cores is slow,
  apparently because the MESI protocol is used for cache coherence
  [8]. Changing the reference counter of an eBPF array on one
  physical CPU core causes the cache line containing the reference
  counter to be bounced over to that CPU core, making reads of the
  reference counter on all other CPU cores slow until the changed
  reference counter has been written back to memory. Because the
  length and the reference counter of an eBPF array are stored in
  the same cache line, this also means that changing the reference
  counter on one physical CPU core causes reads of the eBPF array's
  length to be slow on other physical CPU cores (intentional false
  sharing).

While this doesn't 'control' the out-of-bounds speculation through
masking the index as in commit b2157399cc98, triggering a manipulation
of the map's reference counter is really trivial, so lets not allow
to easily affect max_entries from it.

Splitting to separate cachelines also generally makes sense from
a performance perspective anyway in that fast-path won't have a
cache miss if the map gets pinned, reused in other progs, etc out
of control path, thus also avoids unintentional false sharing.

  [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b985ca4ffbe..fe2cb7c398e3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -43,7 +43,14 @@ struct bpf_map_ops {
 };
 
 struct bpf_map {
-	atomic_t refcnt;
+	/* 1st cacheline with read-mostly members of which some
+	 * are also accessed in fast-path (e.g. ops, max_entries).
+	 */
+	const struct bpf_map_ops *ops ____cacheline_aligned;
+	struct bpf_map *inner_map_meta;
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 	enum bpf_map_type map_type;
 	u32 key_size;
 	u32 value_size;
@@ -53,15 +60,16 @@ struct bpf_map {
 	u32 id;
 	int numa_node;
 	bool unpriv_array;
-	struct user_struct *user;
-	const struct bpf_map_ops *ops;
-	struct work_struct work;
+	/* 7 bytes hole */
+
+	/* 2nd cacheline with misc members to avoid false sharing
+	 * particularly with refcounting.
+	 */
+	struct user_struct *user ____cacheline_aligned;
+	atomic_t refcnt;
 	atomic_t usercnt;
-	struct bpf_map *inner_map_meta;
+	struct work_struct work;
 	char name[BPF_OBJ_NAME_LEN];
-#ifdef CONFIG_SECURITY
-	void *security;
-#endif
 };
 
 /* function argument constraints */

From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 9 Jan 2018 10:04:29 -0800
Subject: [PATCH 745/808] bpf: introduce BPF_JIT_ALWAYS_ON config

The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715.

A quote from goolge project zero blog:
"At this point, it would normally be necessary to locate gadgets in
the host kernel code that can be used to actually leak data by reading
from an attacker-controlled location, shifting and masking the result
appropriately and then using the result of that as offset to an
attacker-controlled address for a load. But piecing gadgets together
and figuring out which ones work in a speculation context seems annoying.
So instead, we decided to use the eBPF interpreter, which is built into
the host kernel - while there is no legitimate way to invoke it from inside
a VM, the presence of the code in the host kernel's text section is sufficient
to make it usable for the attack, just like with ordinary ROP gadgets."

To make attacker job harder introduce BPF_JIT_ALWAYS_ON config
option that removes interpreter from the kernel in favor of JIT-only mode.
So far eBPF JIT is supported by:
x64, arm64, arm32, sparc64, s390, powerpc64, mips64

The start of JITed program is randomized and code page is marked as read-only.
In addition "constant blinding" can be turned on with net.core.bpf_jit_harden

v2->v3:
- move __bpf_prog_ret0 under ifdef (Daniel)

v1->v2:
- fix init order, test_bpf and cBPF (Daniel's feedback)
- fix offloaded bpf (Jakub's feedback)
- add 'return 0' dummy in case something can invoke prog->bpf_func
- retarget bpf tree. For bpf-next the patch would need one extra hunk.
  It will be sent when the trees are merged back to net-next

Considered doing:
  int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT;
but it seems better to land the patch as-is and in bpf-next remove
bpf_jit_enable global variable from all JITs, consolidate in one place
and remove this jit_init() function.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 init/Kconfig               |  7 +++++++
 kernel/bpf/core.c          | 19 +++++++++++++++++++
 lib/test_bpf.c             | 11 +++++++----
 net/core/filter.c          |  6 ++----
 net/core/sysctl_net_core.c |  6 ++++++
 net/socket.c               |  9 +++++++++
 6 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 2934249fba46..5e2a4a391ba9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1392,6 +1392,13 @@ config BPF_SYSCALL
 	  Enable the bpf() system call that allows to manipulate eBPF
 	  programs and maps via file descriptors.
 
+config BPF_JIT_ALWAYS_ON
+	bool "Permanently enable BPF JIT and remove BPF interpreter"
+	depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+	help
+	  Enables BPF JIT and removes BPF interpreter to avoid
+	  speculative execution of BPF instructions by the interpreter
+
 config USERFAULTFD
 	bool "Enable userfaultfd() system call"
 	select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa26ee8..51ec2dda7f08 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
  *	@ctx: is the data we are operating on
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 };
 
+#else
+static unsigned int __bpf_prog_ret0(const void *ctx,
+				    const struct bpf_insn *insn)
+{
+	return 0;
+}
+#endif
+
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+	fp->bpf_func = __bpf_prog_ret0;
+#endif
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 */
 	if (!bpf_prog_is_dev_bound(fp->aux)) {
 		fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		if (!fp->jited) {
+			*err = -ENOTSUPP;
+			return fp;
+		}
+#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 9e9748089270..f369889e521d 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6250,9 +6250,8 @@ static struct bpf_prog *generate_filter(int which, int *err)
 				return NULL;
 			}
 		}
-		/* We don't expect to fail. */
 		if (*err) {
-			pr_cont("FAIL to attach err=%d len=%d\n",
+			pr_cont("FAIL to prog_create err=%d len=%d\n",
 				*err, fprog.len);
 			return NULL;
 		}
@@ -6276,6 +6275,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		 * checks.
 		 */
 		fp = bpf_prog_select_runtime(fp, err);
+		if (*err) {
+			pr_cont("FAIL to select_runtime err=%d\n", *err);
+			return NULL;
+		}
 		break;
 	}
 
@@ -6461,8 +6464,8 @@ static __init int test_bpf(void)
 				pass_cnt++;
 				continue;
 			}
-
-			return err;
+			err_cnt++;
+			continue;
 		}
 
 		pr_cont("jited:%u ", fp->jited);
diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67fafce..d339ef170df6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1054,11 +1054,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 		 */
 		goto out_err_free;
 
-	/* We are guaranteed to never error here with cBPF to eBPF
-	 * transitions, since there's no issue with type compatibility
-	 * checks on program arrays.
-	 */
 	fp = bpf_prog_select_runtime(fp, &err);
+	if (err)
+		goto out_err_free;
 
 	kfree(old_prog);
 	return fp;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cbc3dde4cfcc..a47ad6cd41c0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -325,7 +325,13 @@ static struct ctl_table net_core_table[] = {
 		.data		= &bpf_jit_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 		.proc_handler	= proc_dointvec
+#else
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+#endif
 	},
 # ifdef CONFIG_HAVE_EBPF_JIT
 	{
diff --git a/net/socket.c b/net/socket.c
index 05f361faec45..78acd6ce74c7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2619,6 +2619,15 @@ out_fs:
 
 core_initcall(sock_init);	/* early initcall */
 
+static int __init jit_init(void)
+{
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+	bpf_jit_enable = 1;
+#endif
+	return 0;
+}
+pure_initcall(jit_init);
+
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {

From 541676078b52f365f53d46ee5517d305cd1b6350 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 15 Dec 2017 14:23:10 -0500
Subject: [PATCH 746/808] membarrier: Disable preemption when calling
 smp_call_function_many()

smp_call_function_many() requires disabling preemption around the call.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: <stable@vger.kernel.org> # v4.14+
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171215192310.25293-1-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/membarrier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
 		rcu_read_unlock();
 	}
 	if (!fallback) {
+		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		preempt_enable();
 		free_cpumask_var(tmpmask);
 	}
 	cpus_read_unlock();

From 6c7d47c33ed323f14f2a3b8de925e831dbaa4e69 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 22 Nov 2017 14:42:21 +1100
Subject: [PATCH 747/808] KVM: PPC: Book3S PR: Fix WIMG handling under pHyp

Commit 96df226 ("KVM: PPC: Book3S PR: Preserve storage control bits")
added code to preserve WIMG bits but it missed 2 special cases:
- a magic page in kvmppc_mmu_book3s_64_xlate() and
- guest real mode in kvmppc_handle_pagefault().

For these ptes, WIMG was 0 and pHyp failed on these causing a guest to
stop in the very beginning at NIP=0x100 (due to bd9166ffe "KVM: PPC:
Book3S PR: Exit KVM on failed mapping").

According to LoPAPR v1.1 14.5.4.1.2 H_ENTER:

 The hypervisor checks that the WIMG bits within the PTE are appropriate
 for the physical page number else H_Parameter return. (For System Memory
 pages WIMG=0010, or, 1110 if the SAO option is enabled, and for IO pages
 WIMG=01**.)

This hence initializes WIMG to non-zero value HPTE_R_M (0x10), as expected
by pHyp.

[paulus@ozlabs.org - fix compile for 32-bit]

Cc: stable@vger.kernel.org # v4.11+
Fixes: 96df226 "KVM: PPC: Book3S PR: Preserve storage control bits"
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Tested-by: Ruediger Oertel <ro@suse.de>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu.c | 1 +
 arch/powerpc/kvm/book3s_pr.c     | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 29ebe2fd5867..a93d719edc90 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		gpte->may_read = true;
 		gpte->may_write = true;
 		gpte->page_size = MMU_PAGE_4K;
+		gpte->wimg = HPTE_R_M;
 
 		return 0;
 	}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d0dc8624198f..7deaeeb14b93 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 #define MSR_USER32 MSR_USER
 #define MSR_USER64 MSR_USER
 #define HW_PAGE_SIZE PAGE_SIZE
+#define HPTE_R_M   _PAGE_COHERENT
 #endif
 
 static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
@@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.eaddr = eaddr;
 		pte.vpage = eaddr >> 12;
 		pte.page_size = MMU_PAGE_64K;
+		pte.wimg = HPTE_R_M;
 	}
 
 	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {

From ecba8297aafd50db6ae867e90844eead1611ef1c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 10 Jan 2018 17:04:39 +1100
Subject: [PATCH 748/808] KVM: PPC: Book3S HV: Always flush TLB in
 kvmppc_alloc_reset_hpt()

The KVM_PPC_ALLOCATE_HTAB ioctl(), implemented by kvmppc_alloc_reset_hpt()
is supposed to completely clear and reset a guest's Hashed Page Table (HPT)
allocating or re-allocating it if necessary.

In the case where an HPT of the right size already exists and it just
zeroes it, it forces a TLB flush on all guest CPUs, to remove any stale TLB
entries loaded from the old HPT.

However, that situation can arise when the HPT is resizing as well - or
even when switching from an RPT to HPT - so those cases need a TLB flush as
well.

So, move the TLB flush to trigger in all cases except for errors.

Cc: stable@vger.kernel.org # v4.10+
Fixes: f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size")
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8355398f0bb6..b73dbc9e797d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -165,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 		 * Reset all the reverse-mapping chains for all memslots
 		 */
 		kvmppc_rmap_reset(kvm);
-		/* Ensure that each vcpu will flush its TLB on next entry. */
-		cpumask_setall(&kvm->arch.need_tlb_flush);
 		err = 0;
 		goto out;
 	}
@@ -182,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 	kvmppc_set_hpt(kvm, &info);
 
 out:
+	if (err == 0)
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	mutex_unlock(&kvm->lock);
 	return err;
 }

From aa8a5e0062ac940f7659394f4817c948dc8c0667 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 749/808] powerpc/64s: Add support for RFI flush of L1-D cache

On some CPUs we can prevent the Meltdown vulnerability by flushing the
L1-D cache on exit from kernel to user mode, and from hypervisor to
guest.

This is known to be the case on at least Power7, Power8 and Power9. At
this time we do not know the status of the vulnerability on other CPUs
such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale
CPUs. As more information comes to light we can enable this, or other
mechanisms on those CPUs.

The vulnerability occurs when the load of an architecturally
inaccessible memory region (eg. userspace load of kernel memory) is
speculatively executed to the point where its result can influence the
address of a subsequent speculatively executed load.

In order for that to happen, the first load must hit in the L1,
because before the load is sent to the L2 the permission check is
performed. Therefore if no kernel addresses hit in the L1 the
vulnerability can not occur. We can ensure that is the case by
flushing the L1 whenever we return to userspace. Similarly for
hypervisor vs guest.

In order to flush the L1-D cache on exit, we add a section of nops at
each (h)rfi location that returns to a lower privileged context, and
patch that with some sequence. Newer firmwares are able to advertise
to us that there is a special nop instruction that flushes the L1-D.
If we do not see that advertised, we fall back to doing a displacement
flush in software.

For guest kernels we support migration between some CPU versions, and
different CPUs may use different flush instructions. So that we are
prepared to migrate to a machine with a different flush instruction
activated, we may have to patch more than one flush instruction at
boot if the hypervisor tells us to.

In the end this patch is mostly the work of Nicholas Piggin and
Michael Ellerman. However a cast of thousands contributed to analysis
of the issue, earlier versions of the patch, back ports testing etc.
Many thanks to all of them.

Tested-by: Jon Masters <jcm@redhat.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h  | 40 ++++++++---
 arch/powerpc/include/asm/feature-fixups.h | 13 ++++
 arch/powerpc/include/asm/paca.h           | 10 +++
 arch/powerpc/include/asm/setup.h          | 13 ++++
 arch/powerpc/kernel/asm-offsets.c         |  5 ++
 arch/powerpc/kernel/exceptions-64s.S      | 84 +++++++++++++++++++++++
 arch/powerpc/kernel/setup_64.c            | 79 +++++++++++++++++++++
 arch/powerpc/kernel/vmlinux.lds.S         |  9 +++
 arch/powerpc/lib/feature-fixups.c         | 41 +++++++++++
 9 files changed, 286 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index dfc56daed98b..7197b179c1b1 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -74,34 +74,58 @@
  */
 #define EX_R3		EX_DAR
 
-/* Macros for annotating the expected destination of (h)rfid */
+/*
+ * Macros for annotating the expected destination of (h)rfid
+ *
+ * The nop instructions allow us to insert one or more instructions to flush the
+ * L1-D cache when returning to userspace or a guest.
+ */
+#define RFI_FLUSH_SLOT							\
+	RFI_FLUSH_FIXUP_SECTION;					\
+	nop;								\
+	nop;								\
+	nop
 
 #define RFI_TO_KERNEL							\
 	rfid
 
 #define RFI_TO_USER							\
-	rfid
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
 
 #define RFI_TO_USER_OR_KERNEL						\
-	rfid
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
 
 #define RFI_TO_GUEST							\
-	rfid
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
 
 #define HRFI_TO_KERNEL							\
 	hrfid
 
 #define HRFI_TO_USER							\
-	hrfid
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
 
 #define HRFI_TO_USER_OR_KERNEL						\
-	hrfid
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
 
 #define HRFI_TO_GUEST							\
-	hrfid
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
 
 #define HRFI_TO_UNKNOWN							\
-	hrfid
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
 
 #ifdef CONFIG_RELOCATABLE
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 8f88f771cc55..1e82eb3caabd 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -187,7 +187,20 @@ label##3:					       	\
 	FTR_ENTRY_OFFSET label##1b-label##3b;		\
 	.popsection;
 
+#define RFI_FLUSH_FIXUP_SECTION				\
+951:							\
+	.pushsection __rfi_flush_fixup,"a";		\
+	.align 2;					\
+952:							\
+	FTR_ENTRY_OFFSET 951b-952b;			\
+	.popsection;
+
+
 #ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
+
 void apply_feature_fixups(void);
 void setup_feature_keys(void);
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 3892db93b837..23ac7fc0af23 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -232,6 +232,16 @@ struct paca_struct {
 	struct sibling_subcore_state *sibling_subcore_state;
 #endif
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * rfi fallback flush must be in its own cacheline to prevent
+	 * other paca data leaking into the L1d
+	 */
+	u64 exrfi[EX_SIZE] __aligned(0x80);
+	void *rfi_flush_fallback_area;
+	u64 l1d_flush_congruence;
+	u64 l1d_flush_sets;
+#endif
 };
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index cf00ec26303a..469b7fdc9be4 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -39,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {}
 static inline void pseries_little_endian_exceptions(void) {}
 #endif /* CONFIG_PPC_PSERIES */
 
+void rfi_flush_enable(bool enable);
+
+/* These are bit flags */
+enum l1d_flush_type {
+	L1D_FLUSH_NONE		= 0x1,
+	L1D_FLUSH_FALLBACK	= 0x2,
+	L1D_FLUSH_ORI		= 0x4,
+	L1D_FLUSH_MTTRIG	= 0x8,
+};
+
+void __init setup_rfi_flush(enum l1d_flush_type, bool enable);
+void do_rfi_flush_fixups(enum l1d_flush_type types);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif	/* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6b958414b4e0..f390d57cf2e1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -237,6 +237,11 @@ int main(void)
 	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
 	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
+	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
+	OFFSET(PACA_EXRFI, paca_struct, exrfi);
+	OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
+	OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+
 #endif
 	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
 	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ed356194f09c..2dc10bf646b8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1449,6 +1449,90 @@ masked_##_H##interrupt:					\
 	b	.;					\
 	MASKED_DEC_HANDLER(_H)
 
+TRAMP_REAL_BEGIN(rfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	std	r12,PACA_EXRFI+EX_R12(r13)
+	std	r8,PACA_EXRFI+EX_R13(r13)
+	mfctr	r9
+	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+	ld	r11,PACA_L1D_FLUSH_SETS(r13)
+	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+	/*
+	 * The load adresses are at staggered offsets within cachelines,
+	 * which suits some pipelines better (on others it should not
+	 * hurt).
+	 */
+	addi	r12,r12,8
+	mtctr	r11
+	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+	/* order ld/st prior to dcbt stop all streams with flushing */
+	sync
+1:	li	r8,0
+	.rept	8 /* 8-way set associative */
+	ldx	r11,r10,r8
+	add	r8,r8,r12
+	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
+	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
+	.endr
+	addi	r10,r10,128 /* 128 byte cache line */
+	bdnz	1b
+
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r12,PACA_EXRFI+EX_R12(r13)
+	ld	r8,PACA_EXRFI+EX_R13(r13)
+	GET_SCRATCH0(r13);
+	rfid
+
+TRAMP_REAL_BEGIN(hrfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	std	r12,PACA_EXRFI+EX_R12(r13)
+	std	r8,PACA_EXRFI+EX_R13(r13)
+	mfctr	r9
+	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+	ld	r11,PACA_L1D_FLUSH_SETS(r13)
+	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+	/*
+	 * The load adresses are at staggered offsets within cachelines,
+	 * which suits some pipelines better (on others it should not
+	 * hurt).
+	 */
+	addi	r12,r12,8
+	mtctr	r11
+	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+	/* order ld/st prior to dcbt stop all streams with flushing */
+	sync
+1:	li	r8,0
+	.rept	8 /* 8-way set associative */
+	ldx	r11,r10,r8
+	add	r8,r8,r12
+	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
+	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
+	.endr
+	addi	r10,r10,128 /* 128 byte cache line */
+	bdnz	1b
+
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r12,PACA_EXRFI+EX_R12(r13)
+	ld	r8,PACA_EXRFI+EX_R13(r13)
+	GET_SCRATCH0(r13);
+	hrfid
+
 /*
  * Real mode exceptions actually use this too, but alternate
  * instruction code patches (which end up in the common .text area)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 8956a9856604..96163f4c3673 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -801,3 +801,82 @@ static int __init disable_hardlockup_detector(void)
 	return 0;
 }
 early_initcall(disable_hardlockup_detector);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static enum l1d_flush_type enabled_flush_types;
+static void *l1d_flush_fallback_area;
+bool rfi_flush;
+
+static void do_nothing(void *unused)
+{
+	/*
+	 * We don't need to do the flush explicitly, just enter+exit kernel is
+	 * sufficient, the RFI exit handlers will do the right thing.
+	 */
+}
+
+void rfi_flush_enable(bool enable)
+{
+	if (rfi_flush == enable)
+		return;
+
+	if (enable) {
+		do_rfi_flush_fixups(enabled_flush_types);
+		on_each_cpu(do_nothing, NULL, 1);
+	} else
+		do_rfi_flush_fixups(L1D_FLUSH_NONE);
+
+	rfi_flush = enable;
+}
+
+static void init_fallback_flush(void)
+{
+	u64 l1d_size, limit;
+	int cpu;
+
+	l1d_size = ppc64_caches.l1d.size;
+	limit = min(safe_stack_limit(), ppc64_rma_size);
+
+	/*
+	 * Align to L1d size, and size it at 2x L1d size, to catch possible
+	 * hardware prefetch runoff. We don't have a recipe for load patterns to
+	 * reliably avoid the prefetcher.
+	 */
+	l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
+	memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * The fallback flush is currently coded for 8-way
+		 * associativity. Different associativity is possible, but it
+		 * will be treated as 8-way and may not evict the lines as
+		 * effectively.
+		 *
+		 * 128 byte lines are mandatory.
+		 */
+		u64 c = l1d_size / 8;
+
+		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
+		paca[cpu].l1d_flush_congruence = c;
+		paca[cpu].l1d_flush_sets = c / 128;
+	}
+}
+
+void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
+{
+	if (types & L1D_FLUSH_FALLBACK) {
+		pr_info("rfi-flush: Using fallback displacement flush\n");
+		init_fallback_flush();
+	}
+
+	if (types & L1D_FLUSH_ORI)
+		pr_info("rfi-flush: Using ori type flush\n");
+
+	if (types & L1D_FLUSH_MTTRIG)
+		pr_info("rfi-flush: Using mttrig type flush\n");
+
+	enabled_flush_types = types;
+
+	rfi_flush_enable(enable);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 0494e1566ee2..307843d23682 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -132,6 +132,15 @@ SECTIONS
 	/* Read-only data */
 	RO_DATA(PAGE_SIZE)
 
+#ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+		__start___rfi_flush_fixup = .;
+		*(__rfi_flush_fixup)
+		__stop___rfi_flush_fixup = .;
+	}
+#endif
+
 	EXCEPTION_TABLE(0)
 
 	NOTES :kernel :notes
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 41cf5ae273cf..a95ea007d654 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 	}
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+void do_rfi_flush_fixups(enum l1d_flush_type types)
+{
+	unsigned int instrs[3], *dest;
+	long *start, *end;
+	int i;
+
+	start = PTRRELOC(&__start___rfi_flush_fixup),
+	end = PTRRELOC(&__stop___rfi_flush_fixup);
+
+	instrs[0] = 0x60000000; /* nop */
+	instrs[1] = 0x60000000; /* nop */
+	instrs[2] = 0x60000000; /* nop */
+
+	if (types & L1D_FLUSH_FALLBACK)
+		/* b .+16 to fallback flush */
+		instrs[0] = 0x48000010;
+
+	i = 0;
+	if (types & L1D_FLUSH_ORI) {
+		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
+		instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/
+	}
+
+	if (types & L1D_FLUSH_MTTRIG)
+		instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+
+		patch_instruction(dest, instrs[0]);
+		patch_instruction(dest + 1, instrs[1]);
+		patch_instruction(dest + 2, instrs[2]);
+	}
+
+	printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
 	long *start, *end;

From bc9c9304a45480797e13a8e1df96ffcf44fb62fe Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 750/808] powerpc/64s: Support disabling RFI flush with
 no_rfi_flush and nopti

Because there may be some performance overhead of the RFI flush, add
kernel command line options to disable it.

We add a sensibly named 'no_rfi_flush' option, but we also hijack the
x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we
see 'nopti' we can guess that the user is trying to avoid any overhead
of Meltdown mitigations, and it means we don't have to educate every
one about a different command line option.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 96163f4c3673..491be4179ddd 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -805,8 +805,29 @@ early_initcall(disable_hardlockup_detector);
 #ifdef CONFIG_PPC_BOOK3S_64
 static enum l1d_flush_type enabled_flush_types;
 static void *l1d_flush_fallback_area;
+static bool no_rfi_flush;
 bool rfi_flush;
 
+static int __init handle_no_rfi_flush(char *p)
+{
+	pr_info("rfi-flush: disabled on command line.");
+	no_rfi_flush = true;
+	return 0;
+}
+early_param("no_rfi_flush", handle_no_rfi_flush);
+
+/*
+ * The RFI flush is not KPTI, but because users will see doco that says to use
+ * nopti we hijack that option here to also disable the RFI flush.
+ */
+static int __init handle_no_pti(char *p)
+{
+	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");
+	handle_no_rfi_flush(NULL);
+	return 0;
+}
+early_param("nopti", handle_no_pti);
+
 static void do_nothing(void *unused)
 {
 	/*
@@ -877,6 +898,7 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
 
 	enabled_flush_types = types;
 
-	rfi_flush_enable(enable);
+	if (!no_rfi_flush)
+		rfi_flush_enable(enable);
 }
 #endif /* CONFIG_PPC_BOOK3S_64 */

From 8989d56878a7735dfdb234707a2fee6faf631085 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 751/808] powerpc/pseries: Query hypervisor for RFI flush
 settings

A new hypervisor call is available which tells the guest settings
related to the RFI flush. Use it to query the appropriate flush
instruction(s), and whether the flush is required.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/setup.c | 35 ++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index a8531e012658..ae4f596273b5 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void)
 	of_pci_check_probe_only();
 }
 
+static void pseries_setup_rfi_flush(void)
+{
+	struct h_cpu_char_result result;
+	enum l1d_flush_type types;
+	bool enable;
+	long rc;
+
+	/* Enable by default */
+	enable = true;
+
+	rc = plpar_get_cpu_characteristics(&result);
+	if (rc == H_SUCCESS) {
+		types = L1D_FLUSH_NONE;
+
+		if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
+			types |= L1D_FLUSH_MTTRIG;
+		if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30)
+			types |= L1D_FLUSH_ORI;
+
+		/* Use fallback if nothing set in hcall */
+		if (types == L1D_FLUSH_NONE)
+			types = L1D_FLUSH_FALLBACK;
+
+		if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
+			enable = false;
+	} else {
+		/* Default to fallback if case hcall is not available */
+		types = L1D_FLUSH_FALLBACK;
+	}
+
+	setup_rfi_flush(types, enable);
+}
+
 static void __init pSeries_setup_arch(void)
 {
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
@@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void)
 
 	fwnmi_init();
 
+	pseries_setup_rfi_flush();
+
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);
 

From 6e032b350cd1fdb830f18f8320ef0e13b4e24094 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Wed, 10 Jan 2018 03:07:15 +1100
Subject: [PATCH 752/808] powerpc/powernv: Check device-tree for RFI flush
 settings

New device-tree properties are available which tell the hypervisor
settings related to the RFI flush. Use them to determine the
appropriate flush instruction to use, and whether the flush is
required.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/setup.c | 49 ++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 1edfbc1e40f4..4fb21e17504a 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -37,13 +37,62 @@
 #include <asm/kexec.h>
 #include <asm/smp.h>
 #include <asm/tm.h>
+#include <asm/setup.h>
 
 #include "powernv.h"
 
+static void pnv_setup_rfi_flush(void)
+{
+	struct device_node *np, *fw_features;
+	enum l1d_flush_type type;
+	int enable;
+
+	/* Default to fallback in case fw-features are not available */
+	type = L1D_FLUSH_FALLBACK;
+	enable = 1;
+
+	np = of_find_node_by_name(NULL, "ibm,opal");
+	fw_features = of_get_child_by_name(np, "fw-features");
+	of_node_put(np);
+
+	if (fw_features) {
+		np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2");
+		if (np && of_property_read_bool(np, "enabled"))
+			type = L1D_FLUSH_MTTRIG;
+
+		of_node_put(np);
+
+		np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0");
+		if (np && of_property_read_bool(np, "enabled"))
+			type = L1D_FLUSH_ORI;
+
+		of_node_put(np);
+
+		/* Enable unless firmware says NOT to */
+		enable = 2;
+		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0");
+		if (np && of_property_read_bool(np, "disabled"))
+			enable--;
+
+		of_node_put(np);
+
+		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1");
+		if (np && of_property_read_bool(np, "disabled"))
+			enable--;
+
+		of_node_put(np);
+		of_node_put(fw_features);
+	}
+
+	setup_rfi_flush(type, enable > 0);
+}
+
 static void __init pnv_setup_arch(void)
 {
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
 
+	pnv_setup_rfi_flush();
+
 	/* Initialize SMP */
 	pnv_smp_init();
 

From d780537f9b49e9d714a454e5ed989d909beab8ec Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 10 Jan 2018 13:04:58 +0100
Subject: [PATCH 753/808] drm/tegra: sor: Fix hang on Tegra124 eDP

The SOR0 found on Tegra124 and Tegra210 only supports eDP and LVDS and
therefore has a slightly different clock tree than the SOR1 which does
not support eDP, but HDMI and DP instead.

Commit e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock") breaks
setups with eDP because the sor->clk_out clock is uninitialized and
therefore setting the parent clock (either the safe clock or either of
the display PLLs) fails, which can cause hangs later on since there is
no clock driving the module.

Fix this by falling back to the module clock for sor->clk_out on those
setups. This guarantees that the module will always be clocked by an
enabled clock and hence prevents those hangs.

Fixes: e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock")
Reported-by: Guillaume Tucker <guillaume.tucker@collabora.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/sor.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index b0a1dedac802..476079f1255f 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -2656,6 +2656,9 @@ static int tegra_sor_probe(struct platform_device *pdev)
 				name, err);
 			goto remove;
 		}
+	} else {
+		/* fall back to the module clock on SOR0 (eDP/LVDS only) */
+		sor->clk_out = sor->clk;
 	}
 
 	sor->clk_parent = devm_clk_get(&pdev->dev, "parent");

From 1e77fc82110ac36febf46c1e2782f504f7d23099 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 9 Jan 2018 19:08:21 +0100
Subject: [PATCH 754/808] gpio: Add missing open drain/source handling to
 gpiod_set_value_cansleep()

Since commit f11a04464ae57e8d ("i2c: gpio: Enable working over slow
can_sleep GPIOs"), probing the i2c RTC connected to an i2c-gpio bus on
r8a7740/armadillo fails with:

    rtc-s35390a 0-0030: error resetting chip
    rtc-s35390a: probe of 0-0030 failed with error -5

More debug code reveals:

    i2c i2c-0: master_xfer[0] R, addr=0x30, len=1
    i2c i2c-0: NAK from device addr 0x30 msg #0
    s35390a_get_reg: ret = -6

Commit 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to
actually be raw") moved open drain/source handling from
gpiod_set_raw_value_commit() to gpiod_set_value(), but forgot to take
into account that gpiod_set_value_cansleep() also needs this handling.
The i2c protocol mandates that i2c signals are open drain, hence i2c
communication fails.

Fix this by adding the missing handling to gpiod_set_value_cansleep(),
using a new common helper gpiod_set_value_nocheck().

Fixes: 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to actually be raw")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
[removed underscore syntax, added kerneldoc]
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 44332b793718..14532d9576e4 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2892,6 +2892,27 @@ void gpiod_set_raw_value(struct gpio_desc *desc, int value)
 }
 EXPORT_SYMBOL_GPL(gpiod_set_raw_value);
 
+/**
+ * gpiod_set_value_nocheck() - set a GPIO line value without checking
+ * @desc: the descriptor to set the value on
+ * @value: value to set
+ *
+ * This sets the value of a GPIO line backing a descriptor, applying
+ * different semantic quirks like active low and open drain/source
+ * handling.
+ */
+static void gpiod_set_value_nocheck(struct gpio_desc *desc, int value)
+{
+	if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
+		value = !value;
+	if (test_bit(FLAG_OPEN_DRAIN, &desc->flags))
+		gpio_set_open_drain_value_commit(desc, value);
+	else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags))
+		gpio_set_open_source_value_commit(desc, value);
+	else
+		gpiod_set_raw_value_commit(desc, value);
+}
+
 /**
  * gpiod_set_value() - assign a gpio's value
  * @desc: gpio whose value will be assigned
@@ -2906,16 +2927,8 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_value);
 void gpiod_set_value(struct gpio_desc *desc, int value)
 {
 	VALIDATE_DESC_VOID(desc);
-	/* Should be using gpiod_set_value_cansleep() */
 	WARN_ON(desc->gdev->chip->can_sleep);
-	if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
-		value = !value;
-	if (test_bit(FLAG_OPEN_DRAIN, &desc->flags))
-		gpio_set_open_drain_value_commit(desc, value);
-	else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags))
-		gpio_set_open_source_value_commit(desc, value);
-	else
-		gpiod_set_raw_value_commit(desc, value);
+	gpiod_set_value_nocheck(desc, value);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_value);
 
@@ -3243,9 +3256,7 @@ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value)
 {
 	might_sleep_if(extra_checks);
 	VALIDATE_DESC_VOID(desc);
-	if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
-		value = !value;
-	gpiod_set_raw_value_commit(desc, value);
+	gpiod_set_value_nocheck(desc, value);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
 

From 951a010233625b77cde3430b4b8785a9a22968d1 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Tue, 9 Jan 2018 12:10:21 +0000
Subject: [PATCH 755/808] xen/gntdev: Fix off-by-one error when unmapping with
 holes

If the requested range has a hole, the calculation of the number of
pages to unmap is off by one. Fix it.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/gntdev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 57efbd3b053b..d3391a1e3796 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -380,10 +380,8 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
 		}
 		range = 0;
 		while (range < pages) {
-			if (map->unmap_ops[offset+range].handle == -1) {
-				range--;
+			if (map->unmap_ops[offset+range].handle == -1)
 				break;
-			}
 			range++;
 		}
 		err = __unmap_grant_pages(map, offset, range);

From cf2acf66ad43abb39735568f55e1f85f9844e990 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Tue, 9 Jan 2018 12:10:22 +0000
Subject: [PATCH 756/808] xen/gntdev: Fix partial gntdev_mmap() cleanup

When cleaning up after a partially successful gntdev_mmap(), unmap the
successfully mapped grant pages otherwise Xen will kill the domain if
in debug mode (Attempt to implicitly unmap a granted PTE) or Linux will
kill the process and emit "BUG: Bad page map in process" if Xen is in
release mode.

This is only needed when use_ptemod is true because gntdev_put_map()
will unmap grant pages itself when use_ptemod is false.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/gntdev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index d3391a1e3796..bd56653b9bbc 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -1071,8 +1071,10 @@ unlock_out:
 out_unlock_put:
 	mutex_unlock(&priv->lock);
 out_put_map:
-	if (use_ptemod)
+	if (use_ptemod) {
 		map->vma = NULL;
+		unmap_grant_pages(map, 0, map->count);
+	}
 	gntdev_put_map(priv, map);
 	return err;
 }

From 0d9cac0ca0429830c40fe1a4e50e60f6221fd7b6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 10 Jan 2018 12:40:04 +0300
Subject: [PATCH 757/808] drm/vmwgfx: Potential off by one in vmw_view_add()

The vmw_view_cmd_to_type() function returns vmw_view_max (3) on error.
It's one element beyond the end of the vmw_view_cotables[] table.

My read on this is that it's possible to hit this failure.  header->id
comes from vmw_cmd_check() and it's a user controlled number between
1040 and 1225 so we can hit that error.  But I don't have the hardware
to test this code.

Fixes: d80efd5cb3de ("drm/vmwgfx: Initial DX support")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
Cc: <stable@vger.kernel.org>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 21c62a34e558..87e8af5776a3 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -2731,6 +2731,8 @@ static int vmw_cmd_dx_view_define(struct vmw_private *dev_priv,
 	}
 
 	view_type = vmw_view_cmd_to_type(header->id);
+	if (view_type == vmw_view_max)
+		return -EINVAL;
 	cmd = container_of(header, typeof(*cmd), header);
 	ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
 				user_surface_converter,

From 612e8e9350fd19cae6900cf36ea0c6892d1a0dca Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 10 Jan 2018 12:28:16 +0100
Subject: [PATCH 758/808] x86/alternatives: Fix optimize_nops() checking

The alternatives code checks only the first byte whether it is a NOP, but
with NOPs in front of the payload and having actual instructions after it
breaks the "optimized' test.

Make sure to scan all bytes before deciding to optimize the NOPs in there.

Reported-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic
---
 arch/x86/kernel/alternative.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d3382e91..e0b97e4d1db5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -344,9 +344,12 @@ done:
 static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 {
 	unsigned long flags;
+	int i;
 
-	if (instr[0] != 0x90)
-		return;
+	for (i = 0; i < a->padlen; i++) {
+		if (instr[i] != 0x90)
+			return;
+	}
 
 	local_irq_save(flags);
 	add_nops(instr + (a->instrlen - a->padlen), a->padlen);

From 2e83acb970684008baee471427270c029a76ddbd Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 8 Jan 2018 19:02:27 -0200
Subject: [PATCH 759/808] sctp: GFP_ATOMIC is not needed in
 sctp_setsockopt_events

So replace it with GFP_USER and also add __GFP_NOWARN.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b4fb6e4886d2..54c046783a89 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2277,7 +2277,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
 
 		if (asoc && sctp_outq_is_empty(&asoc->outqueue)) {
 			event = sctp_ulpevent_make_sender_dry_event(asoc,
-					GFP_ATOMIC);
+					GFP_USER | __GFP_NOWARN);
 			if (!event)
 				return -ENOMEM;
 

From 5960cefab9df76600a1a7d4ff592c59e14616e88 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 8 Jan 2018 19:02:28 -0200
Subject: [PATCH 760/808] sctp: add a ceiling to optlen in some sockopts

Hangbin Liu reported that some sockopt calls could cause the kernel to log
a warning on memory allocation failure if the user supplied a large optlen
value. That is because some of them called memdup_user() without a ceiling
on optlen, allowing it to try to allocate really large buffers.

This patch adds a ceiling by limiting optlen to the maximum allowed that
would still make sense for these sockopt.

Reported-by: Hangbin Liu <haliu@redhat.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 54c046783a89..022b94f11fd8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk,
 
 	if (optlen < sizeof(struct sctp_hmacalgo))
 		return -EINVAL;
+	optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) +
+					     SCTP_AUTH_NUM_HMACS * sizeof(u16));
 
 	hmacs = memdup_user(optval, optlen);
 	if (IS_ERR(hmacs))
@@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk,
 
 	if (optlen <= sizeof(struct sctp_authkey))
 		return -EINVAL;
+	/* authkey->sca_keylength is u16, so optlen can't be bigger than
+	 * this.
+	 */
+	optlen = min_t(unsigned int, optlen, USHRT_MAX +
+					     sizeof(struct sctp_authkey));
 
 	authkey = memdup_user(optval, optlen);
 	if (IS_ERR(authkey))
@@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
 
 	if (optlen < sizeof(*params))
 		return -EINVAL;
+	/* srs_number_streams is u16, so optlen can't be bigger than this. */
+	optlen = min_t(unsigned int, optlen, USHRT_MAX +
+					     sizeof(__u16) * sizeof(*params));
 
 	params = memdup_user(optval, optlen);
 	if (IS_ERR(params))

From c76f97c99ae6d26d14c7f0e50e074382bfbc9f98 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 8 Jan 2018 19:02:29 -0200
Subject: [PATCH 761/808] sctp: make use of pre-calculated len

Some sockopt handling functions were calculating the length of the
buffer to be written to userspace and then calculating it again when
actually writing the buffer, which could lead to some write not using
an up-to-date length.

This patch updates such places to just make use of the len variable.

Also, replace some sizeof(type) to sizeof(var).

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 022b94f11fd8..9b01e994f661 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5025,7 +5025,7 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv
 	len = sizeof(int);
 	if (put_user(len, optlen))
 		return -EFAULT;
-	if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int)))
+	if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len))
 		return -EFAULT;
 	return 0;
 }
@@ -5655,6 +5655,9 @@ copy_getaddrs:
 		err = -EFAULT;
 		goto out;
 	}
+	/* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too,
+	 * but we can't change it anymore.
+	 */
 	if (put_user(bytes_copied, optlen))
 		err = -EFAULT;
 out:
@@ -6091,7 +6094,7 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len,
 		params.assoc_id = 0;
 	} else if (len >= sizeof(struct sctp_assoc_value)) {
 		len = sizeof(struct sctp_assoc_value);
-		if (copy_from_user(&params, optval, sizeof(params)))
+		if (copy_from_user(&params, optval, len))
 			return -EFAULT;
 	} else
 		return -EINVAL;
@@ -6261,7 +6264,9 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len,
 
 	if (len < sizeof(struct sctp_authkeyid))
 		return -EINVAL;
-	if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid)))
+
+	len = sizeof(struct sctp_authkeyid);
+	if (copy_from_user(&val, optval, len))
 		return -EFAULT;
 
 	asoc = sctp_id2assoc(sk, val.scact_assoc_id);
@@ -6273,7 +6278,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len,
 	else
 		val.scact_keynumber = ep->active_key_id;
 
-	len = sizeof(struct sctp_authkeyid);
 	if (put_user(len, optlen))
 		return -EFAULT;
 	if (copy_to_user(optval, &val, len))
@@ -6299,7 +6303,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
 	if (len < sizeof(struct sctp_authchunks))
 		return -EINVAL;
 
-	if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks)))
+	if (copy_from_user(&val, optval, sizeof(val)))
 		return -EFAULT;
 
 	to = p->gauth_chunks;
@@ -6344,7 +6348,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
 	if (len < sizeof(struct sctp_authchunks))
 		return -EINVAL;
 
-	if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks)))
+	if (copy_from_user(&val, optval, sizeof(val)))
 		return -EFAULT;
 
 	to = p->gauth_chunks;

From 11d827a993a969c3c6ec56758ff63a44ba19b466 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Tue, 9 Jan 2018 11:02:33 +0800
Subject: [PATCH 762/808] net: gianfar_ptp: move set_fipers() to spinlock
 protecting area

set_fipers() calling should be protected by spinlock in
case that any interrupt breaks related registers setting
and the function we expect. This patch is to move set_fipers()
to spinlock protecting area in ptp_gianfar_adjtime().

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Reviewed-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/gianfar_ptp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c
index 544114281ea7..9f8d4f8e57e3 100644
--- a/drivers/net/ethernet/freescale/gianfar_ptp.c
+++ b/drivers/net/ethernet/freescale/gianfar_ptp.c
@@ -319,11 +319,10 @@ static int ptp_gianfar_adjtime(struct ptp_clock_info *ptp, s64 delta)
 	now = tmr_cnt_read(etsects);
 	now += delta;
 	tmr_cnt_write(etsects, now);
+	set_fipers(etsects);
 
 	spin_unlock_irqrestore(&etsects->lock, flags);
 
-	set_fipers(etsects);
-
 	return 0;
 }
 

From af60d61fa846725566f4a876ae04f891bdff1c7a Mon Sep 17 00:00:00 2001
From: Kornilios Kourtis <kou@zurich.ibm.com>
Date: Tue, 9 Jan 2018 09:52:22 +0100
Subject: [PATCH 763/808] doc: clarification about setting SO_ZEROCOPY

Signed-off-by: Kornilios Kourtis <kou@zurich.ibm.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/msg_zerocopy.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst
index 77f6d7e25cfd..291a01264967 100644
--- a/Documentation/networking/msg_zerocopy.rst
+++ b/Documentation/networking/msg_zerocopy.rst
@@ -72,6 +72,10 @@ this flag, a process must first signal intent by setting a socket option:
 	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
 		error(1, errno, "setsockopt zerocopy");
 
+Setting the socket option only works when the socket is in its initial
+(TCP_CLOSED) state.  Trying to set the option for a socket returned by accept(),
+for example, will lead to an EBUSY error. In this case, the option should be set
+to the listening socket and it will be inherited by the accepted sockets.
 
 Transmission
 ------------

From b0d55b5bc77755501be9de2c935d106ff8dba9ac Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Date: Tue, 9 Jan 2018 19:58:18 +0800
Subject: [PATCH 764/808] caif_usb: use strlcpy() instead of strncpy()

gcc-8 reports

net/caif/caif_usb.c: In function 'cfusbl_device_notify':
./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may
be truncated copying 15 bytes from a string of length 15
[-Wstringop-truncation]

The compiler require that the input param 'len' of strncpy() should be
greater than the length of the src string, so that '\0' is copied as
well. We can just use strlcpy() to avoid this warning.

Signed-off-by: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_usb.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index 5cd44f001f64..1a082a946045 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -176,9 +176,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
 		dev_add_pack(&caif_usb_type);
 	pack_added = true;
 
-	strncpy(layer->name, dev->name,
-			sizeof(layer->name) - 1);
-	layer->name[sizeof(layer->name) - 1] = 0;
+	strlcpy(layer->name, dev->name, sizeof(layer->name));
 
 	return 0;
 }

From 95f566de0269a0c59fd6a737a147731302136429 Mon Sep 17 00:00:00 2001
From: Madalin Bucur <madalin.bucur@nxp.com>
Date: Tue, 9 Jan 2018 14:43:34 +0200
Subject: [PATCH 765/808] of_mdio: avoid MDIO bus removal when a PHY is missing

If one of the child devices is missing the of_mdiobus_register_phy()
call will return -ENODEV. When a missing device is encountered the
registration of the remaining PHYs is stopped and the MDIO bus will
fail to register. Propagate all errors except ENODEV to avoid it.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/of/of_mdio.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 3481e69738b5..a327be1d264b 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -231,7 +231,12 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 			rc = of_mdiobus_register_phy(mdio, child, addr);
 		else
 			rc = of_mdiobus_register_device(mdio, child, addr);
-		if (rc)
+
+		if (rc == -ENODEV)
+			dev_err(&mdio->dev,
+				"MDIO device at address %d is missing.\n",
+				addr);
+		else if (rc)
 			goto unregister;
 	}
 
@@ -255,7 +260,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 
 			if (of_mdiobus_child_is_phy(child)) {
 				rc = of_mdiobus_register_phy(mdio, child, addr);
-				if (rc)
+				if (rc && rc != -ENODEV)
 					goto unregister;
 			}
 		}

From 78bbb15f2239bc8e663aa20bbe1987c91a0b75f6 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 9 Jan 2018 13:40:41 -0800
Subject: [PATCH 766/808] 8021q: fix a memory leak for VLAN 0 device

A vlan device with vid 0 is allow to creat by not able to be fully
cleaned up by unregister_vlan_dev() which checks for vlan_id!=0.

Also, VLAN 0 is probably not a valid number and it is kinda
"reserved" for HW accelerating devices, but it is probably too
late to reject it from creation even if makes sense. Instead,
just remove the check in unregister_vlan_dev().

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Fixes: ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)")
Cc: Vlad Yasevich <vyasevich@gmail.com>
Cc: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8dfdd94e430f..bad01b14a4ad 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -111,12 +111,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 		vlan_gvrp_uninit_applicant(real_dev);
 	}
 
-	/* Take it out of our own structures, but be sure to interlock with
-	 * HW accelerating devices or SW vlan input packet processing if
-	 * VLAN is not 0 (leave it there for 802.1p).
-	 */
-	if (vlan_id)
-		vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
+	vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
 
 	/* Get rid of the vlan's reference to real_dev */
 	dev_put(real_dev);

From fc2336505fb49a8b932a0a67a9745c408b79992c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 9 Jan 2018 18:14:28 -0800
Subject: [PATCH 767/808] nfp: always unmask aux interrupts at init

The link state and exception interrupts may be masked when we probe.
The firmware should in theory prevent sending (and automasking) those
interrupts if the device is disabled, but if my reading of the FW code
is correct there are firmwares out there with race conditions in this
area.  The interrupt may also be masked if previous driver which used
the device was malfunctioning and we didn't load the FW (there is no
other good way to comprehensively reset the PF).

Note that FW unmasks the data interrupts by itself when vNIC is
enabled, such helpful operation is not performed for LSC/EXN interrupts.

Always unmask the auxiliary interrupts after request_irq().  On the
remove path add missing PCI write flush before free_irq().

Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1a603fdd9e80..99b0487b6d82 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -568,6 +568,7 @@ nfp_net_aux_irq_request(struct nfp_net *nn, u32 ctrl_offset,
 		return err;
 	}
 	nn_writeb(nn, ctrl_offset, entry->entry);
+	nfp_net_irq_unmask(nn, entry->entry);
 
 	return 0;
 }
@@ -582,6 +583,7 @@ static void nfp_net_aux_irq_free(struct nfp_net *nn, u32 ctrl_offset,
 				 unsigned int vector_idx)
 {
 	nn_writeb(nn, ctrl_offset, 0xff);
+	nn_pci_flush(nn);
 	free_irq(nn->irq_entries[vector_idx].vector, nn);
 }
 

From 8e033a93b37f37aa9fca71a370a895155320af60 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 10 Jan 2018 11:42:43 +0100
Subject: [PATCH 768/808] mlxsw: pci: Wait after reset before accessing HW

After performing reset driver polls on HW indication until learning
that the reset is done, but immediately after reset the device becomes
unresponsive which might lead to completion timeout on the first read.

Wait for 100ms before starting the polling.

Fixes: 233fa44bd67a ("mlxsw: pci: Implement reset done check")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c    | 7 ++++++-
 drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 23f7d828cf67..6ef20e5cc77d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1643,7 +1643,12 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
 		return 0;
 	}
 
-	wmb(); /* reset needs to be written before we read control register */
+	/* Reset needs to be written before we read control register, and
+	 * we must wait for the HW to become responsive once again
+	 */
+	wmb();
+	msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS);
+
 	end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
 	do {
 		u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
index a6441208e9d9..fb082ad21b00 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
@@ -59,6 +59,7 @@
 #define MLXSW_PCI_SW_RESET			0xF0010
 #define MLXSW_PCI_SW_RESET_RST_BIT		BIT(0)
 #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS	5000
+#define MLXSW_PCI_SW_RESET_WAIT_MSECS		100
 #define MLXSW_PCI_FW_READY			0xA1844
 #define MLXSW_PCI_FW_READY_MASK			0xFFFF
 #define MLXSW_PCI_FW_READY_MAGIC		0x5E

From db84924c4fc3be1ef0c965d5ece5f6d785c77c5f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 10 Jan 2018 11:42:44 +0100
Subject: [PATCH 769/808] mlxsw: spectrum_qdisc: Don't use variable array in
 mlxsw_sp_tclass_congestion_enable

Resolve the sparse warning:
"sparse: Variable length array is used."
Use 2 arrays for 2 PRM register accesses.

Fixes: 96f17e0776c2 ("mlxsw: spectrum: Support RED qdisc offload")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
index c33beac5def0..b5397da94d7f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
@@ -46,7 +46,8 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port,
 				  int tclass_num, u32 min, u32 max,
 				  u32 probability, bool is_ecn)
 {
-	char cwtp_cmd[max_t(u8, MLXSW_REG_CWTP_LEN, MLXSW_REG_CWTPM_LEN)];
+	char cwtpm_cmd[MLXSW_REG_CWTPM_LEN];
+	char cwtp_cmd[MLXSW_REG_CWTP_LEN];
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	int err;
 
@@ -60,10 +61,10 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (err)
 		return err;
 
-	mlxsw_reg_cwtpm_pack(cwtp_cmd, mlxsw_sp_port->local_port, tclass_num,
+	mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num,
 			     MLXSW_REG_CWTP_DEFAULT_PROFILE, true, is_ecn);
 
-	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtp_cmd);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd);
 }
 
 static int

From 862c03ee1deb7e19e0f9931682e0294ecd1fcaf9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Jan 2018 03:45:49 -0800
Subject: [PATCH 770/808] ipv6: fix possible mem leaks in ipv6_make_skb()

ip6_setup_cork() might return an error, while memory allocations have
been done and must be rolled back.

Fixes: 6422398c2ab0 ("ipv6: introduce ipv6_make_skb")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Reported-by: Mike Maloney <maloney@google.com>
Acked-by:  Mike Maloney <maloney@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f7dd51c42314..688ba5f7516b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1735,9 +1735,10 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 	cork.base.opt = NULL;
 	v6_cork.opt = NULL;
 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
-	if (err)
+	if (err) {
+		ip6_cork_release(&cork, &v6_cork);
 		return ERR_PTR(err);
-
+	}
 	if (ipc6->dontfrag < 0)
 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
 

From ccc12b11c5332c84442ef120dcd631523be75089 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Wed, 10 Jan 2018 13:35:49 +0000
Subject: [PATCH 771/808] ipv6: sr: fix TLVs not being copied using setsockopt

Function ipv6_push_rthdr4 allows to add an IPv6 Segment Routing Header
to a socket through setsockopt, but the current implementation doesn't
copy possible TLVs at the end of the SRH received from userspace.

Therefore, the execution of the following branch if (sr_has_hmac(sr_phdr))
{ ... } will never complete since the len and type fields of a possible
HMAC TLV are not copied, hence seg6_get_tlv_hmac will return an error,
and the HMAC will not be computed.

This commit adds a memcpy in case TLVs have been appended to the SRH.

Fixes: a149e7c7ce81 ("ipv6: sr: add support for SRH injection through setsockopt")
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/exthdrs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 83bd75713535..bc68eb661970 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
 	sr_phdr->segments[0] = **addr_p;
 	*addr_p = &sr_ihdr->segments[sr_ihdr->segments_left];
 
+	if (sr_ihdr->hdrlen > hops * 2) {
+		int tlvs_offset, tlvs_length;
+
+		tlvs_offset = (1 + hops * 2) << 3;
+		tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3;
+		memcpy((char *)sr_phdr + tlvs_offset,
+		       (char *)sr_ihdr + tlvs_offset, tlvs_length);
+	}
+
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	if (sr_has_hmac(sr_phdr)) {
 		struct net *net = NULL;

From ce4bb04cae8924792ed92f4af2793b77fc986f0e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Jan 2018 18:47:05 -0500
Subject: [PATCH 772/808] Fix a leak in socket(2) when we fail to allocate a
 file descriptor.

Got broken by "make sock_alloc_file() do sock_release() on failures" -
cleanup after sock_map_fd() failure got pulled all the way into
sock_alloc_file(), but it used to serve the case when sock_map_fd()
failed *before* getting to sock_alloc_file() as well, and that got
lost.  Trivial to fix, fortunately.

Fixes: 8e1611e23579 (make sock_alloc_file() do sock_release() on failures)
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/socket.c b/net/socket.c
index 42d8e9c9ccd5..82433a2200ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -432,8 +432,10 @@ static int sock_map_fd(struct socket *sock, int flags)
 {
 	struct file *newfile;
 	int fd = get_unused_fd_flags(flags);
-	if (unlikely(fd < 0))
+	if (unlikely(fd < 0)) {
+		sock_release(sock);
 		return fd;
+	}
 
 	newfile = sock_alloc_file(sock, flags, NULL);
 	if (likely(!IS_ERR(newfile))) {

From 4636bda86aa1f34f45c629477476a0dcfa04e597 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 5 Jan 2018 00:59:05 -0800
Subject: [PATCH 773/808] drm/i915: Whitelist SLICE_COMMON_ECO_CHICKEN1 on
 Geminilake.

Geminilake requires the 3D driver to select whether barriers are
intended for compute shaders, or tessellation control shaders, by
whacking a "Barrier Mode" bit in SLICE_COMMON_ECO_CHICKEN1 when
switching pipelines.  Failure to do this properly can result in GPU
hangs.

Unfortunately, this means it needs to switch mid-batch, so only
userspace can properly set it.  To facilitate this, the kernel needs
to whitelist the register.

The workarounds page currently tags this as applying to Broxton only,
but that doesn't make sense.  The documentation for the register it
references says the bit userspace is supposed to toggle only exists on
Geminilake.  Empirically, the Mesa patch to toggle this bit appears to
fix intermittent GPU hangs in tessellation control shader barrier tests
on Geminilake; we haven't seen those hangs on Broxton.

v2: Mention WA #0862 in the comment (it doesn't have a name).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180105085905.9298-1-kenneth@whitecape.org
(cherry picked from commit ab062639edb0412daf6de540725276b9a5d217f9)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h        | 2 ++
 drivers/gpu/drm/i915/intel_engine_cs.c | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 333f40bc03bb..7923dfd9963c 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7027,6 +7027,8 @@ enum {
 #define GEN9_SLICE_COMMON_ECO_CHICKEN0		_MMIO(0x7308)
 #define  DISABLE_PIXEL_MASK_CAMMING		(1<<14)
 
+#define GEN9_SLICE_COMMON_ECO_CHICKEN1		_MMIO(0x731c)
+
 #define GEN7_L3SQCREG1				_MMIO(0xB010)
 #define  VLV_B0_WA_L3SQCREG1_VALUE		0x00D30000
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index ab5bf4e2e28e..6074e04dc99f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1390,6 +1390,11 @@ static int glk_init_workarounds(struct intel_engine_cs *engine)
 	if (ret)
 		return ret;
 
+	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
+	ret = wa_ring_whitelist_reg(engine, GEN9_SLICE_COMMON_ECO_CHICKEN1);
+	if (ret)
+		return ret;
+
 	/* WaToEnableHwFixForPushConstHWBug:glk */
 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

From 5005c8514285ae4f28e862f8d91faaa2015e03a3 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 6 Jan 2018 10:56:18 +0000
Subject: [PATCH 774/808] drm/i915: Don't adjust priority on an already
 signaled fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we retire a signaled fence, we free the dependency tree. However,
we skip clearing the list so that if we then try to adjust the priority
of the signaled fence, we may walk the list of freed dependencies.

[ 3083.156757] ==================================================================
[ 3083.156806] BUG: KASAN: use-after-free in execlists_schedule+0x199/0x660 [i915]
[ 3083.156810] Read of size 8 at addr ffff8806bf20f400 by task Xorg/831

[ 3083.156815] CPU: 0 PID: 831 Comm: Xorg Not tainted 4.15.0-rc6-no-psn+ #1
[ 3083.156817] Hardware name: Notebook                         N24_25BU/N24_25BU, BIOS 5.12 02/17/2017
[ 3083.156818] Call Trace:
[ 3083.156823]  dump_stack+0x5c/0x7a
[ 3083.156827]  print_address_description+0x6b/0x290
[ 3083.156830]  kasan_report+0x28f/0x380
[ 3083.156872]  ? execlists_schedule+0x199/0x660 [i915]
[ 3083.156914]  execlists_schedule+0x199/0x660 [i915]
[ 3083.156956]  ? intel_crtc_atomic_check+0x146/0x4e0 [i915]
[ 3083.156997]  ? execlists_submit_request+0xe0/0xe0 [i915]
[ 3083.157038]  ? i915_vma_misplaced.part.4+0x25/0xb0 [i915]
[ 3083.157079]  ? __i915_vma_do_pin+0x7c8/0xc80 [i915]
[ 3083.157121]  ? intel_atomic_state_alloc+0x44/0x60 [i915]
[ 3083.157130]  ? drm_atomic_helper_page_flip+0x3e/0xb0 [drm_kms_helper]
[ 3083.157145]  ? drm_mode_page_flip_ioctl+0x7d2/0x850 [drm]
[ 3083.157159]  ? drm_ioctl_kernel+0xa7/0xf0 [drm]
[ 3083.157172]  ? drm_ioctl+0x45b/0x560 [drm]
[ 3083.157211]  i915_gem_object_wait_priority+0x14c/0x2c0 [i915]
[ 3083.157251]  ? i915_gem_get_aperture_ioctl+0x150/0x150 [i915]
[ 3083.157290]  ? i915_vma_pin_fence+0x1d8/0x320 [i915]
[ 3083.157331]  ? intel_pin_and_fence_fb_obj+0x175/0x250 [i915]
[ 3083.157372]  ? intel_rotation_info_size+0x60/0x60 [i915]
[ 3083.157413]  ? intel_link_compute_m_n+0x80/0x80 [i915]
[ 3083.157428]  ? drm_dev_printk+0x1b0/0x1b0 [drm]
[ 3083.157443]  ? drm_dev_printk+0x1b0/0x1b0 [drm]
[ 3083.157485]  intel_prepare_plane_fb+0x2f8/0x5a0 [i915]
[ 3083.157527]  ? intel_crtc_get_vblank_counter+0x80/0x80 [i915]
[ 3083.157536]  drm_atomic_helper_prepare_planes+0xa0/0x1c0 [drm_kms_helper]
[ 3083.157587]  intel_atomic_commit+0x12e/0x4e0 [i915]
[ 3083.157605]  drm_atomic_helper_page_flip+0xa2/0xb0 [drm_kms_helper]
[ 3083.157621]  drm_mode_page_flip_ioctl+0x7d2/0x850 [drm]
[ 3083.157638]  ? drm_mode_cursor2_ioctl+0x10/0x10 [drm]
[ 3083.157652]  ? drm_lease_owner+0x1a/0x30 [drm]
[ 3083.157668]  ? drm_mode_cursor2_ioctl+0x10/0x10 [drm]
[ 3083.157681]  drm_ioctl_kernel+0xa7/0xf0 [drm]
[ 3083.157696]  drm_ioctl+0x45b/0x560 [drm]
[ 3083.157711]  ? drm_mode_cursor2_ioctl+0x10/0x10 [drm]
[ 3083.157725]  ? drm_getstats+0x20/0x20 [drm]
[ 3083.157729]  ? timerqueue_del+0x49/0x80
[ 3083.157732]  ? __remove_hrtimer+0x62/0xb0
[ 3083.157735]  ? hrtimer_try_to_cancel+0x173/0x210
[ 3083.157738]  do_vfs_ioctl+0x13b/0x880
[ 3083.157741]  ? ioctl_preallocate+0x140/0x140
[ 3083.157744]  ? _raw_spin_unlock_irq+0xe/0x30
[ 3083.157746]  ? do_setitimer+0x234/0x370
[ 3083.157750]  ? SyS_setitimer+0x19e/0x1b0
[ 3083.157752]  ? SyS_alarm+0x140/0x140
[ 3083.157755]  ? __rcu_read_unlock+0x66/0x80
[ 3083.157757]  ? __fget+0xc4/0x100
[ 3083.157760]  SyS_ioctl+0x74/0x80
[ 3083.157763]  entry_SYSCALL_64_fastpath+0x1a/0x7d
[ 3083.157765] RIP: 0033:0x7f6135d0c6a7
[ 3083.157767] RSP: 002b:00007fff01451888 EFLAGS: 00003246 ORIG_RAX: 0000000000000010
[ 3083.157769] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f6135d0c6a7
[ 3083.157771] RDX: 00007fff01451950 RSI: 00000000c01864b0 RDI: 000000000000000c
[ 3083.157772] RBP: 00007f613076f600 R08: 0000000000000001 R09: 0000000000000000
[ 3083.157773] R10: 0000000000000060 R11: 0000000000003246 R12: 0000000000000000
[ 3083.157774] R13: 0000000000000060 R14: 000000000000001b R15: 0000000000000060

[ 3083.157779] Allocated by task 831:
[ 3083.157783]  kmem_cache_alloc+0xc0/0x200
[ 3083.157822]  i915_gem_request_await_dma_fence+0x2c4/0x5d0 [i915]
[ 3083.157861]  i915_gem_request_await_object+0x321/0x370 [i915]
[ 3083.157900]  i915_gem_do_execbuffer+0x1165/0x19c0 [i915]
[ 3083.157937]  i915_gem_execbuffer2+0x1ad/0x550 [i915]
[ 3083.157950]  drm_ioctl_kernel+0xa7/0xf0 [drm]
[ 3083.157962]  drm_ioctl+0x45b/0x560 [drm]
[ 3083.157964]  do_vfs_ioctl+0x13b/0x880
[ 3083.157966]  SyS_ioctl+0x74/0x80
[ 3083.157968]  entry_SYSCALL_64_fastpath+0x1a/0x7d

[ 3083.157971] Freed by task 831:
[ 3083.157973]  kmem_cache_free+0x77/0x220
[ 3083.158012]  i915_gem_request_retire+0x72c/0xa70 [i915]
[ 3083.158051]  i915_gem_request_alloc+0x1e9/0x8b0 [i915]
[ 3083.158089]  i915_gem_do_execbuffer+0xa96/0x19c0 [i915]
[ 3083.158127]  i915_gem_execbuffer2+0x1ad/0x550 [i915]
[ 3083.158140]  drm_ioctl_kernel+0xa7/0xf0 [drm]
[ 3083.158153]  drm_ioctl+0x45b/0x560 [drm]
[ 3083.158155]  do_vfs_ioctl+0x13b/0x880
[ 3083.158156]  SyS_ioctl+0x74/0x80
[ 3083.158158]  entry_SYSCALL_64_fastpath+0x1a/0x7d

[ 3083.158162] The buggy address belongs to the object at ffff8806bf20f400
                which belongs to the cache i915_dependency of size 64
[ 3083.158166] The buggy address is located 0 bytes inside of
                64-byte region [ffff8806bf20f400, ffff8806bf20f440)
[ 3083.158168] The buggy address belongs to the page:
[ 3083.158171] page:00000000d43decc4 count:1 mapcount:0 mapping:          (null) index:0x0
[ 3083.158174] flags: 0x17ffe0000000100(slab)
[ 3083.158179] raw: 017ffe0000000100 0000000000000000 0000000000000000 0000000180200020
[ 3083.158182] raw: ffffea001afc16c0 0000000500000005 ffff880731b881c0 0000000000000000
[ 3083.158184] page dumped because: kasan: bad access detected

[ 3083.158187] Memory state around the buggy address:
[ 3083.158190]  ffff8806bf20f300: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
[ 3083.158192]  ffff8806bf20f380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
[ 3083.158195] >ffff8806bf20f400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
[ 3083.158196]                    ^
[ 3083.158199]  ffff8806bf20f480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
[ 3083.158201]  ffff8806bf20f500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
[ 3083.158203] ==================================================================

Reported-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Reported-by: Mike Keehan <mike@keehan.net>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104436
Fixes: 1f181225f8ec ("drm/i915/execlists: Keep request->priority for its lifetime")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Alexandru Chirvasitu <achirvasub@gmail.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Tested-by: Alexandru Chirvasitu <achirvasub@gmail.com>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180106105618.13532-1-chris@chris-wilson.co.uk
(cherry picked from commit c218ee03b9315073ce43992792554dafa0626eb8)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c  | 2 +-
 drivers/gpu/drm/i915/intel_lrc.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 18de6569d04a..5cfba89ed586 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -467,7 +467,7 @@ static void __fence_set_priority(struct dma_fence *fence, int prio)
 	struct drm_i915_gem_request *rq;
 	struct intel_engine_cs *engine;
 
-	if (!dma_fence_is_i915(fence))
+	if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 		return;
 
 	rq = to_request(fence);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d36e25607435..e71a8cd50498 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -974,6 +974,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 
 	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
 
+	if (i915_gem_request_completed(request))
+		return;
+
 	if (prio <= READ_ONCE(request->priotree.priority))
 		return;
 

From 2a266f23550be997d783f27e704b9b40c4010292 Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Wed, 10 Jan 2018 21:44:42 +0800
Subject: [PATCH 775/808] KVM MMU: check pending exception before injecting APF

For example, when two APF's for page ready happen after one exit and
the first one becomes pending, the second one will result in #DF.
Instead, just handle the second page fault synchronously.

Reported-by: Ross Zwisler <zwisler@gmail.com>
Message-ID: <CAOxpaSUBf8QoOZQ1p4KfUp0jq76OKfGY4Uxs-Gg8ngReD99xww@mail.gmail.com>
Reported-by: Alec Blayne <ab@tevsa.net>
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c4deb1f34faa..e577bacd4bd0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
-		     kvm_event_needs_reinjection(vcpu)))
+		     kvm_event_needs_reinjection(vcpu) ||
+		     vcpu->arch.exception.pending))
 		return false;
 
 	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))

From ab271bd4dfd568060ffcf5a21b667c7c5df7ab99 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Jan 2018 17:26:59 +0100
Subject: [PATCH 776/808] x86: kvm: propagate register_shrinker return code

Patch "mm,vmscan: mark register_shrinker() as __must_check" is
queued for 4.16 in linux-mm and adds a warning about the unchecked
call to register_shrinker:

arch/x86/kvm/mmu.c:5485:2: warning: ignoring return value of 'register_shrinker', declared with attribute warn_unused_result [-Wunused-result]

This changes the kvm_mmu_module_init() function to fail itself
when the call to register_shrinker fails.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e577bacd4bd0..2b8eb4da4d08 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5466,30 +5466,34 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	int ret = -ENOMEM;
+
 	kvm_mmu_clear_all_pte_masks();
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, SLAB_ACCOUNT, NULL);
 	if (!pte_list_desc_cache)
-		goto nomem;
+		goto out;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 						  sizeof(struct kvm_mmu_page),
 						  0, SLAB_ACCOUNT, NULL);
 	if (!mmu_page_header_cache)
-		goto nomem;
+		goto out;
 
 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-		goto nomem;
+		goto out;
 
-	register_shrinker(&mmu_shrinker);
+	ret = register_shrinker(&mmu_shrinker);
+	if (ret)
+		goto out;
 
 	return 0;
 
-nomem:
+out:
 	mmu_destroy_caches();
-	return -ENOMEM;
+	return ret;
 }
 
 /*

From bd89525a823ce6edddcedbe9aed79faa1b9cf544 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 11 Jan 2018 16:55:24 +0100
Subject: [PATCH 777/808] KVM: x86: emulate #UD while in guest mode

This reverts commits ae1f57670703656cc9f293722c3b8b6782f8ab3f
and ac9b305caa0df6f5b75d294e4b86c1027648991e.

If the hardware doesn't support MOVBE, but L0 sets CPUID.01H:ECX.MOVBE
in L1's emulated CPUID information, then L1 is likely to pass that
CPUID bit through to L2. L2 will expect MOVBE to work, but if L1
doesn't intercept #UD, then any MOVBE instruction executed in L2 will
raise #UD, and the exception will be delivered in L2.

Commit ac9b305caa0df6f5b75d294e4b86c1027648991e is a better and more
complete version of ae1f57670703 ("KVM: nVMX: Do not emulate #UD while
in guest mode"); however, neither considers the above case.

Suggested-by: Jim Mattson <jmattson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 9 +--------
 arch/x86/kvm/vmx.c | 5 +----
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bb31c801f1fc..3158dac87f82 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -361,7 +361,6 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *c, *h;
 	struct nested_state *g;
-	u32 h_intercept_exceptions;
 
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 
@@ -372,14 +371,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	h = &svm->nested.hsave->control;
 	g = &svm->nested;
 
-	/* No need to intercept #UD if L1 doesn't intercept it */
-	h_intercept_exceptions =
-		h->intercept_exceptions & ~(1U << UD_VECTOR);
-
 	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 	c->intercept_dr = h->intercept_dr | g->intercept_dr;
-	c->intercept_exceptions =
-		h_intercept_exceptions | g->intercept_exceptions;
+	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 	c->intercept = h->intercept | g->intercept;
 }
 
@@ -2202,7 +2196,6 @@ static int ud_interception(struct vcpu_svm *svm)
 {
 	int er;
 
-	WARN_ON_ONCE(is_guest_mode(&svm->vcpu));
 	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
 	if (er == EMULATE_USER_EXIT)
 		return 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5c14d65f676a..427fd3200dd8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1887,7 +1887,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	u32 eb;
 
-	eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) |
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 	if ((vcpu->guest_debug &
 	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
@@ -1905,8 +1905,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	 */
 	if (is_guest_mode(vcpu))
 		eb |= get_vmcs12(vcpu)->exception_bitmap;
-	else
-		eb |= 1u << UD_VECTOR;
 
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -5917,7 +5915,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_invalid_opcode(intr_info)) {
-		WARN_ON_ONCE(is_guest_mode(vcpu));
 		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er == EMULATE_USER_EXIT)
 			return 0;

From 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 Mon Sep 17 00:00:00 2001
From: Andrew Honig <ahonig@google.com>
Date: Wed, 10 Jan 2018 10:12:03 -0800
Subject: [PATCH 778/808] KVM: x86: Add memory barrier on vmcs field lookup

This adds a memory barrier when performing a lookup into
the vmcs_field_to_offset_table.  This is related to
CVE-2017-5753.

Signed-off-by: Andrew Honig <ahonig@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..7f8fcc5ce664 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -884,8 +884,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
 
-	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-	    vmcs_field_to_offset_table[field] == 0)
+	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+		return -ENOENT;
+
+	/*
+	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
+	 * generic mechanism.
+	 */
+	asm("lfence");
+
+	if (vmcs_field_to_offset_table[field] == 0)
 		return -ENOENT;
 
 	return vmcs_field_to_offset_table[field];

From f32ab7547161b9fa7ebfbc4f18ea1eb3fd49fe25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FChristian=3D20K=3DC3=3DB6nig=3F=3D?=
 <ckoenig.leichtzumerken@gmail.com>
Date: Thu, 11 Jan 2018 14:23:29 +0100
Subject: [PATCH 779/808] x86/PCI: Add "pci=big_root_window" option for AMD
 64-bit windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only try to enable a 64-bit window on AMD CPUs when "pci=big_root_window"
is specified.

This taints the kernel because the new 64-bit window uses address space we
don't know anything about, and it may contain unreported devices or memory
that would conflict with the window.

The pci_amd_enable_64bit_bar() quirk that enables the window is specific to
AMD CPUs.  The generic solution would be to have the firmware enable the
window and describe it in the host bridge's _CRS method, or at least
describe it in the _PRS method so the OS would have the option of enabling
it.

Signed-off-by: Christian König <christian.koenig@amd.com>
[bhelgaas: changelog, extend doc, mention taint in dmesg]
Signed-off-by: Bjorn Helgaas <helgaas@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 6 ++++++
 arch/x86/include/asm/pci_x86.h                  | 1 +
 arch/x86/pci/common.c                           | 5 +++++
 arch/x86/pci/fixup.c                            | 7 ++++++-
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..619638362416 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3094,6 +3094,12 @@
 		pcie_scan_all	Scan all possible PCIe devices.  Otherwise we
 				only look for one device below a PCIe downstream
 				port.
+		big_root_window	Try to add a big 64bit memory window to the PCIe
+				root complex on AMD CPUs. Some GFX hardware
+				can resize a BAR to allow access to all VRAM.
+				Adding the window is slightly risky (it may
+				conflict with unreported devices), so this
+				taints the kernel.
 
 	pcie_aspm=	[PCIE] Forcibly enable or disable PCIe Active State Power
 			Management.
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 7a5d6695abd3..eb66fa9cd0fc 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -38,6 +38,7 @@ do {						\
 #define PCI_NOASSIGN_ROMS	0x80000
 #define PCI_ROOT_NO_CRS		0x100000
 #define PCI_NOASSIGN_BARS	0x200000
+#define PCI_BIG_ROOT_WINDOW	0x400000
 
 extern unsigned int pci_probe;
 extern unsigned long pirq_table_addr;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7a5350d08cef..563049c483a1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str)
 	} else if (!strcmp(str, "nocrs")) {
 		pci_probe |= PCI_ROOT_NO_CRS;
 		return NULL;
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	} else if (!strcmp(str, "big_root_window")) {
+		pci_probe |= PCI_BIG_ROOT_WINDOW;
+		return NULL;
+#endif
 	} else if (!strcmp(str, "earlydump")) {
 		pci_early_dump_regs = 1;
 		return NULL;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index e663d6bf1328..8bad19c7473d 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -667,6 +667,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 	struct resource *res, *conflict;
 	struct pci_dev *other;
 
+	if (!(pci_probe & PCI_BIG_ROOT_WINDOW))
+		return;
+
 	/* Check that we are the only device of that type */
 	other = pci_get_device(dev->vendor, dev->device, NULL);
 	if (other != dev ||
@@ -714,7 +717,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 		res->start = conflict->end + 1;
 	}
 
-	dev_info(&dev->dev, "adding root bus resource %pR\n", res);
+	dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n",
+		 res);
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 
 	base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
 		AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK;

From b8626f1dc29d3eee444bfaa92146ec7b291ef41c Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Thu, 11 Jan 2018 14:47:40 +0100
Subject: [PATCH 780/808] usb: misc: usb3503: make sure reset is low for at
 least 100us

When using a GPIO which is high by default, and initialize the
driver in USB Hub mode, initialization fails with:
  [  111.757794] usb3503 0-0008: SP_ILOCK failed (-5)

The reason seems to be that the chip is not properly reset.
Probe does initialize reset low, however some lines later the
code already set it back high, which is not long enouth.

Make sure reset is asserted for at least 100us by inserting a
delay after initializing the reset pin during probe.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/misc/usb3503.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/usb/misc/usb3503.c b/drivers/usb/misc/usb3503.c
index 465dbf68b463..f723f7b8c9ac 100644
--- a/drivers/usb/misc/usb3503.c
+++ b/drivers/usb/misc/usb3503.c
@@ -279,6 +279,8 @@ static int usb3503_probe(struct usb3503 *hub)
 	if (gpio_is_valid(hub->gpio_reset)) {
 		err = devm_gpio_request_one(dev, hub->gpio_reset,
 				GPIOF_OUT_INIT_LOW, "usb3503 reset");
+		/* Datasheet defines a hardware reset to be at least 100us */
+		usleep_range(100, 10000);
 		if (err) {
 			dev_err(dev,
 				"unable to request GPIO %d as reset pin (%d)\n",

From 1a2e91e795def04e15fac87b8e16b635691d0b82 Mon Sep 17 00:00:00 2001
From: Bin Liu <b-liu@ti.com>
Date: Tue, 9 Jan 2018 13:27:17 -0600
Subject: [PATCH 781/808] Documentation: usb: fix typo in UVC gadgetfs config
 command

This seems to be a copy&paste error. With the fix the uvc gadget now can
be created by following the instrucitons.

Signed-off-by: Bin Liu <b-liu@ti.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/usb/gadget-testing.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt
index 441a4b9b666f..5908a21fddb6 100644
--- a/Documentation/usb/gadget-testing.txt
+++ b/Documentation/usb/gadget-testing.txt
@@ -693,7 +693,7 @@ such specification consists of a number of lines with an inverval value
 in each line. The rules stated above are best illustrated with an example:
 
 # mkdir functions/uvc.usb0/control/header/h
-# cd functions/uvc.usb0/control/header/h
+# cd functions/uvc.usb0/control/
 # ln -s header/h class/fs
 # ln -s header/h class/ss
 # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p

From 03a551734cfc2b93f83950a595974e3c9cbd82fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FChristian=3D20K=3DC3=3DB6nig=3F=3D?=
 <ckoenig.leichtzumerken@gmail.com>
Date: Thu, 11 Jan 2018 14:23:30 +0100
Subject: [PATCH 782/808] x86/PCI: Move and shrink AMD 64-bit window to avoid
 conflict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid problems with BIOS implementations which don't report all used
resources to the OS by only allocating a 256GB window directly below the
hardware limit (from the BKDG, sec 2.4.6).

Fixes a silent reboot loop reported by Aaro Koskinen <aaro.koskinen@iki.fi>
on an AMD-based MSI MS-7699/760GA-P43(FX) system.  This was apparently
caused by RAM or other unreported hardware that conflicted with the new
window.

Link: https://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf
Link: https://lkml.kernel.org/r/20180105220412.fzpwqe4zljdawr36@darkstar.musicnaut.iki.fi
Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)")
Reported-by: Aaro Koskinen <aaro.koskinen@iki.fi>
Signed-off-by: Christian König <christian.koenig@amd.com>
[bhelgaas: changelog, comment, Fixes:]
Signed-off-by: Bjorn Helgaas <helgaas@kernel.org>
---
 arch/x86/pci/fixup.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 8bad19c7473d..f6a26e3cb476 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -662,10 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
  */
 static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 {
-	unsigned i;
 	u32 base, limit, high;
-	struct resource *res, *conflict;
 	struct pci_dev *other;
+	struct resource *res;
+	unsigned i;
+	int r;
 
 	if (!(pci_probe & PCI_BIG_ROOT_WINDOW))
 		return;
@@ -702,19 +703,20 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 	if (!res)
 		return;
 
+	/*
+	 * Allocate a 256GB window directly below the 0xfd00000000 hardware
+	 * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6).
+	 */
 	res->name = "PCI Bus 0000:00";
 	res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |
 		IORESOURCE_MEM_64 | IORESOURCE_WINDOW;
-	res->start = 0x100000000ull;
+	res->start = 0xbd00000000ull;
 	res->end = 0xfd00000000ull - 1;
 
-	/* Just grab the free area behind system memory for this */
-	while ((conflict = request_resource_conflict(&iomem_resource, res))) {
-		if (conflict->end >= res->end) {
-			kfree(res);
-			return;
-		}
-		res->start = conflict->end + 1;
+	r = request_resource(&iomem_resource, res);
+	if (r) {
+		kfree(res);
+		return;
 	}
 
 	dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n",

From 445b69e3b75e42362a5bdc13c8b8f61599e2228a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 10 Jan 2018 14:49:39 -0800
Subject: [PATCH 783/808] x86/pti: Make unpoison of pgd for trusted boot work
 for real

The inital fix for trusted boot and PTI potentially misses the pgd clearing
if pud_alloc() sets a PGD.  It probably works in *practice* because for two
adjacent calls to map_tboot_page() that share a PGD entry, the first will
clear NX, *then* allocate and set the PGD (without NX clear).  The second
call will *not* allocate but will clear the NX bit.

Defer the NX clearing to a point after it is known that all top-level
allocations have occurred.  Add a comment to clarify why.

[ tglx: Massaged changelog ]

Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled")
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: "Tim Chen" <tim.c.chen@linux.intel.com>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: peterz@infradead.org
Cc: ning.sun@intel.com
Cc: tboot-devel@lists.sourceforge.net
Cc: andi@firstfloor.org
Cc: luto@kernel.org
Cc: law@redhat.com
Cc: pbonzini@redhat.com
Cc: torvalds@linux-foundation.org
Cc: gregkh@linux-foundation.org
Cc: dwmw@amazon.co.uk
Cc: nickc@redhat.com
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com
---
 arch/x86/kernel/tboot.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 75869a4b6c41..a2486f444073 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
 	if (!p4d)
 		return -1;
-	pgd->pgd &= ~_PAGE_NX;
 	pud = pud_alloc(&tboot_mm, p4d, vaddr);
 	if (!pud)
 		return -1;
@@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 		return -1;
 	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
 	pte_unmap(pte);
+
+	/*
+	 * PTI poisons low addresses in the kernel page tables in the
+	 * name of making them unusable for userspace.  To execute
+	 * code at such a low address, the poison must be cleared.
+	 *
+	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
+	 * pud_alloc() depending on 4/5-level paging.
+	 */
+	pgd->pgd &= ~_PAGE_NX;
+
 	return 0;
 }
 

From 39b735332cb8b33a27c28592d969e4016c86c3ea Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 11 Jan 2018 21:46:23 +0000
Subject: [PATCH 784/808] objtool: Detect jumps to retpoline thunks

A direct jump to a retpoline thunk is really an indirect jump in
disguise.  Change the objtool instruction type accordingly.

Objtool needs to know where indirect branches are so it can detect
switch statement jump tables.

This fixes a bunch of warnings with CONFIG_RETPOLINE like:

  arch/x86/events/intel/uncore_nhmex.o: warning: objtool: nhmex_rbox_msr_enable_event()+0x44: sibling call from callable instruction with modified stack frame
  kernel/signal.o: warning: objtool: copy_siginfo_to_user()+0x91: sibling call from callable instruction with modified stack frame
  ...

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-2-git-send-email-dwmw@amazon.co.uk
---
 tools/objtool/check.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9b341584eb1b..de053fb7049b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -456,6 +456,13 @@ static int add_jump_destinations(struct objtool_file *file)
 		} else if (rela->sym->sec->idx) {
 			dest_sec = rela->sym->sec;
 			dest_off = rela->sym->sym.st_value + rela->addend + 4;
+		} else if (strstr(rela->sym->name, "_indirect_thunk_")) {
+			/*
+			 * Retpoline jumps are really dynamic jumps in
+			 * disguise, so convert them accordingly.
+			 */
+			insn->type = INSN_JUMP_DYNAMIC;
+			continue;
 		} else {
 			/* sibling call */
 			insn->jump_dest = 0;

From 258c76059cece01bebae098e81bacb1af2edad17 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 11 Jan 2018 21:46:24 +0000
Subject: [PATCH 785/808] objtool: Allow alternatives to be ignored

Getting objtool to understand retpolines is going to be a bit of a
challenge.  For now, take advantage of the fact that retpolines are
patched in with alternatives.  Just read the original (sane)
non-alternative instruction, and ignore the patched-in retpoline.

This allows objtool to understand the control flow *around* the
retpoline, even if it can't yet follow what's inside.  This means the
ORC unwinder will fail to unwind from inside a retpoline, but will work
fine otherwise.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-3-git-send-email-dwmw@amazon.co.uk
---
 tools/objtool/check.c | 62 ++++++++++++++++++++++++++++++++++++++-----
 tools/objtool/check.h |  2 +-
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index de053fb7049b..f40d46e24bcc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -427,6 +427,40 @@ static void add_ignores(struct objtool_file *file)
 	}
 }
 
+/*
+ * FIXME: For now, just ignore any alternatives which add retpolines.  This is
+ * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline.
+ * But it at least allows objtool to understand the control flow *around* the
+ * retpoline.
+ */
+static int add_nospec_ignores(struct objtool_file *file)
+{
+	struct section *sec;
+	struct rela *rela;
+	struct instruction *insn;
+
+	sec = find_section_by_name(file->elf, ".rela.discard.nospec");
+	if (!sec)
+		return 0;
+
+	list_for_each_entry(rela, &sec->rela_list, list) {
+		if (rela->sym->type != STT_SECTION) {
+			WARN("unexpected relocation symbol type in %s", sec->name);
+			return -1;
+		}
+
+		insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (!insn) {
+			WARN("bad .discard.nospec entry");
+			return -1;
+		}
+
+		insn->ignore_alts = true;
+	}
+
+	return 0;
+}
+
 /*
  * Find the destination instructions for all jumps.
  */
@@ -509,11 +543,18 @@ static int add_call_destinations(struct objtool_file *file)
 			dest_off = insn->offset + insn->len + insn->immediate;
 			insn->call_dest = find_symbol_by_offset(insn->sec,
 								dest_off);
+			/*
+			 * FIXME: Thanks to retpolines, it's now considered
+			 * normal for a function to call within itself.  So
+			 * disable this warning for now.
+			 */
+#if 0
 			if (!insn->call_dest) {
 				WARN_FUNC("can't find call dest symbol at offset 0x%lx",
 					  insn->sec, insn->offset, dest_off);
 				return -1;
 			}
+#endif
 		} else if (rela->sym->type == STT_SECTION) {
 			insn->call_dest = find_symbol_by_offset(rela->sym->sec,
 								rela->addend+4);
@@ -678,12 +719,6 @@ static int add_special_section_alts(struct objtool_file *file)
 		return ret;
 
 	list_for_each_entry_safe(special_alt, tmp, &special_alts, list) {
-		alt = malloc(sizeof(*alt));
-		if (!alt) {
-			WARN("malloc failed");
-			ret = -1;
-			goto out;
-		}
 
 		orig_insn = find_insn(file, special_alt->orig_sec,
 				      special_alt->orig_off);
@@ -694,6 +729,10 @@ static int add_special_section_alts(struct objtool_file *file)
 			goto out;
 		}
 
+		/* Ignore retpoline alternatives. */
+		if (orig_insn->ignore_alts)
+			continue;
+
 		new_insn = NULL;
 		if (!special_alt->group || special_alt->new_len) {
 			new_insn = find_insn(file, special_alt->new_sec,
@@ -719,6 +758,13 @@ static int add_special_section_alts(struct objtool_file *file)
 				goto out;
 		}
 
+		alt = malloc(sizeof(*alt));
+		if (!alt) {
+			WARN("malloc failed");
+			ret = -1;
+			goto out;
+		}
+
 		alt->insn = new_insn;
 		list_add_tail(&alt->list, &orig_insn->alts);
 
@@ -1035,6 +1081,10 @@ static int decode_sections(struct objtool_file *file)
 
 	add_ignores(file);
 
+	ret = add_nospec_ignores(file);
+	if (ret)
+		return ret;
+
 	ret = add_jump_destinations(file);
 	if (ret)
 		return ret;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index 47d9ea70a83d..dbadb304a410 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -44,7 +44,7 @@ struct instruction {
 	unsigned int len;
 	unsigned char type;
 	unsigned long immediate;
-	bool alt_group, visited, dead_end, ignore, hint, save, restore;
+	bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct list_head alts;

From 76b043848fd22dbf7f8bf3a1452f8c70d557b860 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:25 +0000
Subject: [PATCH 786/808] x86/retpoline: Add initial retpoline support

Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide
the corresponding thunks. Provide assembler macros for invoking the thunks
in the same way that GCC does, from native and inline assembler.

This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In
some circumstances, IBRS microcode features may be used instead, and the
retpoline can be disabled.

On AMD CPUs if lfence is serialising, the retpoline can be dramatically
simplified to a simple "lfence; jmp *\reg". A future patch, after it has
been verified that lfence really is serialising in all circumstances, can
enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition
to X86_FEATURE_RETPOLINE.

Do not align the retpoline in the altinstr section, because there is no
guarantee that it stays aligned when it's copied over the oldinstr during
alternative patching.

[ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks]
[ tglx: Put actual function CALL/JMP in front of the macros, convert to
  	symbolic labels ]
[ dwmw2: Convert back to numeric labels, merge objtool fixes ]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/Kconfig                      |  13 +++
 arch/x86/Makefile                     |  10 ++
 arch/x86/include/asm/asm-prototypes.h |  25 +++++
 arch/x86/include/asm/cpufeatures.h    |   2 +
 arch/x86/include/asm/nospec-branch.h  | 128 ++++++++++++++++++++++++++
 arch/x86/kernel/cpu/common.c          |   4 +
 arch/x86/lib/Makefile                 |   1 +
 arch/x86/lib/retpoline.S              |  48 ++++++++++
 8 files changed, 231 insertions(+)
 create mode 100644 arch/x86/include/asm/nospec-branch.h
 create mode 100644 arch/x86/lib/retpoline.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e23d21ac745a..d1819161cc6c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,6 +429,19 @@ config GOLDFISH
        def_bool y
        depends on X86_GOLDFISH
 
+config RETPOLINE
+	bool "Avoid speculative indirect branches in kernel"
+	default y
+	help
+	  Compile kernel with the retpoline compiler options to guard against
+	  kernel-to-user data leaks by avoiding speculative indirect
+	  branches. Requires a compiler with -mindirect-branch=thunk-extern
+	  support for full protection. The kernel may run slower.
+
+	  Without compiler support, at least indirect branches in assembler
+	  code are eliminated. Since this includes the syscall entry path,
+	  it is not entirely pointless.
+
 config INTEL_RDT
 	bool "Intel Resource Director Technology support"
 	default n
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..974c61864978 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -235,6 +235,16 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
+# Avoid indirect branches in kernel to deal with Spectre
+ifdef CONFIG_RETPOLINE
+    RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+    ifneq ($(RETPOLINE_CFLAGS),)
+        KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+    else
+        $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.)
+    endif
+endif
+
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index ff700d81e91e..0927cdc4f946 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -11,7 +11,32 @@
 #include <asm/pgtable.h>
 #include <asm/special_insns.h>
 #include <asm/preempt.h>
+#include <asm/asm.h>
 
 #ifndef CONFIG_X86_CMPXCHG64
 extern void cmpxchg8b_emu(void);
 #endif
+
+#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_32
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void);
+#else
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void);
+INDIRECT_THUNK(8)
+INDIRECT_THUNK(9)
+INDIRECT_THUNK(10)
+INDIRECT_THUNK(11)
+INDIRECT_THUNK(12)
+INDIRECT_THUNK(13)
+INDIRECT_THUNK(14)
+INDIRECT_THUNK(15)
+#endif
+INDIRECT_THUNK(ax)
+INDIRECT_THUNK(bx)
+INDIRECT_THUNK(cx)
+INDIRECT_THUNK(dx)
+INDIRECT_THUNK(si)
+INDIRECT_THUNK(di)
+INDIRECT_THUNK(bp)
+INDIRECT_THUNK(sp)
+#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1641c2f96363..f275447862f4 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -203,6 +203,8 @@
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
 #define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
+#define X86_FEATURE_RETPOLINE		( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_AMD	( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
new file mode 100644
index 000000000000..e20e92ef2ca8
--- /dev/null
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NOSPEC_BRANCH_H__
+#define __NOSPEC_BRANCH_H__
+
+#include <asm/alternative.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeatures.h>
+
+#ifdef __ASSEMBLY__
+
+/*
+ * This should be used immediately before a retpoline alternative.  It tells
+ * objtool where the retpolines are so that it can make sense of the control
+ * flow by just reading the original instruction(s) and ignoring the
+ * alternatives.
+ */
+.macro ANNOTATE_NOSPEC_ALTERNATIVE
+	.Lannotate_\@:
+	.pushsection .discard.nospec
+	.long .Lannotate_\@ - .
+	.popsection
+.endm
+
+/*
+ * These are the bare retpoline primitives for indirect jmp and call.
+ * Do not use these directly; they only exist to make the ALTERNATIVE
+ * invocation below less ugly.
+ */
+.macro RETPOLINE_JMP reg:req
+	call	.Ldo_rop_\@
+.Lspec_trap_\@:
+	pause
+	jmp	.Lspec_trap_\@
+.Ldo_rop_\@:
+	mov	\reg, (%_ASM_SP)
+	ret
+.endm
+
+/*
+ * This is a wrapper around RETPOLINE_JMP so the called function in reg
+ * returns to the instruction after the macro.
+ */
+.macro RETPOLINE_CALL reg:req
+	jmp	.Ldo_call_\@
+.Ldo_retpoline_jmp_\@:
+	RETPOLINE_JMP \reg
+.Ldo_call_\@:
+	call	.Ldo_retpoline_jmp_\@
+.endm
+
+/*
+ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
+ * indirect jmp/call which may be susceptible to the Spectre variant 2
+ * attack.
+ */
+.macro JMP_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE_2 __stringify(jmp *\reg),				\
+		__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE,	\
+		__stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
+#else
+	jmp	*\reg
+#endif
+.endm
+
+.macro CALL_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE_2 __stringify(call *\reg),				\
+		__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
+		__stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
+#else
+	call	*\reg
+#endif
+.endm
+
+#else /* __ASSEMBLY__ */
+
+#define ANNOTATE_NOSPEC_ALTERNATIVE				\
+	"999:\n\t"						\
+	".pushsection .discard.nospec\n\t"			\
+	".long 999b - .\n\t"					\
+	".popsection\n\t"
+
+#if defined(CONFIG_X86_64) && defined(RETPOLINE)
+
+/*
+ * Since the inline asm uses the %V modifier which is only in newer GCC,
+ * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
+ */
+# define CALL_NOSPEC						\
+	ANNOTATE_NOSPEC_ALTERNATIVE				\
+	ALTERNATIVE(						\
+	"call *%[thunk_target]\n",				\
+	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
+	X86_FEATURE_RETPOLINE)
+# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
+
+#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
+/*
+ * For i386 we use the original ret-equivalent retpoline, because
+ * otherwise we'll run out of registers. We don't care about CET
+ * here, anyway.
+ */
+# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n",	\
+	"       jmp    904f;\n"					\
+	"       .align 16\n"					\
+	"901:	call   903f;\n"					\
+	"902:	pause;\n"					\
+	"       jmp    902b;\n"					\
+	"       .align 16\n"					\
+	"903:	addl   $4, %%esp;\n"				\
+	"       pushl  %[thunk_target];\n"			\
+	"       ret;\n"						\
+	"       .align 16\n"					\
+	"904:	call   901b;\n",				\
+	X86_FEATURE_RETPOLINE)
+
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#else /* No retpoline */
+# define CALL_NOSPEC "call *%[thunk_target]\n"
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __NOSPEC_BRANCH_H__ */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 372ba3fb400f..7a671d1ae3cb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
 
+#ifdef CONFIG_RETPOLINE
+	setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+#endif
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 457f681ef379..d435c89875c1 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_RETPOLINE) += retpoline.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
new file mode 100644
index 000000000000..cb45c6cb465f
--- /dev/null
+++ b/arch/x86/lib/retpoline.S
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/stringify.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+
+.macro THUNK reg
+	.section .text.__x86.indirect_thunk.\reg
+
+ENTRY(__x86_indirect_thunk_\reg)
+	CFI_STARTPROC
+	JMP_NOSPEC %\reg
+	CFI_ENDPROC
+ENDPROC(__x86_indirect_thunk_\reg)
+.endm
+
+/*
+ * Despite being an assembler file we can't just use .irp here
+ * because __KSYM_DEPS__ only uses the C preprocessor and would
+ * only see one instance of "__x86_indirect_thunk_\reg" rather
+ * than one per register with the correct names. So we do it
+ * the simple and nasty way...
+ */
+#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg)
+#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
+
+GENERATE_THUNK(_ASM_AX)
+GENERATE_THUNK(_ASM_BX)
+GENERATE_THUNK(_ASM_CX)
+GENERATE_THUNK(_ASM_DX)
+GENERATE_THUNK(_ASM_SI)
+GENERATE_THUNK(_ASM_DI)
+GENERATE_THUNK(_ASM_BP)
+GENERATE_THUNK(_ASM_SP)
+#ifdef CONFIG_64BIT
+GENERATE_THUNK(r8)
+GENERATE_THUNK(r9)
+GENERATE_THUNK(r10)
+GENERATE_THUNK(r11)
+GENERATE_THUNK(r12)
+GENERATE_THUNK(r13)
+GENERATE_THUNK(r14)
+GENERATE_THUNK(r15)
+#endif

From da285121560e769cc31797bba6422eea71d473e0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:26 +0000
Subject: [PATCH 787/808] x86/spectre: Add boot time option to select Spectre
 v2 mitigation

Add a spectre_v2= option to select the mitigation used for the indirect
branch speculation vulnerability.

Currently, the only option available is retpoline, in its various forms.
This will be expanded to cover the new IBRS/IBPB microcode features.

The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation
control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a
serializing instruction, which is indicated by the LFENCE_RDTSC feature.

[ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS
  	integration becomes simple ]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk
---
 .../admin-guide/kernel-parameters.txt         |  28 ++++
 arch/x86/include/asm/nospec-branch.h          |  10 ++
 arch/x86/kernel/cpu/bugs.c                    | 158 +++++++++++++++++-
 arch/x86/kernel/cpu/common.c                  |   4 -
 4 files changed, 195 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 905991745d26..8122b5f98ea1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2599,6 +2599,11 @@
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
 
+	nospectre_v2	[X86] Disable all mitigations for the Spectre variant 2
+			(indirect branch prediction) vulnerability. System may
+			allow data leaks with this option, which is equivalent
+			to spectre_v2=off.
+
 	noxsave		[BUGS=X86] Disables x86 extended register state save
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
@@ -3908,6 +3913,29 @@
 	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
 			See Documentation/laptops/sonypi.txt
 
+	spectre_v2=	[X86] Control mitigation of Spectre variant 2
+			(indirect branch speculation) vulnerability.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable
+
+			Selecting 'on' will, and 'auto' may, choose a
+			mitigation method at run time according to the
+			CPU, the available microcode, the setting of the
+			CONFIG_RETPOLINE configuration option, and the
+			compiler with which the kernel was built.
+
+			Specific mitigations can also be selected manually:
+
+			retpoline	  - replace indirect branches
+			retpoline,generic - google's original retpoline
+			retpoline,amd     - AMD-specific minimal thunk
+
+			Not specifying this option is equivalent to
+			spectre_v2=auto.
+
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e20e92ef2ca8..ea034fa6e261 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -124,5 +124,15 @@
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #endif
 
+/* The Spectre V2 mitigation variants */
+enum spectre_v2_mitigation {
+	SPECTRE_V2_NONE,
+	SPECTRE_V2_RETPOLINE_MINIMAL,
+	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
+	SPECTRE_V2_RETPOLINE_GENERIC,
+	SPECTRE_V2_RETPOLINE_AMD,
+	SPECTRE_V2_IBRS,
+};
+
 #endif /* __ASSEMBLY__ */
 #endif /* __NOSPEC_BRANCH_H__ */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 76ad6cb44b40..e4dc26185aa7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -11,6 +11,9 @@
 #include <linux/init.h>
 #include <linux/utsname.h>
 #include <linux/cpu.h>
+
+#include <asm/nospec-branch.h>
+#include <asm/cmdline.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/processor-flags.h>
@@ -21,6 +24,8 @@
 #include <asm/pgtable.h>
 #include <asm/set_memory.h>
 
+static void __init spectre_v2_select_mitigation(void);
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -30,6 +35,9 @@ void __init check_bugs(void)
 		print_cpu_info(&boot_cpu_data);
 	}
 
+	/* Select the proper spectre mitigation before patching alternatives */
+	spectre_v2_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -62,6 +70,153 @@ void __init check_bugs(void)
 #endif
 }
 
+/* The kernel command line selection */
+enum spectre_v2_mitigation_cmd {
+	SPECTRE_V2_CMD_NONE,
+	SPECTRE_V2_CMD_AUTO,
+	SPECTRE_V2_CMD_FORCE,
+	SPECTRE_V2_CMD_RETPOLINE,
+	SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+	SPECTRE_V2_CMD_RETPOLINE_AMD,
+};
+
+static const char *spectre_v2_strings[] = {
+	[SPECTRE_V2_NONE]			= "Vulnerable",
+	[SPECTRE_V2_RETPOLINE_MINIMAL]		= "Vulnerable: Minimal generic ASM retpoline",
+	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
+	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
+	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
+};
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Spectre V2 mitigation: " fmt
+
+static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+
+static void __init spec2_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		pr_info("%s\n", reason);
+}
+
+static void __init spec2_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		pr_info("%s\n", reason);
+}
+
+static inline bool retp_compiler(void)
+{
+	return __is_defined(RETPOLINE);
+}
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+	int len = strlen(opt);
+
+	return len == arglen && !strncmp(arg, opt, len);
+}
+
+static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+{
+	char arg[20];
+	int ret;
+
+	ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
+				  sizeof(arg));
+	if (ret > 0)  {
+		if (match_option(arg, ret, "off")) {
+			goto disable;
+		} else if (match_option(arg, ret, "on")) {
+			spec2_print_if_secure("force enabled on command line.");
+			return SPECTRE_V2_CMD_FORCE;
+		} else if (match_option(arg, ret, "retpoline")) {
+			spec2_print_if_insecure("retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE;
+		} else if (match_option(arg, ret, "retpoline,amd")) {
+			if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+				pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+				return SPECTRE_V2_CMD_AUTO;
+			}
+			spec2_print_if_insecure("AMD retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE_AMD;
+		} else if (match_option(arg, ret, "retpoline,generic")) {
+			spec2_print_if_insecure("generic retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+		} else if (match_option(arg, ret, "auto")) {
+			return SPECTRE_V2_CMD_AUTO;
+		}
+	}
+
+	if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+		return SPECTRE_V2_CMD_AUTO;
+disable:
+	spec2_print_if_insecure("disabled on command line.");
+	return SPECTRE_V2_CMD_NONE;
+}
+
+static void __init spectre_v2_select_mitigation(void)
+{
+	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
+	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+
+	/*
+	 * If the CPU is not affected and the command line mode is NONE or AUTO
+	 * then nothing to do.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+	    (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+		return;
+
+	switch (cmd) {
+	case SPECTRE_V2_CMD_NONE:
+		return;
+
+	case SPECTRE_V2_CMD_FORCE:
+		/* FALLTRHU */
+	case SPECTRE_V2_CMD_AUTO:
+		goto retpoline_auto;
+
+	case SPECTRE_V2_CMD_RETPOLINE_AMD:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_amd;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_generic;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_auto;
+		break;
+	}
+	pr_err("kernel not compiled with retpoline; no mitigation available!");
+	return;
+
+retpoline_auto:
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+	retpoline_amd:
+		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+			pr_err("LFENCE not serializing. Switching to generic retpoline\n");
+			goto retpoline_generic;
+		}
+		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
+					 SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	} else {
+	retpoline_generic:
+		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
+					 SPECTRE_V2_RETPOLINE_MINIMAL;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	}
+
+	spectre_v2_enabled = mode;
+	pr_info("%s\n", spectre_v2_strings[mode]);
+}
+
+#undef pr_fmt
+
 #ifdef CONFIG_SYSFS
 ssize_t cpu_show_meltdown(struct device *dev,
 			  struct device_attribute *attr, char *buf)
@@ -86,6 +241,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
 {
 	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
 		return sprintf(buf, "Not affected\n");
-	return sprintf(buf, "Vulnerable\n");
+
+	return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
 }
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7a671d1ae3cb..372ba3fb400f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -905,10 +905,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
 
-#ifdef CONFIG_RETPOLINE
-	setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
-#endif
-
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

From 9697fa39efd3fc3692f2949d4045f393ec58450b Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:27 +0000
Subject: [PATCH 788/808] x86/retpoline/crypto: Convert crypto assembler
 indirect jumps

Convert all indirect jumps in crypto assembler code to use non-speculative
sequences when CONFIG_RETPOLINE is enabled.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/crypto/aesni-intel_asm.S            | 5 +++--
 arch/x86/crypto/camellia-aesni-avx-asm_64.S  | 3 ++-
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S    | 3 ++-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fec80b2..3d09e3aca18d 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,7 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 /*
  * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	call *%r11
+	CALL_NOSPEC %r11
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
 	_aesni_gf128mul_x_ble()
 	movups IV, (IVP)
 
-	call *%r11
+	CALL_NOSPEC %r11
 
 	movdqu 0x40(OUTP), INC
 	pxor INC, STATE1
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index f7c495e2863c..a14af6eb09cb 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,6 +17,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
-	call *%r9;
+	CALL_NOSPEC %r9;
 
 	addq $(16 * 16), %rsp;
 
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index eee5b3982cfd..b66bbfa62f50 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-	call *%r9;
+	CALL_NOSPEC %r9;
 
 	addq $(16 * 32), %rsp;
 
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 7a7de27c6f41..d9b734d0c8cc 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -45,6 +45,7 @@
 
 #include <asm/inst.h>
 #include <linux/linkage.h>
+#include <asm/nospec-branch.h>
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
@@ -172,7 +173,7 @@ continue_block:
 	movzxw  (bufp, %rax, 2), len
 	lea	crc_array(%rip), bufp
 	lea     (bufp, len, 1), bufp
-	jmp     *bufp
+	JMP_NOSPEC bufp
 
 	################################################################
 	## 2a) PROCESS FULL BLOCKS:

From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:28 +0000
Subject: [PATCH 789/808] x86/retpoline/entry: Convert entry assembler indirect
 jumps

Convert indirect jumps in core 32/64bit entry assembler code to use
non-speculative sequences when CONFIG_RETPOLINE is enabled.

Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return
address after the 'call' instruction must be *precisely* at the
.Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work,
and the use of alternatives will mess that up unless we play horrid
games to prepend with NOPs and make the variants the same length. It's
not worth it; in the case where we ALTERNATIVE out the retpoline, the
first instruction at __x86.indirect_thunk.rax is going to be a bare
jmp *%rax anyway.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/entry/entry_32.S |  5 +++--
 arch/x86/entry/entry_64.S | 12 +++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ace8f321a5a1..a1f28a54f23a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,6 +44,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 	.section .entry.text, "ax"
 
@@ -290,7 +291,7 @@ ENTRY(ret_from_fork)
 
 	/* kernel thread */
 1:	movl	%edi, %eax
-	call	*%ebx
+	CALL_NOSPEC %ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -919,7 +920,7 @@ common_exception:
 	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
-	call	*%edi
+	CALL_NOSPEC %edi
 	jmp	ret_from_exception
 END(common_exception)
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed31d00dc5ee..59874bc1aed2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -37,6 +37,7 @@
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 #include <linux/err.h>
 
 #include "calling.h"
@@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 */
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
-	jmp	*%rdi
+	JMP_NOSPEC %rdi
 END(entry_SYSCALL_64_trampoline)
 
 	.popsection
@@ -266,7 +267,12 @@ entry_SYSCALL_64_fastpath:
 	 * It might end up jumping to the slow path.  If it jumps, RAX
 	 * and all argument registers are clobbered.
 	 */
+#ifdef CONFIG_RETPOLINE
+	movq	sys_call_table(, %rax, 8), %rax
+	call	__x86_indirect_thunk_rax
+#else
 	call	*sys_call_table(, %rax, 8)
+#endif
 .Lentry_SYSCALL_64_after_fastpath_call:
 
 	movq	%rax, RAX(%rsp)
@@ -438,7 +444,7 @@ ENTRY(stub_ptregs_64)
 	jmp	entry_SYSCALL64_slow_path
 
 1:
-	jmp	*%rax				/* Called from C */
+	JMP_NOSPEC %rax				/* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -517,7 +523,7 @@ ENTRY(ret_from_fork)
 1:
 	/* kernel thread */
 	movq	%r12, %rdi
-	call	*%rbx
+	CALL_NOSPEC %rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()

From 9351803bd803cdbeb9b5a7850b7b6f464806e3db Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:29 +0000
Subject: [PATCH 790/808] x86/retpoline/ftrace: Convert ftrace assembler
 indirect jumps

Convert all indirect jumps in ftrace assembler code to use non-speculative
sequences when CONFIG_RETPOLINE is enabled.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/kernel/ftrace_32.S | 6 ++++--
 arch/x86/kernel/ftrace_64.S | 8 ++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index b6c6468e10bc..4c8440de3355 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -8,6 +8,7 @@
 #include <asm/segment.h>
 #include <asm/export.h>
 #include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
 
 #ifdef CC_USING_FENTRY
 # define function_hook	__fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
 	movl	0x4(%ebp), %edx
 	subl	$MCOUNT_INSN_SIZE, %eax
 
-	call	*ftrace_trace_function
+	movl	ftrace_trace_function, %ecx
+	CALL_NOSPEC %ecx
 
 	popl	%edx
 	popl	%ecx
@@ -241,5 +243,5 @@ return_to_handler:
 	movl	%eax, %ecx
 	popl	%edx
 	popl	%eax
-	jmp	*%ecx
+	JMP_NOSPEC %ecx
 #endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index c832291d948a..7cb8ba08beb9 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -7,7 +7,7 @@
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
-
+#include <asm/nospec-branch.h>
 
 	.code64
 	.section .entry.text, "ax"
@@ -286,8 +286,8 @@ trace:
 	 * ip and parent ip are used and the list function is called when
 	 * function tracing is enabled.
 	 */
-	call   *ftrace_trace_function
-
+	movq ftrace_trace_function, %r8
+	CALL_NOSPEC %r8
 	restore_mcount_regs
 
 	jmp fgraph_trace
@@ -329,5 +329,5 @@ GLOBAL(return_to_handler)
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $24, %rsp
-	jmp *%rdi
+	JMP_NOSPEC %rdi
 #endif

From e70e5892b28c18f517f29ab6e83bd57705104b31 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:30 +0000
Subject: [PATCH 791/808] x86/retpoline/hyperv: Convert assembler indirect
 jumps

Convert all indirect jumps in hyperv inline asm code to use non-speculative
sequences when CONFIG_RETPOLINE is enabled.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/include/asm/mshyperv.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 581bb54dd464..5119e4b555cc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
 #include <linux/nmi.h>
 #include <asm/io.h>
 #include <asm/hyperv.h>
+#include <asm/nospec-branch.h>
 
 /*
  * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 		return U64_MAX;
 
 	__asm__ __volatile__("mov %4, %%r8\n"
-			     "call *%5"
+			     CALL_NOSPEC
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
-			     :  "r" (output_address), "m" (hv_hypercall_pg)
+			     :  "r" (output_address),
+				THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory", "r8", "r9", "r10", "r11");
 #else
 	u32 input_address_hi = upper_32_bits(input_address);
@@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	if (!hv_hypercall_pg)
 		return U64_MAX;
 
-	__asm__ __volatile__("call *%7"
+	__asm__ __volatile__(CALL_NOSPEC
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
 			       "b" (input_address_hi),
 			       "D"(output_address_hi), "S"(output_address_lo),
-			       "m" (hv_hypercall_pg)
+			       THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory");
 #endif /* !x86_64 */
 	return hv_status;
@@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 
 #ifdef CONFIG_X86_64
 	{
-		__asm__ __volatile__("call *%4"
+		__asm__ __volatile__(CALL_NOSPEC
 				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
-				     : "m" (hv_hypercall_pg)
+				     : THUNK_TARGET(hv_hypercall_pg)
 				     : "cc", "r8", "r9", "r10", "r11");
 	}
 #else
@@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		u32 input1_hi = upper_32_bits(input1);
 		u32 input1_lo = lower_32_bits(input1);
 
-		__asm__ __volatile__ ("call *%5"
+		__asm__ __volatile__ (CALL_NOSPEC
 				      : "=A"(hv_status),
 					"+c"(input1_lo),
 					ASM_CALL_CONSTRAINT
 				      :	"A" (control),
 					"b" (input1_hi),
-					"m" (hv_hypercall_pg)
+					THUNK_TARGET(hv_hypercall_pg)
 				      : "cc", "edi", "esi");
 	}
 #endif

From ea08816d5b185ab3d09e95e393f265af54560350 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:31 +0000
Subject: [PATCH 792/808] x86/retpoline/xen: Convert Xen hypercall indirect
 jumps

Convert indirect call in Xen hypercall to use non-speculative sequence,
when CONFIG_RETPOLINE is enabled.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/include/asm/xen/hypercall.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7cb282e9e587..bfd882617613 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -44,6 +44,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/smap.h>
+#include <asm/nospec-branch.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
@@ -217,9 +218,9 @@ privcmd_call(unsigned call,
 	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
 
 	stac();
-	asm volatile("call *%[call]"
+	asm volatile(CALL_NOSPEC
 		     : __HYPERCALL_5PARAM
-		     : [call] "a" (&hypercall_page[call])
+		     : [thunk_target] "a" (&hypercall_page[call])
 		     : __HYPERCALL_CLOBBER5);
 	clac();
 

From 5096732f6f695001fa2d6f1335a2680b37912c69 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Thu, 11 Jan 2018 21:46:32 +0000
Subject: [PATCH 793/808] x86/retpoline/checksum32: Convert assembler indirect
 jumps

Convert all indirect jumps in 32bit checksum assembler code to use
non-speculative sequences when CONFIG_RETPOLINE is enabled.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/lib/checksum_32.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4d34bb548b41..46e71a74e612 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -29,7 +29,8 @@
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/export.h>
-				
+#include <asm/nospec-branch.h>
+
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
  */
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
 	negl %ebx
 	lea 45f(%ebx,%ebx,2), %ebx
 	testl %esi, %esi
-	jmp *%ebx
+	JMP_NOSPEC %ebx
 
 	# Handle 2-byte-aligned regions
 20:	addw (%esi), %ax
@@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic)
 	andl $-32,%edx
 	lea 3f(%ebx,%ebx), %ebx
 	testl %esi, %esi 
-	jmp *%ebx
+	JMP_NOSPEC %ebx
 1:	addl $64,%esi
 	addl $64,%edi 
 	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)

From 7614e913db1f40fff819b36216484dc3808995d4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 11 Jan 2018 21:46:33 +0000
Subject: [PATCH 794/808] x86/retpoline/irq32: Convert assembler indirect jumps

Convert all indirect jumps in 32bit irq inline asm code to use non
speculative sequences.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/kernel/irq_32.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a83b3346a0e1..c1bdbd3d3232 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 
 #include <asm/apic.h>
+#include <asm/nospec-branch.h>
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
@@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
 static void call_on_stack(void *func, void *stack)
 {
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     CALL_NOSPEC
 		     "movl	%%ebx,%%esp	\n"
 		     : "=b" (stack)
 		     : "0" (stack),
-		       "D"(func)
+		       [thunk_target] "D"(func)
 		     : "memory", "cc", "edx", "ecx", "eax");
 }
 
@@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
 		call_on_stack(print_stack_overflow, isp);
 
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     CALL_NOSPEC
 		     "movl	%%ebx,%%esp	\n"
 		     : "=a" (arg1), "=b" (isp)
 		     :  "0" (desc),   "1" (isp),
-			"D" (desc->handle_irq)
+			[thunk_target] "D" (desc->handle_irq)
 		     : "memory", "cc", "ecx");
 	return 1;
 }

From 117cc7a908c83697b0b737d15ae1eb5943afe35b Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 12 Jan 2018 11:11:27 +0000
Subject: [PATCH 795/808] x86/retpoline: Fill return stack buffer on vmexit

In accordance with the Intel and AMD documentation, we need to overwrite
all entries in the RSB on exiting a guest, to prevent malicious branch
target predictions from affecting the host kernel. This is needed both
for retpoline and for IBRS.

[ak: numbers again for the RSB stuffing labels]

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk
---
 arch/x86/include/asm/nospec-branch.h | 78 +++++++++++++++++++++++++++-
 arch/x86/kvm/svm.c                   |  4 ++
 arch/x86/kvm/vmx.c                   |  4 ++
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index ea034fa6e261..402a11c803c3 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -7,6 +7,48 @@
 #include <asm/alternative-asm.h>
 #include <asm/cpufeatures.h>
 
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */
+#define RSB_FILL_LOOPS		16	/* To avoid underflow */
+
+/*
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version — two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr, sp)	\
+	mov	$(nr/2), reg;			\
+771:						\
+	call	772f;				\
+773:	/* speculation trap */			\
+	pause;					\
+	jmp	773b;				\
+772:						\
+	call	774f;				\
+775:	/* speculation trap */			\
+	pause;					\
+	jmp	775b;				\
+774:						\
+	dec	reg;				\
+	jnz	771b;				\
+	add	$(BITS_PER_LONG/8) * nr, sp;
+
 #ifdef __ASSEMBLY__
 
 /*
@@ -74,6 +116,20 @@
 #else
 	call	*\reg
 #endif
+.endm
+
+ /*
+  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+  * monstrosity above, manually.
+  */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE "jmp .Lskip_rsb_\@",				\
+		__stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))	\
+		\ftr
+.Lskip_rsb_\@:
+#endif
 .endm
 
 #else /* __ASSEMBLY__ */
@@ -119,7 +175,7 @@
 	X86_FEATURE_RETPOLINE)
 
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
-#else /* No retpoline */
+#else /* No retpoline for C / inline asm */
 # define CALL_NOSPEC "call *%[thunk_target]\n"
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #endif
@@ -134,5 +190,25 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
+/*
+ * On VMEXIT we must ensure that no RSB predictions learned in the guest
+ * can be followed in the host, by overwriting the RSB completely. Both
+ * retpoline and IBRS mitigations for Spectre v2 need this; only on future
+ * CPUs with IBRS_ATT *might* it be avoided.
+ */
+static inline void vmexit_fill_RSB(void)
+{
+#ifdef CONFIG_RETPOLINE
+	unsigned long loops = RSB_CLEAR_LOOPS / 2;
+
+	asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+		      ALTERNATIVE("jmp 910f",
+				  __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
+				  X86_FEATURE_RETPOLINE)
+		      "910:"
+		      : "=&r" (loops), ASM_CALL_CONSTRAINT
+		      : "r" (loops) : "memory" );
+#endif
+}
 #endif /* __ASSEMBLY__ */
 #endif /* __NOSPEC_BRANCH_H__ */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..2744b97345b8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
+#include <asm/nospec-branch.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -4985,6 +4986,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 62ee4362e1c1..d1e25dba3112 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -50,6 +50,7 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -9403,6 +9404,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 	if (debugctlmsr)
 		update_debugctlmsr(debugctlmsr);

From 0dda0b3fb255048a221f736c8a2a24c674da8bf3 Mon Sep 17 00:00:00 2001
From: John Johansen <john.johansen@canonical.com>
Date: Fri, 8 Dec 2017 17:43:18 -0800
Subject: [PATCH 796/808] apparmor: fix ptrace label match when matching
 stacked labels

Given a label with a profile stack of
  A//&B or A//&C ...

A ptrace rule should be able to specify a generic trace pattern with
a rule like

  ptrace trace A//&**,

however this is failing because while the correct label match routine
is called, it is being done post label decomposition so it is always
being done against a profile instead of the stacked label.

To fix this refactor the cross check to pass the full peer label in to
the label_match.

Fixes: 290f458a4f16 ("apparmor: allow ptrace checks to be finer grained than just capability")
Cc: Stable <stable@vger.kernel.org>
Reported-by: Matthew Garrett <mjg59@google.com>
Tested-by: Matthew Garrett <mjg59@google.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/include/perms.h |  3 ++
 security/apparmor/ipc.c           | 53 +++++++++++++++++++------------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h
index 2b27bb79aec4..d7b7e7115160 100644
--- a/security/apparmor/include/perms.h
+++ b/security/apparmor/include/perms.h
@@ -133,6 +133,9 @@ extern struct aa_perms allperms;
 #define xcheck_labels_profiles(L1, L2, FN, args...)		\
 	xcheck_ns_labels((L1), (L2), xcheck_ns_profile_label, (FN), args)
 
+#define xcheck_labels(L1, L2, P, FN1, FN2)			\
+	xcheck(fn_for_each((L1), (P), (FN1)), fn_for_each((L2), (P), (FN2)))
+
 
 void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask);
 void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask);
diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c
index 7ca0032e7ba9..b40678f3c1d5 100644
--- a/security/apparmor/ipc.c
+++ b/security/apparmor/ipc.c
@@ -64,40 +64,48 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va)
 			FLAGS_NONE, GFP_ATOMIC);
 }
 
+/* assumes check for PROFILE_MEDIATES is already done */
 /* TODO: conditionals */
 static int profile_ptrace_perm(struct aa_profile *profile,
-			       struct aa_profile *peer, u32 request,
-			       struct common_audit_data *sa)
+			     struct aa_label *peer, u32 request,
+			     struct common_audit_data *sa)
 {
 	struct aa_perms perms = { };
 
-	/* need because of peer in cross check */
-	if (profile_unconfined(profile) ||
-	    !PROFILE_MEDIATES(profile, AA_CLASS_PTRACE))
-		return 0;
-
-	aad(sa)->peer = &peer->label;
-	aa_profile_match_label(profile, &peer->label, AA_CLASS_PTRACE, request,
+	aad(sa)->peer = peer;
+	aa_profile_match_label(profile, peer, AA_CLASS_PTRACE, request,
 			       &perms);
 	aa_apply_modes_to_perms(profile, &perms);
 	return aa_check_perms(profile, &perms, request, sa, audit_ptrace_cb);
 }
 
-static int cross_ptrace_perm(struct aa_profile *tracer,
-			     struct aa_profile *tracee, u32 request,
-			     struct common_audit_data *sa)
+static int profile_tracee_perm(struct aa_profile *tracee,
+			       struct aa_label *tracer, u32 request,
+			       struct common_audit_data *sa)
 {
+	if (profile_unconfined(tracee) || unconfined(tracer) ||
+	    !PROFILE_MEDIATES(tracee, AA_CLASS_PTRACE))
+		return 0;
+
+	return profile_ptrace_perm(tracee, tracer, request, sa);
+}
+
+static int profile_tracer_perm(struct aa_profile *tracer,
+			       struct aa_label *tracee, u32 request,
+			       struct common_audit_data *sa)
+{
+	if (profile_unconfined(tracer))
+		return 0;
+
 	if (PROFILE_MEDIATES(tracer, AA_CLASS_PTRACE))
-		return xcheck(profile_ptrace_perm(tracer, tracee, request, sa),
-			      profile_ptrace_perm(tracee, tracer,
-						  request << PTRACE_PERM_SHIFT,
-						  sa));
-	/* policy uses the old style capability check for ptrace */
-	if (profile_unconfined(tracer) || tracer == tracee)
+		return profile_ptrace_perm(tracer, tracee, request, sa);
+
+	/* profile uses the old style capability check for ptrace */
+	if (&tracer->label == tracee)
 		return 0;
 
 	aad(sa)->label = &tracer->label;
-	aad(sa)->peer = &tracee->label;
+	aad(sa)->peer = tracee;
 	aad(sa)->request = 0;
 	aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1);
 
@@ -115,10 +123,13 @@ static int cross_ptrace_perm(struct aa_profile *tracer,
 int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee,
 		  u32 request)
 {
+	struct aa_profile *profile;
+	u32 xrequest = request << PTRACE_PERM_SHIFT;
 	DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_PTRACE);
 
-	return xcheck_labels_profiles(tracer, tracee, cross_ptrace_perm,
-				      request, &sa);
+	return xcheck_labels(tracer, tracee, profile,
+			profile_tracer_perm(profile, tracee, request, &sa),
+			profile_tracee_perm(profile, tracer, xrequest, &sa));
 }
 
 
From 1a3881d305592d947ed47887306919d50112394d Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@google.com>
Date: Thu, 11 Jan 2018 13:07:54 -0800
Subject: [PATCH 797/808] apparmor: Fix regression in profile conflict logic

The intended behaviour in apparmor profile matching is to flag a
conflict if two profiles match equally well. However, right now a
conflict is generated if another profile has the same match length even
if that profile doesn't actually match. Fix the logic so we only
generate a conflict if the profiles match.

Fixes: 844b8292b631 ("apparmor: ensure that undecidable profile attachments fail")
Cc: Stable <stable@vger.kernel.org>
Signed-off-by: Matthew Garrett <mjg59@google.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/domain.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 04ba9d0718ea..6a54d2ffa840 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -330,10 +330,7 @@ static struct aa_profile *__attach_match(const char *name,
 			continue;
 
 		if (profile->xmatch) {
-			if (profile->xmatch_len == len) {
-				conflict = true;
-				continue;
-			} else if (profile->xmatch_len > len) {
+			if (profile->xmatch_len >= len) {
 				unsigned int state;
 				u32 perm;
 
@@ -342,6 +339,10 @@ static struct aa_profile *__attach_match(const char *name,
 				perm = dfa_user_allow(profile->xmatch, state);
 				/* any accepting state means a valid match. */
 				if (perm & MAY_EXEC) {
+					if (profile->xmatch_len == len) {
+						conflict = true;
+						continue;
+					}
 					candidate = profile;
 					len = profile->xmatch_len;
 					conflict = false;

From 352909b49ba0d74929b96af6dfbefc854ab6ebb5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 11 Jan 2018 17:16:51 -0800
Subject: [PATCH 798/808] selftests/x86: Add test_vsyscall

This tests that the vsyscall entries do what they're expected to do.
It also confirms that attempts to read the vsyscall page behave as
expected.

If changes are made to the vsyscall code or its memory map handling,
running this test in all three of vsyscall=none, vsyscall=emulate,
and vsyscall=native are helpful.

(Because it's easy, this also compares the vsyscall results to their
 vDSO equivalents.)

Note to KAISER backporters: please test this under all three
vsyscall modes.  Also, in the emulate and native modes, make sure
that test_vsyscall_64 agrees with the command line or config
option as to which mode you're in.  It's quite easy to mess up
the kernel such that native mode accidentally emulates
or vice versa.

Greg, etc: please backport this to all your Meltdown-patched
kernels.  It'll help make sure the patches didn't regress
vsyscalls.

CSigned-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/2b9c5a174c1d60fd7774461d518aa75598b1d8fd.1515719552.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/Makefile        |   2 +-
 tools/testing/selftests/x86/test_vsyscall.c | 500 ++++++++++++++++++++
 2 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/test_vsyscall.c

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 7b1adeee4b0f..91fbfa8fdc15 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -7,7 +7,7 @@ include ../lib.mk
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
 			check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
-			protection_keys test_vdso
+			protection_keys test_vdso test_vsyscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
new file mode 100644
index 000000000000..7a744fa7b786
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -0,0 +1,500 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <errno.h>
+#include <err.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <setjmp.h>
+
+#ifdef __x86_64__
+# define VSYS(x) (x)
+#else
+# define VSYS(x) 0
+#endif
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+#  define SYS_getcpu 309
+# else
+#  define SYS_getcpu 318
+# endif
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+/* vsyscalls and vDSO */
+bool should_read_vsyscall = false;
+
+typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+gtod_t vdso_gtod;
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+vgettime_t vdso_gettime;
+
+typedef long (*time_func_t)(time_t *t);
+time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+time_func_t vdso_time;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+getcpu_t vdso_getcpu;
+
+static void init_vdso(void)
+{
+	void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso)
+		vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso) {
+		printf("[WARN]\tfailed to find vDSO\n");
+		return;
+	}
+
+	vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
+	if (!vdso_gtod)
+		printf("[WARN]\tfailed to find gettimeofday in vDSO\n");
+
+	vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+	if (!vdso_gettime)
+		printf("[WARN]\tfailed to find clock_gettime in vDSO\n");
+
+	vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
+	if (!vdso_time)
+		printf("[WARN]\tfailed to find time in vDSO\n");
+
+	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+	if (!vdso_getcpu) {
+		/* getcpu() was never wired up in the 32-bit vDSO. */
+		printf("[%s]\tfailed to find getcpu in vDSO\n",
+		       sizeof(long) == 8 ? "WARN" : "NOTE");
+	}
+}
+
+static int init_vsys(void)
+{
+#ifdef __x86_64__
+	int nerrs = 0;
+	FILE *maps;
+	char line[128];
+	bool found = false;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) {
+		printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
+		should_read_vsyscall = true;
+		return 0;
+	}
+
+	while (fgets(line, sizeof(line), maps)) {
+		char r, x;
+		void *start, *end;
+		char name[128];
+		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+			   &start, &end, &r, &x, name) != 5)
+			continue;
+
+		if (strcmp(name, "[vsyscall]"))
+			continue;
+
+		printf("\tvsyscall map: %s", line);
+
+		if (start != (void *)0xffffffffff600000 ||
+		    end != (void *)0xffffffffff601000) {
+			printf("[FAIL]\taddress range is nonsense\n");
+			nerrs++;
+		}
+
+		printf("\tvsyscall permissions are %c-%c\n", r, x);
+		should_read_vsyscall = (r == 'r');
+		if (x != 'x') {
+			vgtod = NULL;
+			vtime = NULL;
+			vgetcpu = NULL;
+		}
+
+		found = true;
+		break;
+	}
+
+	fclose(maps);
+
+	if (!found) {
+		printf("\tno vsyscall map in /proc/self/maps\n");
+		should_read_vsyscall = false;
+		vgtod = NULL;
+		vtime = NULL;
+		vgetcpu = NULL;
+	}
+
+	return nerrs;
+#else
+	return 0;
+#endif
+}
+
+/* syscalls */
+static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
+{
+	return syscall(SYS_gettimeofday, tv, tz);
+}
+
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+	return syscall(SYS_clock_gettime, id, ts);
+}
+
+static inline long sys_time(time_t *t)
+{
+	return syscall(SYS_time, t);
+}
+
+static inline long sys_getcpu(unsigned * cpu, unsigned * node,
+			      void* cache)
+{
+	return syscall(SYS_getcpu, cpu, node, cache);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+static double tv_diff(const struct timeval *a, const struct timeval *b)
+{
+	return (double)(a->tv_sec - b->tv_sec) +
+		(double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
+}
+
+static int check_gtod(const struct timeval *tv_sys1,
+		      const struct timeval *tv_sys2,
+		      const struct timezone *tz_sys,
+		      const char *which,
+		      const struct timeval *tv_other,
+		      const struct timezone *tz_other)
+{
+	int nerrs = 0;
+	double d1, d2;
+
+	if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
+		printf("[FAIL] %s tz mismatch\n", which);
+		nerrs++;
+	}
+
+	d1 = tv_diff(tv_other, tv_sys1);
+	d2 = tv_diff(tv_sys2, tv_other); 
+	printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
+
+	if (d1 < 0 || d2 < 0) {
+		printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
+		nerrs++;
+	} else {
+		printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
+	}
+
+	return nerrs;
+}
+
+static int test_gtod(void)
+{
+	struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
+	struct timezone tz_sys, tz_vdso, tz_vsys;
+	long ret_vdso = -1;
+	long ret_vsys = -1;
+	int nerrs = 0;
+
+	printf("[RUN]\ttest gettimeofday()\n");
+
+	if (sys_gtod(&tv_sys1, &tz_sys) != 0)
+		err(1, "syscall gettimeofday");
+	if (vdso_gtod)
+		ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
+	if (vgtod)
+		ret_vsys = vgtod(&tv_vsys, &tz_vsys);
+	if (sys_gtod(&tv_sys2, &tz_sys) != 0)
+		err(1, "syscall gettimeofday");
+
+	if (vdso_gtod) {
+		if (ret_vdso == 0) {
+			nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
+		} else {
+			printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
+			nerrs++;
+		}
+	}
+
+	if (vgtod) {
+		if (ret_vsys == 0) {
+			nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
+		} else {
+			printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
+			nerrs++;
+		}
+	}
+
+	return nerrs;
+}
+
+static int test_time(void) {
+	int nerrs = 0;
+
+	printf("[RUN]\ttest time()\n");
+	long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
+	long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
+	t_sys1 = sys_time(&t2_sys1);
+	if (vdso_time)
+		t_vdso = vdso_time(&t2_vdso);
+	if (vtime)
+		t_vsys = vtime(&t2_vsys);
+	t_sys2 = sys_time(&t2_sys2);
+	if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
+		printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
+		nerrs++;
+		return nerrs;
+	}
+
+	if (vdso_time) {
+		if (t_vdso < 0 || t_vdso != t2_vdso) {
+			printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
+			nerrs++;
+		} else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
+			printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
+			nerrs++;
+		} else {
+			printf("[OK]\tvDSO time() is okay\n");
+		}
+	}
+
+	if (vtime) {
+		if (t_vsys < 0 || t_vsys != t2_vsys) {
+			printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
+			nerrs++;
+		} else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
+			printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
+			nerrs++;
+		} else {
+			printf("[OK]\tvsyscall time() is okay\n");
+		}
+	}
+
+	return nerrs;
+}
+
+static int test_getcpu(int cpu)
+{
+	int nerrs = 0;
+	long ret_sys, ret_vdso = -1, ret_vsys = -1;
+
+	printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
+
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tfailed to force CPU %d\n", cpu);
+		return nerrs;
+	}
+
+	unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
+	unsigned node = 0;
+	bool have_node = false;
+	ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+	if (vdso_getcpu)
+		ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+	if (vgetcpu)
+		ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+	if (ret_sys == 0) {
+		if (cpu_sys != cpu) {
+			printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
+			nerrs++;
+		}
+
+		have_node = true;
+		node = node_sys;
+	}
+
+	if (vdso_getcpu) {
+		if (ret_vdso) {
+			printf("[FAIL]\tvDSO getcpu() failed\n");
+			nerrs++;
+		} else {
+			if (!have_node) {
+				have_node = true;
+				node = node_vdso;
+			}
+
+			if (cpu_vdso != cpu) {
+				printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
+				nerrs++;
+			} else {
+				printf("[OK]\tvDSO reported correct CPU\n");
+			}
+
+			if (node_vdso != node) {
+				printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
+				nerrs++;
+			} else {
+				printf("[OK]\tvDSO reported correct node\n");
+			}
+		}
+	}
+
+	if (vgetcpu) {
+		if (ret_vsys) {
+			printf("[FAIL]\tvsyscall getcpu() failed\n");
+			nerrs++;
+		} else {
+			if (!have_node) {
+				have_node = true;
+				node = node_vsys;
+			}
+
+			if (cpu_vsys != cpu) {
+				printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
+				nerrs++;
+			} else {
+				printf("[OK]\tvsyscall reported correct CPU\n");
+			}
+
+			if (node_vsys != node) {
+				printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
+				nerrs++;
+			} else {
+				printf("[OK]\tvsyscall reported correct node\n");
+			}
+		}
+	}
+
+	return nerrs;
+}
+
+static int test_vsys_r(void)
+{
+#ifdef __x86_64__
+	printf("[RUN]\tChecking read access to the vsyscall page\n");
+	bool can_read;
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		*(volatile int *)0xffffffffff600000;
+		can_read = true;
+	} else {
+		can_read = false;
+	}
+
+	if (can_read && !should_read_vsyscall) {
+		printf("[FAIL]\tWe have read access, but we shouldn't\n");
+		return 1;
+	} else if (!can_read && should_read_vsyscall) {
+		printf("[FAIL]\tWe don't have read access, but we should\n");
+		return 1;
+	} else {
+		printf("[OK]\tgot expected result\n");
+	}
+#endif
+
+	return 0;
+}
+
+
+#ifdef __x86_64__
+#define X86_EFLAGS_TF (1UL << 8)
+static volatile sig_atomic_t num_vsyscall_traps;
+
+static unsigned long get_eflags(void)
+{
+	unsigned long eflags;
+	asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags));
+	return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+	asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags");
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+	unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP];
+
+	if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0)
+		num_vsyscall_traps++;
+}
+
+static int test_native_vsyscall(void)
+{
+	time_t tmp;
+	bool is_native;
+
+	if (!vtime)
+		return 0;
+
+	printf("[RUN]\tchecking for native vsyscall\n");
+	sethandler(SIGTRAP, sigtrap, 0);
+	set_eflags(get_eflags() | X86_EFLAGS_TF);
+	vtime(&tmp);
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+	/*
+	 * If vsyscalls are emulated, we expect a single trap in the
+	 * vsyscall page -- the call instruction will trap with RIP
+	 * pointing to the entry point before emulation takes over.
+	 * In native mode, we expect two traps, since whatever code
+	 * the vsyscall page contains will be more than just a ret
+	 * instruction.
+	 */
+	is_native = (num_vsyscall_traps > 1);
+
+	printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n",
+	       (is_native ? "native" : "emulated"),
+	       (int)num_vsyscall_traps);
+
+	return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	int nerrs = 0;
+
+	init_vdso();
+	nerrs += init_vsys();
+
+	nerrs += test_gtod();
+	nerrs += test_time();
+	nerrs += test_getcpu(0);
+	nerrs += test_getcpu(1);
+
+	sethandler(SIGSEGV, sigsegv, 0);
+	nerrs += test_vsys_r();
+
+#ifdef __x86_64__
+	nerrs += test_native_vsyscall();
+#endif
+
+	return nerrs ? 1 : 0;
+}

From 36c1681678b507346e7397a235a7303dad665fc3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 11 Jan 2018 18:28:08 +0900
Subject: [PATCH 799/808] genksyms: drop *.hash.c from .gitignore

This is a left-over of commit bb3290d91695 ("Remove gperf usage from
toolchain").

We do not generate a hash function any more.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/genksyms/.gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/genksyms/.gitignore b/scripts/genksyms/.gitignore
index 86dc07a01b43..e7836b47f060 100644
--- a/scripts/genksyms/.gitignore
+++ b/scripts/genksyms/.gitignore
@@ -1,4 +1,3 @@
-*.hash.c
 *.lex.c
 *.tab.c
 *.tab.h

From bed6760cf2c40778a58f2e399c8947b3b3c55518 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 12 Jan 2018 16:53:07 -0800
Subject: [PATCH 800/808] MAINTAINERS, nilfs2: change project home URLs

The domain of NILFS project home was changed to "nilfs.sourceforge.io"
to enable https access (the previous domain "nilfs.sourceforge.net" is
redirected to the new one).  Modify URLs of the project home to reflect
this change and to replace their protocol from http to https.

Link: http://lkml.kernel.org/r/1515416141-5614-1-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/nilfs2.txt | 4 ++--
 MAINTAINERS                          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index c0727dc36271..f2f3f8592a6f 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -25,8 +25,8 @@ available from the following download page.  At least "mkfs.nilfs2",
 cleaner or garbage collector) are required.  Details on the tools are
 described in the man pages included in the package.
 
-Project web page:    http://nilfs.sourceforge.net/
-Download page:       http://nilfs.sourceforge.net/en/download.html
+Project web page:    https://nilfs.sourceforge.io/
+Download page:       https://nilfs.sourceforge.io/en/download.html
 List info:           http://vger.kernel.org/vger-lists.html#linux-nilfs
 
 Caveats
diff --git a/MAINTAINERS b/MAINTAINERS
index d76af75a653a..18994806e441 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9638,8 +9638,8 @@ F:	include/uapi/linux/sunrpc/
 NILFS2 FILESYSTEM
 M:	Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
 L:	linux-nilfs@vger.kernel.org
-W:	http://nilfs.sourceforge.net/
-W:	http://nilfs.osdn.jp/
+W:	https://nilfs.sourceforge.io/
+W:	https://nilfs.osdn.jp/
 T:	git git://github.com/konis/nilfs2.git
 S:	Supported
 F:	Documentation/filesystems/nilfs2.txt

From d9570ee3bd1d4f20ce63485f5ef05663866fe6c0 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Fri, 12 Jan 2018 16:53:10 -0800
Subject: [PATCH 801/808] kmemleak: allow to coexist with fault injection

kmemleak does one slab allocation per user allocation.  So if slab fault
injection is enabled to any degree, kmemleak instantly fails to allocate
and turns itself off.  However, it's useful to use kmemleak with fault
injection to find leaks on error paths.  On the other hand, checking
kmemleak itself is not so useful because (1) it's a debugging tool and
(2) it has a very regular allocation pattern (basically a single
allocation site, so it either works or not).

Turn off fault injection for kmemleak allocations.

Link: http://lkml.kernel.org/r/20180109192243.19316-1-dvyukov@google.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kmemleak.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d73c14294f3a..f656ca27f6c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -127,7 +127,7 @@
 /* GFP bitmask for kmemleak internal allocations */
 #define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
 				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
-				 __GFP_NOWARN)
+				 __GFP_NOWARN | __GFP_NOFAIL)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {

From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 12 Jan 2018 16:53:14 -0800
Subject: [PATCH 802/808] kdump: write correct address of mem_section into
 vmcoreinfo

Depending on configuration mem_section can now be an array or a pointer
to an array allocated dynamically.  In most cases, we can continue to
refer to it as 'mem_section' regardless of what it is.

But there's one exception: '&mem_section' means "address of the array"
if mem_section is an array, but if mem_section is a pointer, it would
mean "address of the pointer".

We've stepped onto this in kdump code.  VMCOREINFO_SYMBOL(mem_section)
writes down address of pointer into vmcoreinfo, not array as we wanted.

Let's introduce VMCOREINFO_SYMBOL_ARRAY() that would handle the
situation correctly for both cases.

Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y")
Acked-by: Baoquan He <bhe@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_core.h | 2 ++
 kernel/crash_core.c        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 06097ef30449..b511f6d24b42 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -42,6 +42,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 	vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
 #define VMCOREINFO_SYMBOL(name) \
 	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ARRAY(name) \
+	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)
 #define VMCOREINFO_SIZE(name) \
 	vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
 			      (unsigned long)sizeof(name))
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(contig_page_data);
 #endif
 #ifdef CONFIG_SPARSEMEM
-	VMCOREINFO_SYMBOL(mem_section);
+	VMCOREINFO_SYMBOL_ARRAY(mem_section);
 	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
 	VMCOREINFO_STRUCT_SIZE(mem_section);
 	VMCOREINFO_OFFSET(mem_section, section_mem_map);

From 0f908ccbeca99ddf0ad60afa710e72aded4a5ea7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 12 Jan 2018 16:53:17 -0800
Subject: [PATCH 803/808] tools/objtool/Makefile: don't assume sync-check.sh is
 executable

patch(1) loses the x bit.  So if a user follows our patching
instructions in Documentation/admin-guide/README.rst, their kernel will
not compile.

Fixes: 3bd51c5a371de ("objtool: Move kernel headers/code sync check to a script")
Reported-by: Nicolas Bock <nicolasbock@gentoo.org>
Reported-by Joakim Tjernlund <Joakim.Tjernlund@infinera.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/objtool/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index ae0272f9a091..e6acc281dd37 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -46,7 +46,7 @@ $(OBJTOOL_IN): fixdep FORCE
 	@$(MAKE) $(build)=objtool
 
 $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
-	@./sync-check.sh
+	@$(CONFIG_SHELL) ./sync-check.sh
 	$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
 
 
From f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 14 Jan 2018 00:23:57 +0100
Subject: [PATCH 804/808] x86/pti: Fix !PCID and sanitize defines

The switch to the user space page tables in the low level ASM code sets
unconditionally bit 12 and bit 11 of CR3. Bit 12 is switching the base
address of the page directory to the user part, bit 11 is switching the
PCID to the PCID associated with the user page tables.

This fails on a machine which lacks PCID support because bit 11 is set in
CR3. Bit 11 is reserved when PCID is inactive.

While the Intel SDM claims that the reserved bits are ignored when PCID is
disabled, the AMD APM states that they should be cleared.

This went unnoticed as the AMD APM was not checked when the code was
developed and reviewed and test systems with Intel CPUs never failed to
boot. The report is against a Centos 6 host where the guest fails to boot,
so it's not yet clear whether this is a virt issue or can happen on real
hardware too, but thats irrelevant as the AMD APM clearly ask for clearing
the reserved bits.

Make sure that on non PCID machines bit 11 is not set by the page table
switching code.

Andy suggested to rename the related bits and masks so they are clearly
describing what they should be used for, which is done as well for clarity.

That split could have been done with alternatives but the macro hell is
horrible and ugly. This can be done on top if someone cares to remove the
extra orq. For now it's a straight forward fix.

Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")
Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable <stable@vger.kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos
---
 arch/x86/entry/calling.h               | 36 ++++++++++++++------------
 arch/x86/include/asm/processor-flags.h |  2 +-
 arch/x86/include/asm/tlbflush.h        |  6 ++---
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e00a6af..3f48f695d5e6 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with
  * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
  * halves:
  */
-#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
-#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK  (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
 
 .macro SET_NOFLUSH_BIT	reg:req
 	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
@@ -208,7 +211,7 @@ For 32-bit we have the following conventions - kernel is built with
 .macro ADJUST_KERNEL_CR3 reg:req
 	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
 	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq    $(~PTI_SWITCH_MASK), \reg
+	andq    $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
 .endm
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -239,15 +242,19 @@ For 32-bit we have the following conventions - kernel is built with
 	/* Flush needed, clear the bit */
 	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
 	movq	\scratch_reg2, \scratch_reg
-	jmp	.Lwrcr3_\@
+	jmp	.Lwrcr3_pcid_\@
 
 .Lnoflush_\@:
 	movq	\scratch_reg2, \scratch_reg
 	SET_NOFLUSH_BIT \scratch_reg
 
+.Lwrcr3_pcid_\@:
+	/* Flip the ASID to the user version */
+	orq	$(PTI_USER_PCID_MASK), \scratch_reg
+
 .Lwrcr3_\@:
-	/* Flip the PGD and ASID to the user version */
-	orq     $(PTI_SWITCH_MASK), \scratch_reg
+	/* Flip the PGD to the user version */
+	orq     $(PTI_USER_PGTABLE_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .endm
@@ -263,17 +270,12 @@ For 32-bit we have the following conventions - kernel is built with
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	/*
-	 * Is the "switch mask" all zero?  That means that both of
-	 * these are zero:
-	 *
-	 *	1. The user/kernel PCID bit, and
-	 *	2. The user/kernel "bit" that points CR3 to the
-	 *	   bottom half of the 8k PGD
-	 *
-	 * That indicates a kernel CR3 value, not a user CR3.
+	 * Test the user pagetable bit. If set, then the user page tables
+	 * are active. If clear CR3 already has the kernel page table
+	 * active.
 	 */
-	testq	$(PTI_SWITCH_MASK), \scratch_reg
-	jz	.Ldone_\@
+	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg
+	jnc	.Ldone_\@
 
 	ADJUST_KERNEL_CR3 \scratch_reg
 	movq	\scratch_reg, %cr3
@@ -290,7 +292,7 @@ For 32-bit we have the following conventions - kernel is built with
 	 * KERNEL pages can always resume with NOFLUSH as we do
 	 * explicit flushes.
 	 */
-	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	bt	$PTI_USER_PGTABLE_BIT, \save_reg
 	jnc	.Lnoflush_\@
 
 	/*
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 6a60fea90b9d..625a52a5594f 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -40,7 +40,7 @@
 #define CR3_NOFLUSH	BIT_ULL(63)
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define X86_CR3_PTI_SWITCH_BIT	11
+# define X86_CR3_PTI_PCID_USER_BIT	11
 #endif
 
 #else
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index f9b48ce152eb..3effd3c994af 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -81,13 +81,13 @@ static inline u16 kern_pcid(u16 asid)
 	 * Make sure that the dynamic ASID space does not confict with the
 	 * bit we are using to switch between user and kernel ASIDs.
 	 */
-	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
 
 	/*
 	 * The ASID being passed in here should have respected the
 	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
 	 */
-	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
 #endif
 	/*
 	 * The dynamically-assigned ASIDs that get passed in are small
@@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid)
 {
 	u16 ret = kern_pcid(asid);
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
-	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
 #endif
 	return ret;
 }

From a237f762681e2a394ca67f21df2feb2b76a3609b Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@tremily.us>
Date: Fri, 12 Jan 2018 15:24:59 -0800
Subject: [PATCH 805/808] security/Kconfig: Correct the Documentation reference
 for PTI

When the config option for PTI was added a reference to documentation was
added as well. But the documentation did not exist at that point. The final
documentation has a different file name.

Fix it up to point to the proper file.

Fixes: 385ce0ea ("x86/mm/pti: Add Kconfig")
Signed-off-by: W. Trevor King <wking@tremily.us>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-mm@kvack.org
Cc: linux-security-module@vger.kernel.org
Cc: James Morris <james.l.morris@oracle.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/3009cc8ccbddcd897ec1e0cb6dda524929de0d14.1515799398.git.wking@tremily.us
---
 security/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/Kconfig b/security/Kconfig
index 3d4debd0257e..b0cb9a5f9448 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -63,7 +63,7 @@ config PAGE_TABLE_ISOLATION
 	  ensuring that the majority of kernel addresses are not mapped
 	  into userspace.
 
-	  See Documentation/x86/pagetable-isolation.txt for more details.
+	  See Documentation/x86/pti.txt for more details.
 
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"

From 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 14 Jan 2018 11:27:13 +0100
Subject: [PATCH 806/808] x86,perf: Disable intel_bts when PTI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The intel_bts driver does not use the 'normal' BTS buffer which is exposed
through the cpu_entry_area but instead uses the memory allocated for the
perf AUX buffer.

This obviously comes apart when using PTI because then the kernel mapping;
which includes that AUX buffer memory; disappears. Fixing this requires to
expose a mapping which is visible in all context and that's not trivial.

As a quick fix disable this driver when PTI is enabled to prevent
malfunction.

Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig")
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Reported-by: Robert Święcki <robert@swiecki.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: greg@kroah.com
Cc: hughd@google.com
Cc: luto@amacapital.net
Cc: Vince Weaver <vince@deater.net>
Cc: torvalds@linux-foundation.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net
---
 arch/x86/events/intel/bts.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 141e07b06216..24ffa1e88cf9 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -582,6 +582,24 @@ static __init int bts_init(void)
 	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
 		return -ENODEV;
 
+	if (boot_cpu_has(X86_FEATURE_PTI)) {
+		/*
+		 * BTS hardware writes through a virtual memory map we must
+		 * either use the kernel physical map, or the user mapping of
+		 * the AUX buffer.
+		 *
+		 * However, since this driver supports per-CPU and per-task inherit
+		 * we cannot use the user mapping since it will not be availble
+		 * if we're not running the owning process.
+		 *
+		 * With PTI we can't use the kernal map either, because its not
+		 * there when we run userspace.
+		 *
+		 * For now, disable this driver when using PTI.
+		 */
+		return -ENODEV;
+	}
+
 	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
 				  PERF_PMU_CAP_EXCLUSIVE;
 	bts_pmu.task_ctx_nr	= perf_sw_context;

From b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 14 Jan 2018 22:13:29 +0100
Subject: [PATCH 807/808] x86/retpoline: Remove compile time warning

Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler
does not have retpoline support. Linus rationale for this is:

  It's wrong because it will just make people turn off RETPOLINE, and the
  asm updates - and return stack clearing - that are independent of the
  compiler are likely the most important parts because they are likely the
  ones easiest to target.

  And it's annoying because most people won't be able to do anything about
  it. The number of people building their own compiler? Very small. So if
  their distro hasn't got a compiler yet (and pretty much nobody does), the
  warning is just annoying crap.

  It is already properly reported as part of the sysfs interface. The
  compile-time warning only encourages bad things.

Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support")
Requested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: thomas.lendacky@amd.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kees Cook <keescook@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org>
Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com
---
 arch/x86/Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 974c61864978..504b1a4535ac 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE
     RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
     ifneq ($(RETPOLINE_CFLAGS),)
         KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
-    else
-        $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.)
     endif
 endif
 

From a8750ddca918032d6349adbf9a4b6555e7db20da Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Jan 2018 15:32:30 -0800
Subject: [PATCH 808/808] Linux 4.15-rc8

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index c4aa6210a2a4..bf5b8cbb9469 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION = -rc8
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*