From 9f591ae60e1be026901398ef99eede91237aa3a1 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Wed, 21 Mar 2018 15:08:47 +0100
Subject: [PATCH 001/288] drm/i915/gvt: throw error on unhandled vfio ioctls

On unknown/unhandled ioctls the driver should return an error, so
userspace knows it tried to use something unsupported.

Cc: stable@vger.kernel.org
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 520fe3d0a882..f8540cc67b44 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1254,7 +1254,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
 
 	}
 
-	return 0;
+	return -ENOTTY;
 }
 
 static ssize_t

From ac0fd9cfc837d1d30cb958dc34769e90aad43078 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 22 Mar 2018 12:27:54 -0500
Subject: [PATCH 002/288] drm/i915/gvt: Mark expected switch fall-through in
 handle_g2v_notification

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Addresses-Coverity-ID: 1466154 ("Missing break in switch")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/handlers.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c
index 8c5d5d005854..a33c1c3e4a21 100644
--- a/drivers/gpu/drm/i915/gvt/handlers.c
+++ b/drivers/gpu/drm/i915/gvt/handlers.c
@@ -1150,6 +1150,7 @@ static int handle_g2v_notification(struct intel_vgpu *vgpu, int notification)
 	switch (notification) {
 	case VGT_G2V_PPGTT_L3_PAGE_TABLE_CREATE:
 		root_entry_type = GTT_TYPE_PPGTT_ROOT_L3_ENTRY;
+		/* fall through */
 	case VGT_G2V_PPGTT_L4_PAGE_TABLE_CREATE:
 		mm = intel_vgpu_get_ppgtt_mm(vgpu, root_entry_type, pdps);
 		return PTR_ERR_OR_ZERO(mm);

From 5da795b061a139b55d7393e79c7af688c58a4267 Mon Sep 17 00:00:00 2001
From: Zhipeng Gong <zhipeng.gong@intel.com>
Date: Mon, 26 Mar 2018 15:18:56 +0800
Subject: [PATCH 003/288] drm/i915/gvt: Make MI_USER_INTERRUPT nop in cmd
 parser

GVT-g dispatches request to host i915 and depends on i915 notify
ring interrupt mechanism to check completion of request.
For now MI_USER_INTERRUPT in guest requests is passed through
in GVT-g cmd parser and i915 does not use it, which causes
unnecessary interrupt handling in i915.
On the other hand, if several requests from guest are combined into
one request in and contain MI_USER_INTERRUPT in the middle of
combined request. GVT-g still has to wait on the whole request to
complete to inject user interrupts to guest.

This patch makes all the MI_USER_INTERRUPT nop to save some interrupt
handling.

Here is test result to run glmark2 on guest for 10 seconds:
host master interrupts number is reduced from 16021 to 11162
host user interrupts number is reduced from 7936 to 3536

v2:
- revise commit message. (Kevin)

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Zhipeng Gong <zhipeng.gong@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/cmd_parser.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index c8454ac43fae..d301a204e981 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -1079,6 +1079,7 @@ static int cmd_handler_mi_user_interrupt(struct parser_exec_state *s)
 {
 	set_bit(cmd_interrupt_events[s->ring_id].mi_user_interrupt,
 			s->workload->pending_events);
+	patch_value(s, cmd_ptr(s, 0), MI_NOOP);
 	return 0;
 }
 

From ab13a9218c9883d1f51940b9e720c63ef54a2c07 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 18 Jan 2018 18:40:25 -0800
Subject: [PATCH 004/288] ecryptfs: lookup: Don't check if mount_crypt_stat is
 NULL

mount_crypt_stat is assigned to
&ecryptfs_superblock_to_private(ecryptfs_dentry->d_sb)->mount_crypt_stat,
and mount_crypt_stat is not the first object in struct ecryptfs_sb_info.
mount_crypt_stat is therefore never NULL. At the same time, no crash
in ecryptfs_lookup() has been reported, and the lookup functions in
other file systems don't check if d_sb is NULL either.
Given that, remove the NULL check.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 847904aa63a9..97d17eaeba07 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -395,8 +395,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 				ecryptfs_dentry->d_sb)->mount_crypt_stat;
-	if (mount_crypt_stat
-	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
 		rc = ecryptfs_encrypt_and_encode_filename(
 			&encrypted_and_encoded_name, &len,
 			mount_crypt_stat, name, len);

From f62fd7a77717350e850f3c4a5373fe8e64871025 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 2 Mar 2018 09:07:08 +0000
Subject: [PATCH 005/288] ecryptfs: fix spelling mistake: "cadidate" ->
 "candidate"

Trivial fix to spelling mistake in debug message text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/keystore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c89a58cfc991..e74fe84d0886 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1880,7 +1880,7 @@ find_next_matching_auth_tok:
 		candidate_auth_tok = &auth_tok_list_item->auth_tok;
 		if (unlikely(ecryptfs_verbosity > 0)) {
 			ecryptfs_printk(KERN_DEBUG,
-					"Considering cadidate auth tok:\n");
+					"Considering candidate auth tok:\n");
 			ecryptfs_dump_auth_tok(candidate_auth_tok);
 		}
 		rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig,

From 7598e8700e9a507496660219924ca8c5aa3088d6 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@intel.com>
Date: Tue, 27 Mar 2018 15:35:14 +0800
Subject: [PATCH 006/288] drm/i915/gvt: Missed to cancel dma map for ggtt
 entries

We have canceled dma map for ppgtt entries. Also we need to do it for
ggtt entries when them are invalidated.

This can fix task hung issue as:
[13517.791767] INFO: task gvt_service_thr:1081 blocked for more than 120 seconds.
[13517.792584] Not tainted 4.14.15+ #3
[13517.793417] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[13517.794267] gvt_service_thr D 0 1081 2 0x80000000
[13517.795132] Call Trace:
[13517.795996] ? __schedule+0x493/0x77b
[13517.796859] schedule+0x79/0x82
[13517.797740] schedule_preempt_disabled+0x5/0x6
[13517.798614] __mutex_lock.isra.0+0x2b5/0x445
[13517.799504] ? __switch_to_asm+0x24/0x60
[13517.800381] ? intel_gvt_cleanup+0x10/0x10
[13517.801261] ? intel_gvt_schedule+0x19/0x2b9
[13517.802107] intel_gvt_schedule+0x19/0x2b9
[13517.802954] ? intel_gvt_cleanup+0x10/0x10
[13517.803824] gvt_service_thread+0xe3/0x10d
[13517.804704] ? wait_woken+0x68/0x68
[13517.805588] kthread+0x118/0x120
[13517.806478] ? kthread_create_on_node+0x3a/0x3a
[13517.807381] ? call_usermodehelper_exec_async+0x113/0x11a
[13517.808307] ret_from_fork+0x35/0x40

v3: split out ggtt reset case.
v2: also unmap ggtt during reset.

Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/gtt.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index d29281231507..8c9477c272be 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -530,6 +530,16 @@ static void ggtt_set_guest_entry(struct intel_vgpu_mm *mm,
 			   false, 0, mm->vgpu);
 }
 
+static void ggtt_get_host_entry(struct intel_vgpu_mm *mm,
+		struct intel_gvt_gtt_entry *entry, unsigned long index)
+{
+	struct intel_gvt_gtt_pte_ops *pte_ops = mm->vgpu->gvt->gtt.pte_ops;
+
+	GEM_BUG_ON(mm->type != INTEL_GVT_MM_GGTT);
+
+	pte_ops->get_entry(NULL, entry, index, false, 0, mm->vgpu);
+}
+
 static void ggtt_set_host_entry(struct intel_vgpu_mm *mm,
 		struct intel_gvt_gtt_entry *entry, unsigned long index)
 {
@@ -1818,6 +1828,18 @@ int intel_vgpu_emulate_ggtt_mmio_read(struct intel_vgpu *vgpu, unsigned int off,
 	return ret;
 }
 
+static void ggtt_invalidate_pte(struct intel_vgpu *vgpu,
+		struct intel_gvt_gtt_entry *entry)
+{
+	struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
+	unsigned long pfn;
+
+	pfn = pte_ops->get_pfn(entry);
+	if (pfn != vgpu->gvt->gtt.scratch_mfn)
+		intel_gvt_hypervisor_dma_unmap_guest_page(vgpu,
+						pfn << PAGE_SHIFT);
+}
+
 static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
 	void *p_data, unsigned int bytes)
 {
@@ -1844,10 +1866,10 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
 
 	memcpy((void *)&e.val64 + (off & (info->gtt_entry_size - 1)), p_data,
 			bytes);
-	m = e;
 
 	if (ops->test_present(&e)) {
 		gfn = ops->get_pfn(&e);
+		m = e;
 
 		/* one PTE update may be issued in multiple writes and the
 		 * first write may not construct a valid gfn
@@ -1868,8 +1890,12 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
 			ops->set_pfn(&m, gvt->gtt.scratch_mfn);
 		} else
 			ops->set_pfn(&m, dma_addr >> PAGE_SHIFT);
-	} else
+	} else {
+		ggtt_get_host_entry(ggtt_mm, &m, g_gtt_index);
+		ggtt_invalidate_pte(vgpu, &m);
 		ops->set_pfn(&m, gvt->gtt.scratch_mfn);
+		ops->clear_present(&m);
+	}
 
 out:
 	ggtt_set_host_entry(ggtt_mm, &m, g_gtt_index);

From f4c43db356198d8091442f593871b0beb5c139b2 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@intel.com>
Date: Tue, 27 Mar 2018 15:35:15 +0800
Subject: [PATCH 007/288] drm/i915/gvt: Cancel dma map when resetting ggtt
 entries

Ditto, don't forget ggtt entries during reset.

Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/gtt.c | 22 +++++++++++++++++-----
 drivers/gpu/drm/i915/gvt/gtt.h |  2 +-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 8c9477c272be..78e55aafc8bc 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -2056,7 +2056,7 @@ int intel_vgpu_init_gtt(struct intel_vgpu *vgpu)
 		return PTR_ERR(gtt->ggtt_mm);
 	}
 
-	intel_vgpu_reset_ggtt(vgpu);
+	intel_vgpu_reset_ggtt(vgpu, false);
 
 	return create_scratch_page_tree(vgpu);
 }
@@ -2341,17 +2341,19 @@ void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu)
 /**
  * intel_vgpu_reset_ggtt - reset the GGTT entry
  * @vgpu: a vGPU
+ * @invalidate_old: invalidate old entries
  *
  * This function is called at the vGPU create stage
  * to reset all the GGTT entries.
  *
  */
-void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu)
+void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old)
 {
 	struct intel_gvt *gvt = vgpu->gvt;
 	struct drm_i915_private *dev_priv = gvt->dev_priv;
 	struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
 	struct intel_gvt_gtt_entry entry = {.type = GTT_TYPE_GGTT_PTE};
+	struct intel_gvt_gtt_entry old_entry;
 	u32 index;
 	u32 num_entries;
 
@@ -2360,13 +2362,23 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu)
 
 	index = vgpu_aperture_gmadr_base(vgpu) >> PAGE_SHIFT;
 	num_entries = vgpu_aperture_sz(vgpu) >> PAGE_SHIFT;
-	while (num_entries--)
+	while (num_entries--) {
+		if (invalidate_old) {
+			ggtt_get_host_entry(vgpu->gtt.ggtt_mm, &old_entry, index);
+			ggtt_invalidate_pte(vgpu, &old_entry);
+		}
 		ggtt_set_host_entry(vgpu->gtt.ggtt_mm, &entry, index++);
+	}
 
 	index = vgpu_hidden_gmadr_base(vgpu) >> PAGE_SHIFT;
 	num_entries = vgpu_hidden_sz(vgpu) >> PAGE_SHIFT;
-	while (num_entries--)
+	while (num_entries--) {
+		if (invalidate_old) {
+			ggtt_get_host_entry(vgpu->gtt.ggtt_mm, &old_entry, index);
+			ggtt_invalidate_pte(vgpu, &old_entry);
+		}
 		ggtt_set_host_entry(vgpu->gtt.ggtt_mm, &entry, index++);
+	}
 
 	ggtt_invalidate(dev_priv);
 }
@@ -2386,5 +2398,5 @@ void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu)
 	 * removing the shadow pages.
 	 */
 	intel_vgpu_destroy_all_ppgtt_mm(vgpu);
-	intel_vgpu_reset_ggtt(vgpu);
+	intel_vgpu_reset_ggtt(vgpu, true);
 }
diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h
index a8b369cd352b..3792f2b7f4ff 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.h
+++ b/drivers/gpu/drm/i915/gvt/gtt.h
@@ -193,7 +193,7 @@ struct intel_vgpu_gtt {
 
 extern int intel_vgpu_init_gtt(struct intel_vgpu *vgpu);
 extern void intel_vgpu_clean_gtt(struct intel_vgpu *vgpu);
-void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu);
+void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old);
 void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu);
 
 extern int intel_gvt_init_gtt(struct intel_gvt *gvt);

From a733390f9a79876635013e57da25f91b6e78ffe6 Mon Sep 17 00:00:00 2001
From: Xiong Zhang <xiong.y.zhang@intel.com>
Date: Wed, 28 Mar 2018 05:30:13 +0800
Subject: [PATCH 008/288] drm/i915/gvt: Delete redundant error message in
 fb_decode.c

Much error message exist in host dmesg when guest boot up with local
display enabled.
[  167.680011] gvt: vgpu 1: invalid range gmadr 0x0 size 0x0
[  167.680013] gvt: vgpu 1: invalid gma address: 0

The second error line duplicate with the first error line, so this
patch remove this redundant error message and make the next error
message much clearer.

Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/fb_decoder.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/fb_decoder.c b/drivers/gpu/drm/i915/gvt/fb_decoder.c
index 6b50fe78dc1b..1c120683e958 100644
--- a/drivers/gpu/drm/i915/gvt/fb_decoder.c
+++ b/drivers/gpu/drm/i915/gvt/fb_decoder.c
@@ -245,16 +245,13 @@ int intel_vgpu_decode_primary_plane(struct intel_vgpu *vgpu,
 	plane->hw_format = fmt;
 
 	plane->base = vgpu_vreg_t(vgpu, DSPSURF(pipe)) & I915_GTT_PAGE_MASK;
-	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-			     (unsigned long)plane->base);
+	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0))
 		return  -EINVAL;
-	}
 
 	plane->base_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm, plane->base);
 	if (plane->base_gpa == INTEL_GVT_INVALID_ADDR) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-				(unsigned long)plane->base);
+		gvt_vgpu_err("Translate primary plane gma 0x%x to gpa fail\n",
+				plane->base);
 		return  -EINVAL;
 	}
 
@@ -371,16 +368,13 @@ int intel_vgpu_decode_cursor_plane(struct intel_vgpu *vgpu,
 			alpha_plane, alpha_force);
 
 	plane->base = vgpu_vreg_t(vgpu, CURBASE(pipe)) & I915_GTT_PAGE_MASK;
-	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-			     (unsigned long)plane->base);
+	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0))
 		return  -EINVAL;
-	}
 
 	plane->base_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm, plane->base);
 	if (plane->base_gpa == INTEL_GVT_INVALID_ADDR) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-				(unsigned long)plane->base);
+		gvt_vgpu_err("Translate cursor plane gma 0x%x to gpa fail\n",
+				plane->base);
 		return  -EINVAL;
 	}
 
@@ -476,16 +470,13 @@ int intel_vgpu_decode_sprite_plane(struct intel_vgpu *vgpu,
 	plane->drm_format = drm_format;
 
 	plane->base = vgpu_vreg_t(vgpu, SPRSURF(pipe)) & I915_GTT_PAGE_MASK;
-	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-			     (unsigned long)plane->base);
+	if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0))
 		return  -EINVAL;
-	}
 
 	plane->base_gpa = intel_vgpu_gma_to_gpa(vgpu->gtt.ggtt_mm, plane->base);
 	if (plane->base_gpa == INTEL_GVT_INVALID_ADDR) {
-		gvt_vgpu_err("invalid gma address: %lx\n",
-				(unsigned long)plane->base);
+		gvt_vgpu_err("Translate sprite plane gma 0x%x to gpa fail\n",
+				plane->base);
 		return  -EINVAL;
 	}
 

From 65eff272330c72689fb5e20dd6491826fd87a39c Mon Sep 17 00:00:00 2001
From: Xiong Zhang <xiong.y.zhang@intel.com>
Date: Wed, 28 Mar 2018 05:30:14 +0800
Subject: [PATCH 009/288] drm/i915/gvt: Disable primary/sprite/cursor plane at
 virtual display initialization

Much error exist in host dmesg during guest boot up with loca display
enabled.
gvt: vgpu 1: invalid range gmadr 0x0 size 0x0

This error happens when qemu get dmabuf info in case that the virtual
display plane is enabled but its base address is an invalid 0, such
case may be true before guest enable its plane. At this moment, its
state is copied from host where the plane may be enabled.

This patch disable primary/sprite/cursor plane at virtual display
initialization, so intel_vgpu_decode_primary/cursor/sprite could
return early as plane is disabled, then plane base check is skipped and
error message disapper.

Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/display.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c
index dd96ffc878ac..6d8180e8d1e2 100644
--- a/drivers/gpu/drm/i915/gvt/display.c
+++ b/drivers/gpu/drm/i915/gvt/display.c
@@ -169,6 +169,8 @@ static u8 dpcd_fix_data[DPCD_HEADER_SIZE] = {
 static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
 {
 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	int pipe;
+
 	vgpu_vreg_t(vgpu, SDEISR) &= ~(SDE_PORTB_HOTPLUG_CPT |
 			SDE_PORTC_HOTPLUG_CPT |
 			SDE_PORTD_HOTPLUG_CPT);
@@ -267,6 +269,14 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
 	if (IS_BROADWELL(dev_priv))
 		vgpu_vreg_t(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK;
 
+	/* Disable Primary/Sprite/Cursor plane */
+	for_each_pipe(dev_priv, pipe) {
+		vgpu_vreg_t(vgpu, DSPCNTR(pipe)) &= ~DISPLAY_PLANE_ENABLE;
+		vgpu_vreg_t(vgpu, SPRCTL(pipe)) &= ~SPRITE_ENABLE;
+		vgpu_vreg_t(vgpu, CURCNTR(pipe)) &= ~CURSOR_MODE;
+		vgpu_vreg_t(vgpu, CURCNTR(pipe)) |= CURSOR_MODE_DISABLE;
+	}
+
 	vgpu_vreg_t(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE;
 }
 

From 10996f802109c83421ca30556cfe36ffc3bebae3 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Wed, 28 Mar 2018 13:49:29 +0800
Subject: [PATCH 010/288] drm/i915/gvt: Add drm_format_mod update

Add drm_format_mod update, which is omitted.

Fixes: e546e281("drm/i915/gvt: Dmabuf support for GVT-g")
Cc: stable@vger.kernel.org
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/dmabuf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c
index b555eb26f9ce..6f4f8e941fc2 100644
--- a/drivers/gpu/drm/i915/gvt/dmabuf.c
+++ b/drivers/gpu/drm/i915/gvt/dmabuf.c
@@ -323,6 +323,7 @@ static void update_fb_info(struct vfio_device_gfx_plane_info *gvt_dmabuf,
 		      struct intel_vgpu_fb_info *fb_info)
 {
 	gvt_dmabuf->drm_format = fb_info->drm_format;
+	gvt_dmabuf->drm_format_mod = fb_info->drm_format_mod;
 	gvt_dmabuf->width = fb_info->width;
 	gvt_dmabuf->height = fb_info->height;
 	gvt_dmabuf->stride = fb_info->stride;

From ff2969c479d97c6221a9835ee0ab4c44513badc6 Mon Sep 17 00:00:00 2001
From: Tomer Maimon <tmaimon77@gmail.com>
Date: Thu, 8 Mar 2018 17:24:57 +0200
Subject: [PATCH 011/288] dt-binding: timer: document NPCM7xx timer DT bindings

Added device tree binding documentation for Nuvoton NPCM7xx timer.

Signed-off-by: Tomer Maimon <tmaimon77@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 .../bindings/timer/nuvoton,npcm7xx-timer.txt  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt

diff --git a/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt b/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
new file mode 100644
index 000000000000..ea22dfe485be
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt
@@ -0,0 +1,21 @@
+Nuvoton NPCM7xx timer
+
+Nuvoton NPCM7xx have three timer modules, each timer module provides five 24-bit
+timer counters.
+
+Required properties:
+- compatible      : "nuvoton,npcm750-timer" for Poleg NPCM750.
+- reg             : Offset and length of the register set for the device.
+- interrupts      : Contain the timer interrupt with flags for
+                    falling edge.
+- clocks          : phandle of timer reference clock (usually a 25 MHz clock).
+
+Example:
+
+timer@f0008000 {
+    compatible = "nuvoton,npcm750-timer";
+    interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>;
+    reg = <0xf0008000 0x50>;
+    clocks = <&clk NPCM7XX_CLK_TIMER>;
+};
+

From 1c00289ecd12471ba9733e61aaf1d39883a77b16 Mon Sep 17 00:00:00 2001
From: Tomer Maimon <tmaimon77@gmail.com>
Date: Thu, 8 Mar 2018 17:24:58 +0200
Subject: [PATCH 012/288] clocksource/drivers/npcm: Add NPCM7xx timer driver

Add Nuvoton BMC NPCM7xx timer driver.

The clocksource Enable 24-bit TIMER0 and TIMER1 counters,
while TIMER0 serve as clockevent and TIMER1 serve as clocksource.

Signed-off-by: Tomer Maimon <tmaimon77@gmail.com>
Reviewed-by: Brendan Higgins <brendanhiggins@xxxxxxxxxx>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/Kconfig         |   8 ++
 drivers/clocksource/Makefile        |   1 +
 drivers/clocksource/timer-npcm7xx.c | 215 ++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 drivers/clocksource/timer-npcm7xx.c

diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index b3b4ed9b6874..76194bc20bdf 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -130,6 +130,14 @@ config VT8500_TIMER
 	help
 	  Enables support for the VT8500 driver.
 
+config NPCM7XX_TIMER
+	bool "NPCM7xx timer driver" if COMPILE_TEST
+	depends on HAS_IOMEM
+	select CLKSRC_MMIO
+	help
+	  Enable 24-bit TIMER0 and TIMER1 counters in the NPCM7xx architecture,
+	  While TIMER0 serves as clockevent and TIMER1 serves as clocksource.
+
 config CADENCE_TTC_TIMER
 	bool "Cadence TTC timer driver" if COMPILE_TEST
 	depends on COMMON_CLK
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index d6dec4489d66..74387877f7cf 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_CLKSRC_NPS)	+= timer-nps.o
 obj-$(CONFIG_OXNAS_RPS_TIMER)	+= timer-oxnas-rps.o
 obj-$(CONFIG_OWL_TIMER)		+= owl-timer.o
 obj-$(CONFIG_SPRD_TIMER)	+= timer-sprd.o
+obj-$(CONFIG_NPCM7XX_TIMER)	+= timer-npcm7xx.o
 
 obj-$(CONFIG_ARC_TIMERS)		+= arc_timer.o
 obj-$(CONFIG_ARM_ARCH_TIMER)		+= arm_arch_timer.o
diff --git a/drivers/clocksource/timer-npcm7xx.c b/drivers/clocksource/timer-npcm7xx.c
new file mode 100644
index 000000000000..7a9bb5532d99
--- /dev/null
+++ b/drivers/clocksource/timer-npcm7xx.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2014-2018 Nuvoton Technologies tomer.maimon@nuvoton.com
+ * All rights reserved.
+ *
+ * Copyright 2017 Google, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/err.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/clockchips.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include "timer-of.h"
+
+/* Timers registers */
+#define NPCM7XX_REG_TCSR0	0x0 /* Timer 0 Control and Status Register */
+#define NPCM7XX_REG_TICR0	0x8 /* Timer 0 Initial Count Register */
+#define NPCM7XX_REG_TCSR1	0x4 /* Timer 1 Control and Status Register */
+#define NPCM7XX_REG_TICR1	0xc /* Timer 1 Initial Count Register */
+#define NPCM7XX_REG_TDR1	0x14 /* Timer 1 Data Register */
+#define NPCM7XX_REG_TISR	0x18 /* Timer Interrupt Status Register */
+
+/* Timers control */
+#define NPCM7XX_Tx_RESETINT		0x1f
+#define NPCM7XX_Tx_PERIOD		BIT(27)
+#define NPCM7XX_Tx_INTEN		BIT(29)
+#define NPCM7XX_Tx_COUNTEN		BIT(30)
+#define NPCM7XX_Tx_ONESHOT		0x0
+#define NPCM7XX_Tx_OPER			GENMASK(3, 27)
+#define NPCM7XX_Tx_MIN_PRESCALE		0x1
+#define NPCM7XX_Tx_TDR_MASK_BITS	24
+#define NPCM7XX_Tx_MAX_CNT		0xFFFFFF
+#define NPCM7XX_T0_CLR_INT		0x1
+#define NPCM7XX_Tx_CLR_CSR		0x0
+
+/* Timers operating mode */
+#define NPCM7XX_START_PERIODIC_Tx (NPCM7XX_Tx_PERIOD | NPCM7XX_Tx_COUNTEN | \
+					NPCM7XX_Tx_INTEN | \
+					NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_START_ONESHOT_Tx (NPCM7XX_Tx_ONESHOT | NPCM7XX_Tx_COUNTEN | \
+					NPCM7XX_Tx_INTEN | \
+					NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_START_Tx (NPCM7XX_Tx_COUNTEN | NPCM7XX_Tx_PERIOD | \
+				NPCM7XX_Tx_MIN_PRESCALE)
+
+#define NPCM7XX_DEFAULT_CSR (NPCM7XX_Tx_CLR_CSR | NPCM7XX_Tx_MIN_PRESCALE)
+
+static int npcm7xx_timer_resume(struct clock_event_device *evt)
+{
+	struct timer_of *to = to_timer_of(evt);
+	u32 val;
+
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val |= NPCM7XX_Tx_COUNTEN;
+	writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+	return 0;
+}
+
+static int npcm7xx_timer_shutdown(struct clock_event_device *evt)
+{
+	struct timer_of *to = to_timer_of(evt);
+	u32 val;
+
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val &= ~NPCM7XX_Tx_COUNTEN;
+	writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+	return 0;
+}
+
+static int npcm7xx_timer_oneshot(struct clock_event_device *evt)
+{
+	struct timer_of *to = to_timer_of(evt);
+	u32 val;
+
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val &= ~NPCM7XX_Tx_OPER;
+
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val |= NPCM7XX_START_ONESHOT_Tx;
+	writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+	return 0;
+}
+
+static int npcm7xx_timer_periodic(struct clock_event_device *evt)
+{
+	struct timer_of *to = to_timer_of(evt);
+	u32 val;
+
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val &= ~NPCM7XX_Tx_OPER;
+
+	writel(timer_of_period(to), timer_of_base(to) + NPCM7XX_REG_TICR0);
+	val |= NPCM7XX_START_PERIODIC_Tx;
+
+	writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+	return 0;
+}
+
+static int npcm7xx_clockevent_set_next_event(unsigned long evt,
+		struct clock_event_device *clk)
+{
+	struct timer_of *to = to_timer_of(clk);
+	u32 val;
+
+	writel(evt, timer_of_base(to) + NPCM7XX_REG_TICR0);
+	val = readl(timer_of_base(to) + NPCM7XX_REG_TCSR0);
+	val |= NPCM7XX_START_Tx;
+	writel(val, timer_of_base(to) + NPCM7XX_REG_TCSR0);
+
+	return 0;
+}
+
+static irqreturn_t npcm7xx_timer0_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = (struct clock_event_device *)dev_id;
+	struct timer_of *to = to_timer_of(evt);
+
+	writel(NPCM7XX_T0_CLR_INT, timer_of_base(to) + NPCM7XX_REG_TISR);
+
+	evt->event_handler(evt);
+
+	return IRQ_HANDLED;
+}
+
+static struct timer_of npcm7xx_to = {
+	.flags = TIMER_OF_IRQ | TIMER_OF_BASE | TIMER_OF_CLOCK,
+
+	.clkevt = {
+		.name		    = "npcm7xx-timer0",
+		.features	    = CLOCK_EVT_FEAT_PERIODIC |
+				      CLOCK_EVT_FEAT_ONESHOT,
+		.set_next_event	    = npcm7xx_clockevent_set_next_event,
+		.set_state_shutdown = npcm7xx_timer_shutdown,
+		.set_state_periodic = npcm7xx_timer_periodic,
+		.set_state_oneshot  = npcm7xx_timer_oneshot,
+		.tick_resume	    = npcm7xx_timer_resume,
+		.rating		    = 300,
+	},
+
+	.of_irq = {
+		.handler = npcm7xx_timer0_interrupt,
+		.flags = IRQF_TIMER | IRQF_IRQPOLL,
+	},
+};
+
+static void __init npcm7xx_clockevents_init(void)
+{
+	writel(NPCM7XX_DEFAULT_CSR,
+		timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR0);
+
+	writel(NPCM7XX_Tx_RESETINT,
+		timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TISR);
+
+	npcm7xx_to.clkevt.cpumask = cpumask_of(0);
+	clockevents_config_and_register(&npcm7xx_to.clkevt,
+					timer_of_rate(&npcm7xx_to),
+					0x1, NPCM7XX_Tx_MAX_CNT);
+}
+
+static void __init npcm7xx_clocksource_init(void)
+{
+	u32 val;
+
+	writel(NPCM7XX_DEFAULT_CSR,
+		timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+	writel(NPCM7XX_Tx_MAX_CNT,
+		timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TICR1);
+
+	val = readl(timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+	val |= NPCM7XX_START_Tx;
+	writel(val, timer_of_base(&npcm7xx_to) + NPCM7XX_REG_TCSR1);
+
+	clocksource_mmio_init(timer_of_base(&npcm7xx_to) +
+				NPCM7XX_REG_TDR1,
+				"npcm7xx-timer1", timer_of_rate(&npcm7xx_to),
+				200, (unsigned int)NPCM7XX_Tx_TDR_MASK_BITS,
+				clocksource_mmio_readl_down);
+}
+
+static int __init npcm7xx_timer_init(struct device_node *np)
+{
+	int ret;
+
+	ret = timer_of_init(np, &npcm7xx_to);
+	if (ret)
+		return ret;
+
+	/* Clock input is divided by PRESCALE + 1 before it is fed */
+	/* to the counter */
+	npcm7xx_to.of_clk.rate = npcm7xx_to.of_clk.rate /
+		(NPCM7XX_Tx_MIN_PRESCALE + 1);
+
+	npcm7xx_clocksource_init();
+	npcm7xx_clockevents_init();
+
+	pr_info("Enabling NPCM7xx clocksource timer base: %px, IRQ: %d ",
+		timer_of_base(&npcm7xx_to), timer_of_irq(&npcm7xx_to));
+
+	return 0;
+}
+
+TIMER_OF_DECLARE(npcm7xx, "nuvoton,npcm750-timer", npcm7xx_timer_init);
+

From cc01456a0d9a3cbfec85cf23f2ce53323e8fc973 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Wed, 28 Mar 2018 11:22:35 +0800
Subject: [PATCH 013/288] dt-bindings: timer: tpm: fix typo of clock name

The clock name should be ipg instead of igp.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt b/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
index b4aa7ddb5b13..f82087b220f4 100644
--- a/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
+++ b/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
@@ -15,7 +15,7 @@ Required properties:
 - interrupts :	Should be the clock event device interrupt.
 - clocks :	The clocks provided by the SoC to drive the timer, must contain
 		an entry for each entry in clock-names.
-- clock-names : Must include the following entries: "igp" and "per".
+- clock-names : Must include the following entries: "ipg" and "per".
 
 Example:
 tpm5: tpm@40260000 {

From 16328e7bd428937f557a984d8b3378387c17a930 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Wed, 28 Mar 2018 11:22:36 +0800
Subject: [PATCH 014/288] clocksource/drivers/imx-tpm: Fix typo of clock name

The clock name should be ipg instead of igp.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/timer-imx-tpm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/timer-imx-tpm.c b/drivers/clocksource/timer-imx-tpm.c
index 21bffdcb2f20..3f97d4985824 100644
--- a/drivers/clocksource/timer-imx-tpm.c
+++ b/drivers/clocksource/timer-imx-tpm.c
@@ -179,7 +179,7 @@ static int __init tpm_timer_init(struct device_node *np)
 	ipg = of_clk_get_by_name(np, "ipg");
 	per = of_clk_get_by_name(np, "per");
 	if (IS_ERR(ipg) || IS_ERR(per)) {
-		pr_err("tpm: failed to get igp or per clk\n");
+		pr_err("tpm: failed to get ipg or per clk\n");
 		ret = -ENODEV;
 		goto err_clk_get;
 	}

From 506a7be93ff773d5d4cf75a59f342865605b4910 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Wed, 28 Mar 2018 11:22:37 +0800
Subject: [PATCH 015/288] clocksource/drivers/imx-tpm: Correct some registers
 operation flow

According to i.MX7ULP reference manual, TPM_SC_CPWMS can ONLY be written when
counter is disabled, TPM_SC_TOF is write-1-clear, TPM_C0SC_CHF is also
write-1-clear, correct these registers initialization flow;

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/timer-imx-tpm.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/clocksource/timer-imx-tpm.c b/drivers/clocksource/timer-imx-tpm.c
index 3f97d4985824..7403e494417a 100644
--- a/drivers/clocksource/timer-imx-tpm.c
+++ b/drivers/clocksource/timer-imx-tpm.c
@@ -20,6 +20,7 @@
 #define TPM_SC				0x10
 #define TPM_SC_CMOD_INC_PER_CNT		(0x1 << 3)
 #define TPM_SC_CMOD_DIV_DEFAULT		0x3
+#define TPM_SC_TOF_MASK			(0x1 << 7)
 #define TPM_CNT				0x14
 #define TPM_MOD				0x18
 #define TPM_STATUS			0x1c
@@ -29,6 +30,7 @@
 #define TPM_C0SC_MODE_SHIFT		2
 #define TPM_C0SC_MODE_MASK		0x3c
 #define TPM_C0SC_MODE_SW_COMPARE	0x4
+#define TPM_C0SC_CHF_MASK		(0x1 << 7)
 #define TPM_C0V				0x24
 
 static void __iomem *timer_base;
@@ -205,9 +207,13 @@ static int __init tpm_timer_init(struct device_node *np)
 	 * 4) Channel0 disabled
 	 * 5) DMA transfers disabled
 	 */
+	/* make sure counter is disabled */
 	writel(0, timer_base + TPM_SC);
+	/* TOF is W1C */
+	writel(TPM_SC_TOF_MASK, timer_base + TPM_SC);
 	writel(0, timer_base + TPM_CNT);
-	writel(0, timer_base + TPM_C0SC);
+	/* CHF is W1C */
+	writel(TPM_C0SC_CHF_MASK, timer_base + TPM_C0SC);
 
 	/* increase per cnt, div 8 by default */
 	writel(TPM_SC_CMOD_INC_PER_CNT | TPM_SC_CMOD_DIV_DEFAULT,

From 0136c741ff40e03323419feec05fcd594f36a463 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Wed, 28 Mar 2018 11:22:38 +0800
Subject: [PATCH 016/288] clocksource/drivers/imx-tpm: Add different counter
 width support

Different TPM modules have different width counters which is 16-bit or 32-bit,
the counter width can be read from TPM_PARAM register bit[23:16], this patch
adds dynamic check for counter width to support both 16-bit and 32-bit TPM
modules.

Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/clocksource/timer-imx-tpm.c | 33 ++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/clocksource/timer-imx-tpm.c b/drivers/clocksource/timer-imx-tpm.c
index 7403e494417a..05d97a6871d8 100644
--- a/drivers/clocksource/timer-imx-tpm.c
+++ b/drivers/clocksource/timer-imx-tpm.c
@@ -17,9 +17,13 @@
 #include <linux/of_irq.h>
 #include <linux/sched_clock.h>
 
+#define TPM_PARAM			0x4
+#define TPM_PARAM_WIDTH_SHIFT		16
+#define TPM_PARAM_WIDTH_MASK		(0xff << 16)
 #define TPM_SC				0x10
 #define TPM_SC_CMOD_INC_PER_CNT		(0x1 << 3)
 #define TPM_SC_CMOD_DIV_DEFAULT		0x3
+#define TPM_SC_CMOD_DIV_MAX		0x7
 #define TPM_SC_TOF_MASK			(0x1 << 7)
 #define TPM_CNT				0x14
 #define TPM_MOD				0x18
@@ -33,6 +37,8 @@
 #define TPM_C0SC_CHF_MASK		(0x1 << 7)
 #define TPM_C0V				0x24
 
+static int counter_width;
+static int rating;
 static void __iomem *timer_base;
 static struct clock_event_device clockevent_tpm;
 
@@ -85,10 +91,11 @@ static int __init tpm_clocksource_init(unsigned long rate)
 	tpm_delay_timer.freq = rate;
 	register_current_timer_delay(&tpm_delay_timer);
 
-	sched_clock_register(tpm_read_sched_clock, 32, rate);
+	sched_clock_register(tpm_read_sched_clock, counter_width, rate);
 
 	return clocksource_mmio_init(timer_base + TPM_CNT, "imx-tpm",
-				     rate, 200, 32, clocksource_mmio_readl_up);
+				     rate, rating, counter_width,
+				     clocksource_mmio_readl_up);
 }
 
 static int tpm_set_next_event(unsigned long delta,
@@ -141,7 +148,6 @@ static struct clock_event_device clockevent_tpm = {
 	.set_state_oneshot	= tpm_set_state_oneshot,
 	.set_next_event		= tpm_set_next_event,
 	.set_state_shutdown	= tpm_set_state_shutdown,
-	.rating			= 200,
 };
 
 static int __init tpm_clockevent_init(unsigned long rate, int irq)
@@ -151,10 +157,11 @@ static int __init tpm_clockevent_init(unsigned long rate, int irq)
 	ret = request_irq(irq, tpm_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL,
 			  "i.MX7ULP TPM Timer", &clockevent_tpm);
 
+	clockevent_tpm.rating = rating;
 	clockevent_tpm.cpumask = cpumask_of(0);
 	clockevent_tpm.irq = irq;
-	clockevents_config_and_register(&clockevent_tpm,
-					rate, 300, 0xfffffffe);
+	clockevents_config_and_register(&clockevent_tpm, rate, 300,
+					GENMASK(counter_width - 1, 1));
 
 	return ret;
 }
@@ -199,6 +206,11 @@ static int __init tpm_timer_init(struct device_node *np)
 		goto err_per_clk_enable;
 	}
 
+	counter_width = (readl(timer_base + TPM_PARAM) & TPM_PARAM_WIDTH_MASK)
+		>> TPM_PARAM_WIDTH_SHIFT;
+	/* use rating 200 for 32-bit counter and 150 for 16-bit counter */
+	rating = counter_width == 0x20 ? 200 : 150;
+
 	/*
 	 * Initialize tpm module to a known state
 	 * 1) Counter disabled
@@ -215,12 +227,17 @@ static int __init tpm_timer_init(struct device_node *np)
 	/* CHF is W1C */
 	writel(TPM_C0SC_CHF_MASK, timer_base + TPM_C0SC);
 
-	/* increase per cnt, div 8 by default */
-	writel(TPM_SC_CMOD_INC_PER_CNT | TPM_SC_CMOD_DIV_DEFAULT,
+	/*
+	 * increase per cnt,
+	 * div 8 for 32-bit counter and div 128 for 16-bit counter
+	 */
+	writel(TPM_SC_CMOD_INC_PER_CNT |
+		(counter_width == 0x20 ?
+		TPM_SC_CMOD_DIV_DEFAULT : TPM_SC_CMOD_DIV_MAX),
 		     timer_base + TPM_SC);
 
 	/* set MOD register to maximum for free running mode */
-	writel(0xffffffff, timer_base + TPM_MOD);
+	writel(GENMASK(counter_width - 1, 0), timer_base + TPM_MOD);
 
 	rate = clk_get_rate(per) >> 3;
 	ret = tpm_clocksource_init(rate);

From 38057aa1639b74d5c1e0dc1ca7c22bc7a31de714 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 24 Mar 2018 12:58:29 +0000
Subject: [PATCH 017/288] drm/i915/execlists: Clear user-active flag on
 preemption completion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When cancelling the requests and clearing out the ports following a
successful preemption completion, also clear the active flag. I had
assumed that all preemptions would be followed by an immediate dequeue
(preserving the active user flag), but under rare circumstances we may
be triggering a preemption for the second port only for it to have
completed before the preemotion kicks in; leaving execlists->active set
even though the system is now idle.

We can clear the flag inside the common execlists_cancel_port_requests()
as the other users also expect the semantics of active being cleared.

Fixes: f6322eddaff7 ("drm/i915/preemption: Allow preemption between submission ports")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180324125829.27026-1-chris@chris-wilson.co.uk
(cherry picked from commit eed7ec52f214bac2f25395ccaad610fbeb842a6e)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 697af5add78b..e3a5f673ff67 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -577,6 +577,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		 * know the next preemption status we see corresponds
 		 * to this ELSP update.
 		 */
+		GEM_BUG_ON(!execlists_is_active(execlists,
+						EXECLISTS_ACTIVE_USER));
 		GEM_BUG_ON(!port_count(&port[0]));
 		if (port_count(&port[0]) > 1)
 			goto unlock;
@@ -738,6 +740,8 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 		memset(port, 0, sizeof(*port));
 		port++;
 	}
+
+	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
 }
 
 static void execlists_cancel_requests(struct intel_engine_cs *engine)
@@ -1001,6 +1005,11 @@ static void execlists_submission_tasklet(unsigned long data)
 
 	if (fw)
 		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
+
+	/* If the engine is now idle, so should be the flag; and vice versa. */
+	GEM_BUG_ON(execlists_is_active(&engine->execlists,
+				       EXECLISTS_ACTIVE_USER) ==
+		   !port_isset(engine->execlists.port));
 }
 
 static void queue_request(struct intel_engine_cs *engine,

From 2e210bbb7429cdcf1a1a3ad00c1bf98bd9bf2452 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 3 Apr 2018 10:52:20 -0700
Subject: [PATCH 018/288] HID: input: fix battery level reporting on BT mice

The commit 581c4484769e ("HID: input: map digitizer battery usage")
assumed that devices having input (qas opposed to feature) report for
battery strength would report the data on their own, without the need to
be polled by the kernel; unfortunately it is not so. Many wireless mice
do not send unsolicited reports with battery strength data and have to
be polled explicitly. As a complication, stylus devices on digitizers
are not normally connected to the base and thus can not be polled - the
base can only determine battery strength in the stylus when it is in
proximity.

To solve this issue, we add a special flag that tells the kernel
to avoid polling the device (and expect unsolicited reports) and set it
when report field with physical usage of digitizer stylus (HID_DG_STYLUS).
Unless this flag is set, and we have not seen the unsolicited reports,
the kernel will attempt to poll the device when userspace attempts to
read "capacity" and "state" attributes of power_supply object
corresponding to the devices battery.

Fixes: 581c4484769e ("HID: input: map digitizer battery usage")
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=198095
Cc: stable@vger.kernel.org
Reported-and-tested-by: Martin van Es <martin@mrvanes.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-input.c | 24 +++++++++++++++++-------
 include/linux/hid.h     |  9 ++++++++-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 6836a856c243..930652c25120 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -387,7 +387,8 @@ static int hidinput_get_battery_property(struct power_supply *psy,
 		break;
 
 	case POWER_SUPPLY_PROP_CAPACITY:
-		if (dev->battery_report_type == HID_FEATURE_REPORT) {
+		if (dev->battery_status != HID_BATTERY_REPORTED &&
+		    !dev->battery_avoid_query) {
 			value = hidinput_query_battery_capacity(dev);
 			if (value < 0)
 				return value;
@@ -403,17 +404,17 @@ static int hidinput_get_battery_property(struct power_supply *psy,
 		break;
 
 	case POWER_SUPPLY_PROP_STATUS:
-		if (!dev->battery_reported &&
-		    dev->battery_report_type == HID_FEATURE_REPORT) {
+		if (dev->battery_status != HID_BATTERY_REPORTED &&
+		    !dev->battery_avoid_query) {
 			value = hidinput_query_battery_capacity(dev);
 			if (value < 0)
 				return value;
 
 			dev->battery_capacity = value;
-			dev->battery_reported = true;
+			dev->battery_status = HID_BATTERY_QUERIED;
 		}
 
-		if (!dev->battery_reported)
+		if (dev->battery_status == HID_BATTERY_UNKNOWN)
 			val->intval = POWER_SUPPLY_STATUS_UNKNOWN;
 		else if (dev->battery_capacity == 100)
 			val->intval = POWER_SUPPLY_STATUS_FULL;
@@ -486,6 +487,14 @@ static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
 	dev->battery_report_type = report_type;
 	dev->battery_report_id = field->report->id;
 
+	/*
+	 * Stylus is normally not connected to the device and thus we
+	 * can't query the device and get meaningful battery strength.
+	 * We have to wait for the device to report it on its own.
+	 */
+	dev->battery_avoid_query = report_type == HID_INPUT_REPORT &&
+				   field->physical == HID_DG_STYLUS;
+
 	dev->battery = power_supply_register(&dev->dev, psy_desc, &psy_cfg);
 	if (IS_ERR(dev->battery)) {
 		error = PTR_ERR(dev->battery);
@@ -530,9 +539,10 @@ static void hidinput_update_battery(struct hid_device *dev, int value)
 
 	capacity = hidinput_scale_battery_capacity(dev, value);
 
-	if (!dev->battery_reported || capacity != dev->battery_capacity) {
+	if (dev->battery_status != HID_BATTERY_REPORTED ||
+	    capacity != dev->battery_capacity) {
 		dev->battery_capacity = capacity;
-		dev->battery_reported = true;
+		dev->battery_status = HID_BATTERY_REPORTED;
 		power_supply_changed(dev->battery);
 	}
 }
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 8da3e1f48195..26240a22978a 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -516,6 +516,12 @@ enum hid_type {
 	HID_TYPE_USBNONE
 };
 
+enum hid_battery_status {
+	HID_BATTERY_UNKNOWN = 0,
+	HID_BATTERY_QUERIED,		/* Kernel explicitly queried battery strength */
+	HID_BATTERY_REPORTED,		/* Device sent unsolicited battery strength report */
+};
+
 struct hid_driver;
 struct hid_ll_driver;
 
@@ -558,7 +564,8 @@ struct hid_device {							/* device report descriptor */
 	__s32 battery_max;
 	__s32 battery_report_type;
 	__s32 battery_report_id;
-	bool battery_reported;
+	enum hid_battery_status battery_status;
+	bool battery_avoid_query;
 #endif
 
 	unsigned int status;						/* see STAT flags above */

From a955358d54695e4ad9f7d6489a7ac4d69a8fc711 Mon Sep 17 00:00:00 2001
From: Rodrigo Rivas Costa <rodrigorivascosta@gmail.com>
Date: Fri, 6 Apr 2018 01:09:36 +0200
Subject: [PATCH 019/288] HID: hidraw: Fix crash on HIDIOCGFEATURE with a
 destroyed device

Doing `ioctl(HIDIOCGFEATURE)` in a tight loop on a hidraw device
and then disconnecting the device, or unloading the driver, can
cause a NULL pointer dereference.

When a hidraw device is destroyed it sets 0 to `dev->exist`.
Most functions check 'dev->exist' before doing its work, but
`hidraw_get_report()` was missing that check.

Cc: stable@vger.kernel.org
Signed-off-by: Rodrigo Rivas Costa <rodrigorivascosta@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hidraw.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index fbfcc8009432..b39844adea47 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -192,6 +192,11 @@ static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t
 	int ret = 0, len;
 	unsigned char report_number;
 
+	if (!hidraw_table[minor] || !hidraw_table[minor]->exist) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	dev = hidraw_table[minor]->hid;
 
 	if (!dev->ll_driver->raw_request) {

From 54a307ba8d3cd00a3902337ffaae28f436eeb1a4 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 4 Apr 2018 23:42:18 +0300
Subject: [PATCH 020/288] fanotify: fix logic of events on child

When event on child inodes are sent to the parent inode mark and
parent inode mark was not marked with FAN_EVENT_ON_CHILD, the event
will not be delivered to the listener process. However, if the same
process also has a mount mark, the event to the parent inode will be
delivered regadless of the mount mark mask.

This behavior is incorrect in the case where the mount mark mask does
not contain the specific event type. For example, the process adds
a mark on a directory with mask FAN_MODIFY (without FAN_EVENT_ON_CHILD)
and a mount mark with mask FAN_CLOSE_NOWRITE (without FAN_ONDIR).

A modify event on a file inside that directory (and inside that mount)
should not create a FAN_MODIFY event, because neither of the marks
requested to get that event on the file.

Fixes: 1968f5eed54c ("fanotify: use both marks when possible")
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d51e1bb781cf..d94e8031fe5f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 				       u32 event_mask,
 				       const void *data, int data_type)
 {
-	__u32 marks_mask, marks_ignored_mask;
+	__u32 marks_mask = 0, marks_ignored_mask = 0;
 	const struct path *path = data;
 
 	pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
@@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 	    !d_can_lookup(path->dentry))
 		return false;
 
-	if (inode_mark && vfsmnt_mark) {
-		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
-		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
-	} else if (inode_mark) {
-		/*
-		 * if the event is for a child and this inode doesn't care about
-		 * events on the child, don't send it!
-		 */
-		if ((event_mask & FS_EVENT_ON_CHILD) &&
-		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
-			return false;
-		marks_mask = inode_mark->mask;
-		marks_ignored_mask = inode_mark->ignored_mask;
-	} else if (vfsmnt_mark) {
-		marks_mask = vfsmnt_mark->mask;
-		marks_ignored_mask = vfsmnt_mark->ignored_mask;
-	} else {
-		BUG();
+	/*
+	 * if the event is for a child and this inode doesn't care about
+	 * events on the child, don't send it!
+	 */
+	if (inode_mark &&
+	    (!(event_mask & FS_EVENT_ON_CHILD) ||
+	     (inode_mark->mask & FS_EVENT_ON_CHILD))) {
+		marks_mask |= inode_mark->mask;
+		marks_ignored_mask |= inode_mark->ignored_mask;
+	}
+
+	if (vfsmnt_mark) {
+		marks_mask |= vfsmnt_mark->mask;
+		marks_ignored_mask |= vfsmnt_mark->ignored_mask;
 	}
 
 	if (d_is_dir(path->dentry) &&

From 0ea9924abe125a0c1277484f1a3d512b3f45c320 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 9 Apr 2018 17:01:21 +0800
Subject: [PATCH 021/288] md-cluster: don't update recovery_offset for faulty
 device

Device could become faulty when clustered array handling
METADATA_UPDATED msg, so we don't need to call read_rdev
for this device.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3bea45e8ccff..c208c01f63a5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9256,8 +9256,10 @@ void md_reload_sb(struct mddev *mddev, int nr)
 	check_sb_changes(mddev, rdev);
 
 	/* Read all rdev's to update recovery_offset */
-	rdev_for_each_rcu(rdev, mddev)
-		read_rdev(mddev, rdev);
+	rdev_for_each_rcu(rdev, mddev) {
+		if (!test_bit(Faulty, &rdev->flags))
+			read_rdev(mddev, rdev);
+	}
 }
 EXPORT_SYMBOL(md_reload_sb);
 

From 8c2425932398a160f687534efe71f0ec4b92833e Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Mon, 9 Apr 2018 09:50:44 +0800
Subject: [PATCH 022/288] md/raid1: exit sync request if MD_RECOVERY_INTR is
 set

We met a sync thread stuck as follows:

 raid1_sync_request+0x2c9/0xb50
 md_do_sync+0x983/0xfa0
 md_thread+0x11c/0x160
 kthread+0x111/0x130
 ret_from_fork+0x35/0x40
 0xffffffffffffffff

At the same time, there is a stuck mdadm thread (mdadm --manage
/dev/md2 --add /dev/sda). It is trying to stop the sync thread:

 kthread_stop+0x42/0xf0
 md_unregister_thread+0x3a/0x70
 md_reap_sync_thread+0x15/0x160
 action_store+0x142/0x2a0
 md_attr_store+0x6c/0xb0
 kernfs_fop_write+0x102/0x180
 __vfs_write+0x33/0x170
 vfs_write+0xad/0x1a0
 SyS_write+0x52/0xc0
 do_syscall_64+0x6e/0x190
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Debug tools show that the sync thread is waiting in raise_barrier(),
until raid1d() end all normal IO bios into bio_end_io_list(introduced
in commit 55ce74d4bfe1). But, raid1d() cannot end these bios if
MD_CHANGE_PENDING bit is set. It needs to get mddev->reconfig_mutex lock
and then clear the bit in md_check_recovery().
However, the lock is holding by mdadm in action_store().

Thus, there is a loop:
mdadm waiting for sync thread to stop, sync thread waiting for
raid1d() to end bios, raid1d() waiting for mdadm to release
mddev->reconfig_mutex lock and then it can end bios.

Fix this by checking MD_RECOVERY_INTR while waiting in raise_barrier(),
so that sync thread can exit while mdadm is stoping the sync thread.

Fixes: 55ce74d4bfe1 ("md/raid1: ensure device failure recorded before write request returns.")
Signed-off-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2943fb74056..6f624311cec2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -854,7 +854,7 @@ static void flush_pending_writes(struct r1conf *conf)
  *    there is no normal IO happeing.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	int idx = sector_to_idx(sector_nr);
 
@@ -885,13 +885,23 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 	 *    max resync count which allowed on current I/O barrier bucket.
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->array_frozen &&
+			    (!conf->array_frozen &&
 			     !atomic_read(&conf->nr_pending[idx]) &&
-			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
+			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
+				test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
 			    conf->resync_lock);
 
+	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		atomic_dec(&conf->barrier[idx]);
+		spin_unlock_irq(&conf->resync_lock);
+		wake_up(&conf->wait_barrier);
+		return -EINTR;
+	}
+
 	atomic_inc(&conf->nr_sync_pending);
 	spin_unlock_irq(&conf->resync_lock);
+
+	return 0;
 }
 
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
@@ -2662,9 +2672,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr,
 		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
-	r1_bio = raid1_alloc_init_r1buf(conf);
 
-	raise_barrier(conf, sector_nr);
+
+	if (raise_barrier(conf, sector_nr))
+		return 0;
+
+	r1_bio = raid1_alloc_init_r1buf(conf);
 
 	rcu_read_lock();
 	/*

From 1aa3b3e0cbdb32439f04842e88fc7557a0777660 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Mon, 9 Apr 2018 22:31:19 +0800
Subject: [PATCH 023/288] fs: quota: Replace GFP_ATOMIC with GFP_KERNEL in
 dquot_init

dquot_init() is never called in atomic context.
This function is only set as a parameter of fs_initcall().

Despite never getting called from atomic context,
dquot_init() calls __get_free_pages() with GFP_ATOMIC,
which waits busily for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
to avoid busy waiting and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 020c597ef9b6..d88231e3b2be 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2966,7 +2966,7 @@ static int __init dquot_init(void)
 			NULL);
 
 	order = 0;
-	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
+	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
 	if (!dquot_hash)
 		panic("Cannot create dquot hash table");
 

From dba40d46ebf41e3f7ac9480609529bb6037a918d Mon Sep 17 00:00:00 2001
From: Mariusz Dabrowski <mariusz.dabrowski@intel.com>
Date: Wed, 14 Feb 2018 14:23:30 +0100
Subject: [PATCH 024/288] raid1: copy write hint from master bio to behind bio

Signed-off-by: Mariusz Dabrowski <mariusz.dabrowski@intel.com>
Reviewed-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6f624311cec2..e9e3308cb0a7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1102,6 +1102,8 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
 		goto skip_copy;
 	}
 
+	behind_bio->bi_write_hint = bio->bi_write_hint;
+
 	while (i < vcnt && size) {
 		struct page *page;
 		int len = min_t(int, PAGE_SIZE, size);

From c0db1b677e1d584fab5d7ac76a32e1c0157542e0 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@quora.org>
Date: Mon, 2 Apr 2018 15:10:35 +0800
Subject: [PATCH 025/288] drm/vc4: Fix memory leak during BO teardown

During BO teardown, an indirect list 'uniform_addr_offsets' wasn't being
freed leading to leaking many 128B allocations. Fix the memory leak by
releasing it at teardown time.

Cc: stable@vger.kernel.org
Fixes: 6d45c81d229d ("drm/vc4: Add support for branching in shader validation.")
Signed-off-by: Daniel J Blueman <daniel@quora.org>
Signed-off-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20180402071035.25356-1-daniel@quora.org
---
 drivers/gpu/drm/vc4/vc4_bo.c               | 2 ++
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 2decc8e2c79f..add9cc97a3b6 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -195,6 +195,7 @@ static void vc4_bo_destroy(struct vc4_bo *bo)
 	vc4_bo_set_label(obj, -1);
 
 	if (bo->validated_shader) {
+		kfree(bo->validated_shader->uniform_addr_offsets);
 		kfree(bo->validated_shader->texture_samples);
 		kfree(bo->validated_shader);
 		bo->validated_shader = NULL;
@@ -591,6 +592,7 @@ void vc4_free_object(struct drm_gem_object *gem_bo)
 	}
 
 	if (bo->validated_shader) {
+		kfree(bo->validated_shader->uniform_addr_offsets);
 		kfree(bo->validated_shader->texture_samples);
 		kfree(bo->validated_shader);
 		bo->validated_shader = NULL;
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index d3f15bf60900..7cf82b071de2 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -942,6 +942,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 fail:
 	kfree(validation_state.branch_targets);
 	if (validated_shader) {
+		kfree(validated_shader->uniform_addr_offsets);
 		kfree(validated_shader->texture_samples);
 		kfree(validated_shader);
 	}

From 386c6ddbda180676b7d9fc375d54a7bdd353d39e Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Tue, 10 Apr 2018 14:15:46 +0200
Subject: [PATCH 026/288] X86/VMX: Disable VMX preemption timer if MWAIT is not
 intercepted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The VMX-preemption timer is used by KVM as a way to set deadlines for the
guest (i.e. timer emulation). That was safe till very recently when
capability KVM_X86_DISABLE_EXITS_MWAIT to disable intercepting MWAIT was
introduced. According to Intel SDM 25.5.1:

"""
The VMX-preemption timer operates in the C-states C0, C1, and C2; it also
operates in the shutdown and wait-for-SIPI states. If the timer counts down
to zero in any state other than the wait-for SIPI state, the logical
processor transitions to the C0 C-state and causes a VM exit; the timer
does not cause a VM exit if it counts down to zero in the wait-for-SIPI
state. The timer is not decremented in C-states deeper than C2.
"""

Now once the guest issues the MWAIT with a c-state deeper than
C2 the preemption timer will never wake it up again since it stopped
ticking! Usually this is compensated by other activities in the system that
would wake the core from the deep C-state (and cause a VMExit). For
example, if the host itself is ticking or it received interrupts, etc!

So disable the VMX-preemption timer if MWAIT is exposed to the guest!

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: kvm@vger.kernel.org
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Fixes: 4d5422cea3b61f158d58924cbb43feada456ba5c
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ce639e00fd0f..e7024ad586f7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -12218,10 +12218,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tscl = rdtsc();
-	u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-	u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+	struct vcpu_vmx *vmx;
+	u64 tscl, guest_tscl, delta_tsc;
+
+	if (kvm_mwait_in_guest(vcpu->kvm))
+		return -EOPNOTSUPP;
+
+	vmx = to_vmx(vcpu);
+	tscl = rdtsc();
+	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
 	/* Convert to host delta tsc if tsc scaling is enabled */
 	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&

From 4d5f26ee310237552a36aa14ceee96d6659153cd Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 10 Apr 2018 13:38:56 +0100
Subject: [PATCH 027/288] kvm: selftests: fix spelling mistake: "divisable" and
 "divisible"

Trivial fix to spelling mistakes in comment and message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c  | 2 +-
 tools/testing/selftests/kvm/lib/sparsebit.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 7ca1bb40c498..e213d513dc61 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1435,7 +1435,7 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
 	sparsebit_idx_t pg;
 
 	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
-		"not divisable by page size.\n"
+		"not divisible by page size.\n"
 		"  paddr_min: 0x%lx page_size: 0x%x",
 		paddr_min, vm->page_size);
 
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
index 0c5cf3e0cb6f..b132bc95d183 100644
--- a/tools/testing/selftests/kvm/lib/sparsebit.c
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -121,7 +121,7 @@
  *     avoided by moving the setting of the nodes mask bits into
  *     the previous nodes num_after setting.
  *
- *   + Node starting index is evenly divisable by the number of bits
+ *   + Node starting index is evenly divisible by the number of bits
  *     within a nodes mask member.
  *
  *   + Nodes never represent a range of bits that wrap around the
@@ -1741,7 +1741,7 @@ void sparsebit_validate_internal(struct sparsebit *s)
 
 		/* Validate node index is divisible by the mask size */
 		if (nodep->idx % MASK_BITS) {
-			fprintf(stderr, "Node index not divisable by "
+			fprintf(stderr, "Node index not divisible by "
 				"mask size,\n"
 				"  nodep: %p nodep->idx: 0x%lx "
 				"MASK_BITS: %lu\n",

From 5ac7c2fd6e7102532104907c0df94abca826ec5c Mon Sep 17 00:00:00 2001
From: Kyle Spiers <ksspiers@google.com>
Date: Tue, 10 Apr 2018 17:02:29 -0700
Subject: [PATCH 028/288] isofs compress: Remove VLA usage

As part of the effort to remove VLAs from the kernel[1], this changes
the allocation of the bhs and pages arrays from being on the stack to being
kcalloc()ed. This also allows for the removal of the explicit zeroing
of bhs.

https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Kyle Spiers <ksspiers@google.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/isofs/compress.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 9bb2fe35799d..10205ececc27 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/zlib.h>
 
@@ -59,7 +60,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
 				>> bufshift;
 	int haveblocks;
 	blkcnt_t blocknum;
-	struct buffer_head *bhs[needblocks + 1];
+	struct buffer_head **bhs;
 	int curbh, curpage;
 
 	if (block_size > deflateBound(1UL << zisofs_block_shift)) {
@@ -80,7 +81,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
 
 	/* Because zlib is not thread-safe, do all the I/O at the top. */
 	blocknum = block_start >> bufshift;
-	memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+	bhs = kcalloc(needblocks + 1, sizeof(*bhs), GFP_KERNEL);
+	if (!bhs) {
+		*errp = -ENOMEM;
+		return 0;
+	}
 	haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
 	ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs);
 
@@ -190,6 +195,7 @@ z_eio:
 b_eio:
 	for (i = 0; i < haveblocks; i++)
 		brelse(bhs[i]);
+	kfree(bhs);
 	return stream.total_out;
 }
 
@@ -305,7 +311,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
 	unsigned int zisofs_pages_per_cblock =
 		PAGE_SHIFT <= zisofs_block_shift ?
 		(1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
-	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+	struct page **pages;
 	pgoff_t index = page->index, end_index;
 
 	end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -330,6 +336,12 @@ static int zisofs_readpage(struct file *file, struct page *page)
 		full_page = 0;
 		pcount = 1;
 	}
+	pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1),
+					sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		unlock_page(page);
+		return -ENOMEM;
+	}
 	pages[full_page] = page;
 
 	for (i = 0; i < pcount; i++, index++) {
@@ -357,6 +369,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
 	}			
 
 	/* At this point, err contains 0 or -EIO depending on the "critical" page */
+	kfree(pages);
 	return err;
 }
 

From 8e9b29b61851ba452e33373743fadb52778e9075 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Wed, 11 Apr 2018 11:16:03 +0200
Subject: [PATCH 029/288] X86/KVM: Do not allow DISABLE_EXITS_MWAIT when LAPIC
 ARAT is not available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the processor does not have an "Always Running APIC Timer" (aka ARAT),
we should not give guests direct access to MWAIT. The LAPIC timer would
stop ticking in deep C-states, so any host deadlines would not wakeup the
host kernel.

The host kernel intel_idle driver handles this by switching to broadcast
mode when ARAT is not available and MWAIT is issued with a deep C-state
that would stop the LAPIC timer. When MWAIT is passed through, we can not
tell when MWAIT is issued.

So just disable this capability when LAPIC ARAT is not available. I am not
even sure if there are any CPUs with VMX support but no LAPIC ARAT or not.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Reported-by: Wanpeng Li <kernellwp@gmail.com>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2ff74b12ec4..0334b250e102 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2819,7 +2819,8 @@ out:
 static inline bool kvm_can_mwait_in_guest(void)
 {
 	return boot_cpu_has(X86_FEATURE_MWAIT) &&
-		!boot_cpu_has_bug(X86_BUG_MONITOR);
+		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
+		boot_cpu_has(X86_FEATURE_ARAT);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)

From 2698d82e519413c6ad287e6f14b29e0373ed37f8 Mon Sep 17 00:00:00 2001
From: hu huajun <huhuajun@linux.alibaba.com>
Date: Wed, 11 Apr 2018 15:16:40 +0800
Subject: [PATCH 030/288] KVM: X86: fix incorrect reference of
 trace_kvm_pi_irte_update

In arch/x86/kvm/trace.h, this function is declared as host_irq the
first input, and vcpu_id the second, instead of otherwise.

Signed-off-by: hu huajun <huhuajun@linux.alibaba.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 5 ++---
 arch/x86/kvm/vmx.c | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e0a3f56a791d..b3ebc8ad6891 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5266,9 +5266,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		}
 
 		if (!ret && svm) {
-			trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
-						 host_irq, e->gsi,
-						 vcpu_info.vector,
+			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+						 e->gsi, vcpu_info.vector,
 						 vcpu_info.pi_desc_addr, set);
 		}
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e7024ad586f7..fe3e13edf201 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -12533,7 +12533,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
 		vcpu_info.vector = irq.vector;
 
-		trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
 				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
 		if (set)

From 300ad8992913025b4294d4fc37b6bfff4a8b7ad1 Mon Sep 17 00:00:00 2001
From: Daniel Kurtz <djkurtz@chromium.org>
Date: Fri, 6 Apr 2018 16:07:59 -0600
Subject: [PATCH 031/288] mmc: sdhci-pci: Only do AMD tuning for HS200

Commit c31165d7400b ("mmc: sdhci-pci: Add support for HS200 tuning mode
on AMD, eMMC-4.5.1") added a HS200 tuning method for use with AMD SDHCI
controllers.  As described in the commit subject, this tuning is specific
for HS200.  However, as implemented, this method is used for all host
timings, because platform_execute_tuning, if it exists, is called
unconditionally by sdhci_execute_tuning().  This breaks tuning when using
the AMD controller with, for example, a DDR50 SD card.

Instead, we can implement an amd execute_tuning wrapper callback, and
then conditionally do the HS200 specific tuning for HS200, and otherwise
call back to the standard sdhci_execute_tuning().

Signed-off-by: Daniel Kurtz <djkurtz@chromium.org>
Acked-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: c31165d7400b ("mmc: sdhci-pci: Add support for HS200 tuning mode on AMD, eMMC-4.5.1")
Cc: stable@vger.kernel.org # v4.11+
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-pci-core.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
index 787434e5589d..78c25ad35fd2 100644
--- a/drivers/mmc/host/sdhci-pci-core.c
+++ b/drivers/mmc/host/sdhci-pci-core.c
@@ -1312,7 +1312,7 @@ static void amd_enable_manual_tuning(struct pci_dev *pdev)
 	pci_write_config_dword(pdev, AMD_SD_MISC_CONTROL, val);
 }
 
-static int amd_execute_tuning(struct sdhci_host *host, u32 opcode)
+static int amd_execute_tuning_hs200(struct sdhci_host *host, u32 opcode)
 {
 	struct sdhci_pci_slot *slot = sdhci_priv(host);
 	struct pci_dev *pdev = slot->chip->pdev;
@@ -1351,6 +1351,27 @@ static int amd_execute_tuning(struct sdhci_host *host, u32 opcode)
 	return 0;
 }
 
+static int amd_execute_tuning(struct mmc_host *mmc, u32 opcode)
+{
+	struct sdhci_host *host = mmc_priv(mmc);
+
+	/* AMD requires custom HS200 tuning */
+	if (host->timing == MMC_TIMING_MMC_HS200)
+		return amd_execute_tuning_hs200(host, opcode);
+
+	/* Otherwise perform standard SDHCI tuning */
+	return sdhci_execute_tuning(mmc, opcode);
+}
+
+static int amd_probe_slot(struct sdhci_pci_slot *slot)
+{
+	struct mmc_host_ops *ops = &slot->host->mmc_host_ops;
+
+	ops->execute_tuning = amd_execute_tuning;
+
+	return 0;
+}
+
 static int amd_probe(struct sdhci_pci_chip *chip)
 {
 	struct pci_dev	*smbus_dev;
@@ -1385,12 +1406,12 @@ static const struct sdhci_ops amd_sdhci_pci_ops = {
 	.set_bus_width			= sdhci_set_bus_width,
 	.reset				= sdhci_reset,
 	.set_uhs_signaling		= sdhci_set_uhs_signaling,
-	.platform_execute_tuning	= amd_execute_tuning,
 };
 
 static const struct sdhci_pci_fixes sdhci_amd = {
 	.probe		= amd_probe,
 	.ops		= &amd_sdhci_pci_ops,
+	.probe_slot	= amd_probe_slot,
 };
 
 static const struct pci_device_id pci_ids[] = {

From 7ecb46e9ee9af18e304eb9e7d6804c59a408e846 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabi=C3=A1n=20Inostroza?= <soulsonceonfire@gmail.com>
Date: Thu, 12 Apr 2018 00:37:35 -0300
Subject: [PATCH 032/288] ALSA: line6: Use correct endpoint type for midi
 output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sending MIDI messages to a PODxt through the USB connection shows
"usb_submit_urb failed" in dmesg and the message is not received by
the POD.

The error is caused because in the funcion send_midi_async() in midi.c
there is a call to usb_sndbulkpipe() for endpoint 3 OUT, but the PODxt
USB descriptor shows that this endpoint it's an interrupt endpoint.

Patch tested with PODxt only.

[ The bug has been present from the very beginning in the staging
  driver time, but Fixes below points to the commit moving to sound/
  directory so that the fix can be cleanly applied -- tiwai ]

Fixes: 61864d844c29 ("ALSA: move line6 usb driver into sound/usb")
Signed-off-by: Fabián Inostroza <fabianinostroza@udec.cl>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/line6/midi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/usb/line6/midi.c b/sound/usb/line6/midi.c
index 6d7cde56a355..e2cf55c53ea8 100644
--- a/sound/usb/line6/midi.c
+++ b/sound/usb/line6/midi.c
@@ -125,7 +125,7 @@ static int send_midi_async(struct usb_line6 *line6, unsigned char *data,
 	}
 
 	usb_fill_int_urb(urb, line6->usbdev,
-			 usb_sndbulkpipe(line6->usbdev,
+			 usb_sndintpipe(line6->usbdev,
 					 line6->properties->ep_ctrl_w),
 			 transfer_buffer, length, midi_sent, line6,
 			 line6->interval);

From 619d3a2922ce623ca2eca443cc936810d328317c Mon Sep 17 00:00:00 2001
From: Aaron Armstrong Skomra <skomra@gmail.com>
Date: Wed, 4 Apr 2018 14:24:11 -0700
Subject: [PATCH 033/288] HID: wacom: bluetooth: send exit report for recent
 Bluetooth devices

The code path for recent Bluetooth devices omits an exit report which
resets all the values of the device.

Fixes: 4922cd26f0 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface")
Cc: <stable@vger.kernel.org> # 4.11
Signed-off-by: Aaron Armstrong Skomra <aaron.skomra@wacom.com>
Reviewed-by: Ping Cheng <ping.cheng@wacom.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/wacom_wac.c | 76 +++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index 6da16a879c9f..5f947ec20dcb 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -689,6 +689,45 @@ static int wacom_intuos_get_tool_type(int tool_id)
 	return tool_type;
 }
 
+static void wacom_exit_report(struct wacom_wac *wacom)
+{
+	struct input_dev *input = wacom->pen_input;
+	struct wacom_features *features = &wacom->features;
+	unsigned char *data = wacom->data;
+	int idx = (features->type == INTUOS) ? (data[1] & 0x01) : 0;
+
+	/*
+	 * Reset all states otherwise we lose the initial states
+	 * when in-prox next time
+	 */
+	input_report_abs(input, ABS_X, 0);
+	input_report_abs(input, ABS_Y, 0);
+	input_report_abs(input, ABS_DISTANCE, 0);
+	input_report_abs(input, ABS_TILT_X, 0);
+	input_report_abs(input, ABS_TILT_Y, 0);
+	if (wacom->tool[idx] >= BTN_TOOL_MOUSE) {
+		input_report_key(input, BTN_LEFT, 0);
+		input_report_key(input, BTN_MIDDLE, 0);
+		input_report_key(input, BTN_RIGHT, 0);
+		input_report_key(input, BTN_SIDE, 0);
+		input_report_key(input, BTN_EXTRA, 0);
+		input_report_abs(input, ABS_THROTTLE, 0);
+		input_report_abs(input, ABS_RZ, 0);
+	} else {
+		input_report_abs(input, ABS_PRESSURE, 0);
+		input_report_key(input, BTN_STYLUS, 0);
+		input_report_key(input, BTN_STYLUS2, 0);
+		input_report_key(input, BTN_TOUCH, 0);
+		input_report_abs(input, ABS_WHEEL, 0);
+		if (features->type >= INTUOS3S)
+			input_report_abs(input, ABS_Z, 0);
+	}
+	input_report_key(input, wacom->tool[idx], 0);
+	input_report_abs(input, ABS_MISC, 0); /* reset tool id */
+	input_event(input, EV_MSC, MSC_SERIAL, wacom->serial[idx]);
+	wacom->id[idx] = 0;
+}
+
 static int wacom_intuos_inout(struct wacom_wac *wacom)
 {
 	struct wacom_features *features = &wacom->features;
@@ -741,36 +780,7 @@ static int wacom_intuos_inout(struct wacom_wac *wacom)
 		if (!wacom->id[idx])
 			return 1;
 
-		/*
-		 * Reset all states otherwise we lose the initial states
-		 * when in-prox next time
-		 */
-		input_report_abs(input, ABS_X, 0);
-		input_report_abs(input, ABS_Y, 0);
-		input_report_abs(input, ABS_DISTANCE, 0);
-		input_report_abs(input, ABS_TILT_X, 0);
-		input_report_abs(input, ABS_TILT_Y, 0);
-		if (wacom->tool[idx] >= BTN_TOOL_MOUSE) {
-			input_report_key(input, BTN_LEFT, 0);
-			input_report_key(input, BTN_MIDDLE, 0);
-			input_report_key(input, BTN_RIGHT, 0);
-			input_report_key(input, BTN_SIDE, 0);
-			input_report_key(input, BTN_EXTRA, 0);
-			input_report_abs(input, ABS_THROTTLE, 0);
-			input_report_abs(input, ABS_RZ, 0);
-		} else {
-			input_report_abs(input, ABS_PRESSURE, 0);
-			input_report_key(input, BTN_STYLUS, 0);
-			input_report_key(input, BTN_STYLUS2, 0);
-			input_report_key(input, BTN_TOUCH, 0);
-			input_report_abs(input, ABS_WHEEL, 0);
-			if (features->type >= INTUOS3S)
-				input_report_abs(input, ABS_Z, 0);
-		}
-		input_report_key(input, wacom->tool[idx], 0);
-		input_report_abs(input, ABS_MISC, 0); /* reset tool id */
-		input_event(input, EV_MSC, MSC_SERIAL, wacom->serial[idx]);
-		wacom->id[idx] = 0;
+		wacom_exit_report(wacom);
 		return 2;
 	}
 
@@ -1235,6 +1245,12 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom)
 		if (!valid)
 			continue;
 
+		if (!prox) {
+			wacom->shared->stylus_in_proximity = false;
+			wacom_exit_report(wacom);
+			input_sync(pen_input);
+			return;
+		}
 		if (range) {
 			input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1]));
 			input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3]));

From 9dc9a95f03a69ab926d9ff1986ab2087f34a5dce Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Tue, 3 Apr 2018 21:18:33 +0300
Subject: [PATCH 034/288] perf stat: Enable 1ms interval for printing event
 counters values

Currently print count interval for performance counters values is
limited by 10ms so reading the values at frequencies higher than 100Hz
is restricted by the tool.

This change makes perf stat -I possible on frequencies up to 1KHz and,
to some extent, makes perf stat -I to be on-par with perf record
sampling profiling.

When running perf stat -I for monitoring e.g. PCIe uncore counters and
at the same time profiling some I/O workload by perf record e.g. for
cpu-cycles and context switches, it is then possible to observe
consolidated CPU/OS/IO(Uncore) performance picture for that workload.

Tool overhead warning printed when specifying -v option can be missed
due to screen scrolling in case you have output to the console
so message is moved into help available by running perf stat -h.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/b842ad6a-d606-32e4-afe5-974071b5198e@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-stat.txt |  2 +-
 tools/perf/builtin-stat.c              | 14 ++------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index f15b306be183..e6c3b4e555c2 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -153,7 +153,7 @@ perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- m
 
 -I msecs::
 --interval-print msecs::
-Print count deltas every N milliseconds (minimum: 10ms)
+Print count deltas every N milliseconds (minimum: 1ms)
 The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals.  Use with caution.
 	example: 'perf stat -I 1000 -e cycles -a sleep 5'
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f5c454855908..147a27e8c937 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1943,7 +1943,8 @@ static const struct option stat_options[] = {
 	OPT_STRING(0, "post", &post_cmd, "command",
 			"command to run after to the measured command"),
 	OPT_UINTEGER('I', "interval-print", &stat_config.interval,
-		    "print counts at regular interval in ms (>= 10)"),
+		    "print counts at regular interval in ms "
+		    "(overhead is possible for values <= 100ms)"),
 	OPT_INTEGER(0, "interval-count", &stat_config.times,
 		    "print counts for fixed number of times"),
 	OPT_UINTEGER(0, "timeout", &stat_config.timeout,
@@ -2923,17 +2924,6 @@ int cmd_stat(int argc, const char **argv)
 		}
 	}
 
-	if (interval && interval < 100) {
-		if (interval < 10) {
-			pr_err("print interval must be >= 10ms\n");
-			parse_options_usage(stat_usage, stat_options, "I", 1);
-			goto out;
-		} else
-			pr_warning("print interval < 100ms. "
-				   "The overhead percentage could be high in some cases. "
-				   "Please proceed with caution.\n");
-	}
-
 	if (stat_config.times && interval)
 		interval_count = true;
 	else if (stat_config.times && !interval) {

From 4d3b57da1593c66835d8e3a757e4751b35493fb8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 4 Apr 2018 17:34:45 +0100
Subject: [PATCH 035/288] tools headers: Restore READ_ONCE() C++ compatibility

Our userspace <linux/compiler.h> defines READ_ONCE() in a way that clang
doesn't like, as we have an anonymous union in which neither field is
initialized.

WRITE_ONCE() is fine since it initializes the __val field. For
READ_ONCE() we can keep clang and GCC happy with a dummy initialization
of the __c field, so let's do that.

At the same time, let's split READ_ONCE() and WRITE_ONCE() over several
lines for legibility, as we do in the in-kernel <linux/compiler.h>.

Reported-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reported-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Tested-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 6aa7de059173a986 ("locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()")
Link: http://lkml.kernel.org/r/20180404163445.16492-1-mark.rutland@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/linux/compiler.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 04e32f965ad7..1827c2f973f9 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -151,11 +151,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * required ordering.
  */
 
-#define READ_ONCE(x) \
-	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+#define READ_ONCE(x)					\
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__c = { 0 } };			\
+	__read_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
 
-#define WRITE_ONCE(x, val) \
-	({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+#define WRITE_ONCE(x, val)				\
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (val) }; 			\
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
 
 
 #ifndef __fallthrough

From af72cfb80af5e4cafd8e0b58ac54f222c913aa1b Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips@arm.com>
Date: Tue, 10 Apr 2018 19:16:24 -0500
Subject: [PATCH 036/288] perf tests: Run dwarf unwind test on arm32

Enable the unwind test on arm32:

  $ perf test unwind
  58: DWARF unwind                                          : Ok

Signed-off-by: Kim Phillips <kim.phillips@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Brian Robbins <brianrob@microsoft.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180410191624.a3a468670dd4548c66d3d094@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/include/arch-tests.h | 12 ++++++++++++
 tools/perf/arch/arm/tests/Build          |  2 ++
 tools/perf/arch/arm/tests/arch-tests.c   | 16 ++++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 tools/perf/arch/arm/include/arch-tests.h
 create mode 100644 tools/perf/arch/arm/tests/arch-tests.c

diff --git a/tools/perf/arch/arm/include/arch-tests.h b/tools/perf/arch/arm/include/arch-tests.h
new file mode 100644
index 000000000000..90ec4c8cb880
--- /dev/null
+++ b/tools/perf/arch/arm/include/arch-tests.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_TESTS_H
+#define ARCH_TESTS_H
+
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+struct thread;
+struct perf_sample;
+#endif
+
+extern struct test arch_tests[];
+
+#endif
diff --git a/tools/perf/arch/arm/tests/Build b/tools/perf/arch/arm/tests/Build
index b30eff9bcc83..883c57ff0c08 100644
--- a/tools/perf/arch/arm/tests/Build
+++ b/tools/perf/arch/arm/tests/Build
@@ -1,2 +1,4 @@
 libperf-y += regs_load.o
 libperf-y += dwarf-unwind.o
+
+libperf-y += arch-tests.o
diff --git a/tools/perf/arch/arm/tests/arch-tests.c b/tools/perf/arch/arm/tests/arch-tests.c
new file mode 100644
index 000000000000..5b1543c98022
--- /dev/null
+++ b/tools/perf/arch/arm/tests/arch-tests.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include "tests/tests.h"
+#include "arch-tests.h"
+
+struct test arch_tests[] = {
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+	{
+		.desc = "DWARF unwind",
+		.func = test__dwarf_unwind,
+	},
+#endif
+	{
+		.func = NULL,
+	},
+};

From 3e83eda467050f13fa69d888993458b76e733de9 Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Mon, 9 Apr 2018 15:41:31 +0800
Subject: [PATCH 037/288] HID: i2c-hid: Fix resume issue on Raydium touchscreen
 device

When Rayd touchscreen resumed from S3, it issues too many errors like:
i2c_hid i2c-RAYD0001:00: i2c_hid_get_input: incomplete report (58/5442)

And all the report data are corrupted, touchscreen is unresponsive.

Fix this by re-sending report description command after resume.
Add device ID as a quirk.

Cc: stable@vger.kernel.org
Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-ids.h         |  3 +++
 drivers/hid/i2c-hid/i2c-hid.c | 13 +++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 5a3a7ead3012..0b5cc910f62e 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -525,6 +525,9 @@
 #define I2C_VENDOR_ID_HANTICK		0x0911
 #define I2C_PRODUCT_ID_HANTICK_5288	0x5288
 
+#define I2C_VENDOR_ID_RAYD		0x2386
+#define I2C_PRODUCT_ID_RAYD_3118	0x3118
+
 #define USB_VENDOR_ID_HANWANG		0x0b57
 #define USB_DEVICE_ID_HANWANG_TABLET_FIRST	0x5000
 #define USB_DEVICE_ID_HANWANG_TABLET_LAST	0x8fff
diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c
index 97689e98e53f..615a91ac93bd 100644
--- a/drivers/hid/i2c-hid/i2c-hid.c
+++ b/drivers/hid/i2c-hid/i2c-hid.c
@@ -47,6 +47,7 @@
 /* quirks to control the device */
 #define I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV	BIT(0)
 #define I2C_HID_QUIRK_NO_IRQ_AFTER_RESET	BIT(1)
+#define I2C_HID_QUIRK_RESEND_REPORT_DESCR	BIT(2)
 
 /* flags */
 #define I2C_HID_STARTED		0
@@ -171,6 +172,8 @@ static const struct i2c_hid_quirks {
 		I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV },
 	{ I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288,
 		I2C_HID_QUIRK_NO_IRQ_AFTER_RESET },
+	{ I2C_VENDOR_ID_RAYD, I2C_PRODUCT_ID_RAYD_3118,
+		I2C_HID_QUIRK_RESEND_REPORT_DESCR },
 	{ 0, 0 }
 };
 
@@ -1220,6 +1223,16 @@ static int i2c_hid_resume(struct device *dev)
 	if (ret)
 		return ret;
 
+	/* RAYDIUM device (2386:3118) need to re-send report descr cmd
+	 * after resume, after this it will be back normal.
+	 * otherwise it issues too many incomplete reports.
+	 */
+	if (ihid->quirks & I2C_HID_QUIRK_RESEND_REPORT_DESCR) {
+		ret = i2c_hid_command(client, &hid_report_descr_cmd, NULL, 0);
+		if (!ret)
+			return ret;
+	}
+
 	if (hid->driver && hid->driver->reset_resume) {
 		ret = hid->driver->reset_resume(hid);
 		return ret;

From 592c10e217f3edb35c7e0deba161fef69ad1a336 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Apr 2018 10:30:03 -0300
Subject: [PATCH 038/288] perf annotate: Allow showing offsets in more than
 just jump targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jesper wanted to see offsets at callq sites when doing some performance
investigation related to retpolines, so save him some time by providing
an 'struct annotation_options' to control where offsets should appear:
just on jump targets? That + call instructions? All?

This puts in place the logic to show the offsets, now we need to wire
this up in the TUI browser (next patch) and on the 'perf annotate --stdio2"
interface, where we need a more general mechanism to setup the
'annotation_options' struct from the command line.

Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Liška <mliska@suse.cz>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-m3jc9c3swobye9tj08gnh5i7@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/annotate.c | 11 +++++++++--
 tools/perf/util/annotate.h |  9 +++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index fbad8dfbb186..5edc565d86c4 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -46,6 +46,7 @@
 struct annotation_options annotation__default_options = {
 	.use_offset     = true,
 	.jump_arrows    = true,
+	.offset_level	= ANNOTATION__OFFSET_JUMP_TARGETS,
 };
 
 const char 	*disassembler_style;
@@ -2512,7 +2513,8 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati
 		if (!notes->options->use_offset) {
 			printed = scnprintf(bf, sizeof(bf), "%" PRIx64 ": ", addr);
 		} else {
-			if (al->jump_sources) {
+			if (al->jump_sources &&
+			    notes->options->offset_level >= ANNOTATION__OFFSET_JUMP_TARGETS) {
 				if (notes->options->show_nr_jumps) {
 					int prev;
 					printed = scnprintf(bf, sizeof(bf), "%*d ",
@@ -2523,9 +2525,14 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati
 					obj__printf(obj, bf);
 					obj__set_color(obj, prev);
 				}
-
+print_addr:
 				printed = scnprintf(bf, sizeof(bf), "%*" PRIx64 ": ",
 						    notes->widths.target, addr);
+			} else if (ins__is_call(&disasm_line(al)->ins) &&
+				   notes->options->offset_level >= ANNOTATION__OFFSET_CALL) {
+				goto print_addr;
+			} else if (notes->options->offset_level == ANNOTATION__MAX_OFFSET_LEVEL) {
+				goto print_addr;
 			} else {
 				printed = scnprintf(bf, sizeof(bf), "%-*s  ",
 						    notes->widths.addr, " ");
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index db8d09bea07e..f28a9e43421d 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -70,8 +70,17 @@ struct annotation_options {
 	     show_nr_jumps,
 	     show_nr_samples,
 	     show_total_period;
+	u8   offset_level;
 };
 
+enum {
+	ANNOTATION__OFFSET_JUMP_TARGETS = 1,
+	ANNOTATION__OFFSET_CALL,
+	ANNOTATION__MAX_OFFSET_LEVEL,
+};
+
+#define ANNOTATION__MIN_OFFSET_LEVEL ANNOTATION__OFFSET_JUMP_TARGETS
+
 extern struct annotation_options annotation__default_options;
 
 struct annotation;

From 51f39603b5f260c73635f4d06d390476b32df6a5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Apr 2018 10:41:23 -0300
Subject: [PATCH 039/288] perf annotate browser: Allow showing offsets in more
 than just jump targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jesper wanted to see offsets at callq sites when doing some performance
investigation related to retpolines, so save him some time by providing
a 'O' hotkey to allow showing offsets from function start at call
instructions or in all instructions, just go on pressing 'O' till the
offsets you need appear.

Example:

Starts with:

Samples: 64  of event 'cycles:ppp', 100000 Hz, Event count (approx.): 318963
ixgbe_read_reg  /proc/kcore
Percent│    ↑ je     2a
       │   ┌──cmp    $0xffffffff,%r13d
       │   ├──je     d0
       │   │  mov    $0x53e3,%edi
       │   │→ callq  __const_udelay
       │   │  sub    $0x1,%r15d
       │   │↑ jne    83
       │   │  mov    0x8(%rbp),%rax
       │   │  testb  $0x20,0x1799(%rax)
       │   │↑ je     2a
       │   │  mov    0x200(%rax),%rdi
       │   │  mov    %r13d,%edx
       │   │  mov    $0xffffffffc02595d8,%rsi
       │   │→ callq  netdev_warn
       │   │↑ jmpq   2a
       │d0:└─→mov    0x8(%rbp),%rsi
       │      mov    %rbp,%rdi
       │      mov    %eax,0x4(%rsp)
       │    → callq  ixgbe_remove_adapter.isra.77
       │      mov    0x4(%rsp),%eax
Press 'h' for help on key bindings
============================================================================

Pess 'O':

Samples: 64  of event 'cycles:ppp', 100000 Hz, Event count (approx.): 318963
ixgbe_read_reg  /proc/kcore
Percent│    ↑ je     2a
       │   ┌──cmp    $0xffffffff,%r13d
       │   ├──je     d0
       │   │  mov    $0x53e3,%edi
       │99:│→ callq  __const_udelay
       │   │  sub    $0x1,%r15d
       │   │↑ jne    83
       │   │  mov    0x8(%rbp),%rax
       │   │  testb  $0x20,0x1799(%rax)
       │   │↑ je     2a
       │   │  mov    0x200(%rax),%rdi
       │   │  mov    %r13d,%edx
       │   │  mov    $0xffffffffc02595d8,%rsi
       │c6:│→ callq  netdev_warn
       │   │↑ jmpq   2a
       │d0:└─→mov    0x8(%rbp),%rsi
       │      mov    %rbp,%rdi
       │      mov    %eax,0x4(%rsp)
       │db: → callq  ixgbe_remove_adapter.isra.77
       │      mov    0x4(%rsp),%eax
Press 'h' for help on key bindings
============================================================================

Press 'O' again:

Samples: 64  of event 'cycles:ppp', 100000 Hz, Event count (approx.): 318963
ixgbe_read_reg  /proc/kcore
Percent│8c: ↑ je     2a
       │8e:┌──cmp    $0xffffffff,%r13d
       │92:├──je     d0
       │94:│  mov    $0x53e3,%edi
       │99:│→ callq  __const_udelay
       │9e:│  sub    $0x1,%r15d
       │a2:│↑ jne    83
       │a4:│  mov    0x8(%rbp),%rax
       │a8:│  testb  $0x20,0x1799(%rax)
       │af:│↑ je     2a
       │b5:│  mov    0x200(%rax),%rdi
       │bc:│  mov    %r13d,%edx
       │bf:│  mov    $0xffffffffc02595d8,%rsi
       │c6:│→ callq  netdev_warn
       │cb:│↑ jmpq   2a
       │d0:└─→mov    0x8(%rbp),%rsi
       │d4:   mov    %rbp,%rdi
       │d7:   mov    %eax,0x4(%rsp)
       │db: → callq  ixgbe_remove_adapter.isra.77
       │e0:   mov    0x4(%rsp),%eax
Press 'h' for help on key bindings
============================================================================

Press 'O' again and it will show just jump target offsets.

Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Liška <mliska@suse.cz>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-upp6pfdetwlsx18ec2uf1od4@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/browsers/annotate.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 12c099a87f8b..3781d74088a7 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -692,6 +692,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		"J             Toggle showing number of jump sources on targets\n"
 		"n             Search next string\n"
 		"o             Toggle disassembler output/simplified view\n"
+		"O             Bump offset level (jump targets -> +call -> all -> cycle thru)\n"
 		"s             Toggle source code view\n"
 		"t             Circulate percent, total period, samples view\n"
 		"/             Search string\n"
@@ -719,6 +720,10 @@ static int annotate_browser__run(struct annotate_browser *browser,
 			notes->options->use_offset = !notes->options->use_offset;
 			annotation__update_column_widths(notes);
 			continue;
+		case 'O':
+			if (++notes->options->offset_level > ANNOTATION__MAX_OFFSET_LEVEL)
+				notes->options->offset_level = ANNOTATION__MIN_OFFSET_LEVEL;
+			continue;
 		case 'j':
 			notes->options->jump_arrows = !notes->options->jump_arrows;
 			continue;

From e14b733c5dc62f574b4dbc045b2cc52b03d83d4c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Apr 2018 12:08:53 -0300
Subject: [PATCH 040/288] perf jvmti: Give hints about package names needed to
 build

Give as examples of package names to install to have this built for
fedora and debian, to help the user a bit.

The part from 'e.g.:' onwards:

  No openjdk development package found, please install JDK package, e.g. openjdk-8-jdk, java-1.8.0-openjdk-devel

Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: William Cohen <wcohen@redhat.com>
Link: https://lkml.kernel.org/n/tip-edbi4r2pvzn7no6ebxbtczng@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index c7abd83a8e19..6b307e97dc57 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -847,7 +847,7 @@ ifndef NO_JVMTI
   ifeq ($(feature-jvmti), 1)
     $(call detected_var,JDIR)
   else
-    $(warning No openjdk development package found, please install JDK package)
+    $(warning No openjdk development package found, please install JDK package, e.g. openjdk-8-jdk, java-1.8.0-openjdk-devel)
     NO_JVMTI := 1
   endif
 endif

From c13009c1ef3a94cfea212c86bbb94c8361e5de0c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Apr 2018 17:57:32 -0300
Subject: [PATCH 041/288] perf tests bpf: Remove unused ptrace.h include from
 LLVM test

The bpf-script-test-kbuild.c script, used in one of the LLVM subtests,
includes ptrace.h unnecessarily, and that ends up making it include a
header that uses asm(_ASM_SP), a feature that is not supported by clang
<= 4.0, breaking that 'perf test' entry.

This ended up leading to the ca26cffa4e4a ("x86/asm: Allow again using
asm.h when building for the 'bpf' clang target"), adding an ifndef
__BPF__ to the arch/x86/include/asm/asm.h file.

Newer clang versions accept that asm(_ASM_SP) construct, so just remove
the ptrace.h include, which paves the way for reverting ca26cffa4e4a
("x86/asm: Allow again using asm.h when building for the 'bpf' clang
target").

Suggested-by: Yonghong Song <yhs@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lkml.kernel.org/r/613f0a0d-c433-8f4d-dcc1-c9889deae39e@fb.com
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-clbcnzbakdp18ibme4wt43ib@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/bpf-script-test-kbuild.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/tests/bpf-script-test-kbuild.c b/tools/perf/tests/bpf-script-test-kbuild.c
index 3626924740d8..ff3ec8337f0a 100644
--- a/tools/perf/tests/bpf-script-test-kbuild.c
+++ b/tools/perf/tests/bpf-script-test-kbuild.c
@@ -9,7 +9,6 @@
 #define SEC(NAME) __attribute__((section(NAME), used))
 
 #include <uapi/linux/fs.h>
-#include <uapi/asm/ptrace.h>
 
 SEC("func=vfs_llseek")
 int bpf_func__vfs_llseek(void *ctx)

From fd97d39b0aa49a4beb429aec344604c1b689f089 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Apr 2018 18:03:33 -0300
Subject: [PATCH 042/288] Revert "x86/asm: Allow again using asm.h when
 building for the 'bpf' clang target"

This reverts commit ca26cffa4e4aaeb09bb9e308f95c7835cb149248.

Newer clang versions accept that asm(_ASM_SP) construct, and now that
the bpf-script-test-kbuild.c script, used in one of the 'perf test LLVM'
subtests doesn't include ptrace.h, which ended up including
arch/x86/include/asm/asm.h, we can revert this patch.

Suggested-by: Yonghong Song <yhs@fb.com>
Link: https://lkml.kernel.org/r/613f0a0d-c433-8f4d-dcc1-c9889deae39e@fb.com
Acked-by: Yonghong Song <yhs@fb.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-nqozcv8loq40tkqpfw997993@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/x86/include/asm/asm.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 386a6900e206..219faaec51df 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,7 +136,6 @@
 #endif
 
 #ifndef __ASSEMBLY__
-#ifndef __BPF__
 /*
  * This output constraint should be used for any inline asm which has a "call"
  * instruction.  Otherwise the asm may be inserted before the frame pointer
@@ -146,6 +145,5 @@
 register unsigned long current_stack_pointer asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
-#endif
 
 #endif /* _ASM_X86_ASM_H */

From 90ce61b91903f9c357cbceced45d41642f2aa812 Mon Sep 17 00:00:00 2001
From: Jin Yao <yao.jin@linux.intel.com>
Date: Mon, 9 Apr 2018 18:26:47 +0800
Subject: [PATCH 043/288] perf script: Use HAVE_LIBXXX_SUPPORT to replace
 NO_LIBXXX

In Makefile.config, we define the conditional compilation variables
HAVE_LIBPERL_SUPPORT and HAVE_LIBPYTHON_SUPPORT.

To make the C code more consistent, this patch replaces
NO_LIBPERL/NO_LIBPYTHON in C code with HAVE_LIBPERL_SUPPORT/
HAVE_LIBPYTHON_SUPPORT.

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1523269609-28824-2-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c             | 4 ++--
 tools/perf/util/trace-event-scripting.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 313c42423393..b17edbcd98cc 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2801,11 +2801,11 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
 	for_each_lang(scripts_path, scripts_dir, lang_dirent) {
 		scnprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path,
 			  lang_dirent->d_name);
-#ifdef NO_LIBPERL
+#ifndef HAVE_LIBPERL_SUPPORT
 		if (strstr(lang_path, "perl"))
 			continue;
 #endif
-#ifdef NO_LIBPYTHON
+#ifndef HAVE_LIBPYTHON_SUPPORT
 		if (strstr(lang_path, "python"))
 			continue;
 #endif
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 0ac9077f62a2..b1e5c3a2b8e3 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -98,7 +98,7 @@ static void register_python_scripting(struct scripting_ops *scripting_ops)
 	}
 }
 
-#ifdef NO_LIBPYTHON
+#ifndef HAVE_LIBPYTHON_SUPPORT
 void setup_python_scripting(void)
 {
 	register_python_scripting(&python_scripting_unsupported_ops);
@@ -161,7 +161,7 @@ static void register_perl_scripting(struct scripting_ops *scripting_ops)
 	}
 }
 
-#ifdef NO_LIBPERL
+#ifndef HAVE_LIBPERL_SUPPORT
 void setup_perl_scripting(void)
 {
 	register_perl_scripting(&perl_scripting_unsupported_ops);

From 22e9af4e94801bbdf6945e55db64b877be7c71b3 Mon Sep 17 00:00:00 2001
From: Jin Yao <yao.jin@linux.intel.com>
Date: Mon, 9 Apr 2018 18:26:48 +0800
Subject: [PATCH 044/288] perf tools: Rename HAVE_SYSCALL_TABLE to
 HAVE_SYSCALL_TABLE_SUPPORT

To be consistent with other HAVE_XXX_SUPPORT uses in Makefile.config,
this patch renames HAVE_SYSCALL_TABLE to HAVE_SYSCALL_TABLE_SUPPORT and
updates the C code accordingly.

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1523269609-28824-3-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config          | 2 +-
 tools/perf/builtin-help.c           | 2 +-
 tools/perf/perf.c                   | 4 ++--
 tools/perf/util/generate-cmdlist.sh | 2 +-
 tools/perf/util/syscalltbl.c        | 6 +++---
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 6b307e97dc57..ae7dc46e8f8a 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -68,7 +68,7 @@ ifeq ($(NO_PERF_REGS),0)
 endif
 
 ifneq ($(NO_SYSCALL_TABLE),1)
-  CFLAGS += -DHAVE_SYSCALL_TABLE
+  CFLAGS += -DHAVE_SYSCALL_TABLE_SUPPORT
 endif
 
 # So far there's only x86 and arm libdw unwind support merged in perf.
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 4aca13f23b9d..1c41b4eaf73c 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -439,7 +439,7 @@ int cmd_help(int argc, const char **argv)
 #ifdef HAVE_LIBELF_SUPPORT
 		"probe",
 #endif
-#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE)
+#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
 		"trace",
 #endif
 	NULL };
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 1659029d03fc..20a08cb32332 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -73,7 +73,7 @@ static struct cmd_struct commands[] = {
 	{ "lock",	cmd_lock,	0 },
 	{ "kvm",	cmd_kvm,	0 },
 	{ "test",	cmd_test,	0 },
-#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE)
+#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
 	{ "trace",	cmd_trace,	0 },
 #endif
 	{ "inject",	cmd_inject,	0 },
@@ -491,7 +491,7 @@ int main(int argc, const char **argv)
 		argv[0] = cmd;
 	}
 	if (strstarts(cmd, "trace")) {
-#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE)
+#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
 		setup_path();
 		argv[0] = "trace";
 		return cmd_trace(argc, argv);
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
index ff17920a5ebc..c3cef36d4176 100755
--- a/tools/perf/util/generate-cmdlist.sh
+++ b/tools/perf/util/generate-cmdlist.sh
@@ -38,7 +38,7 @@ do
 done
 echo "#endif /* HAVE_LIBELF_SUPPORT */"
 
-echo "#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE)"
+echo "#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)"
 sed -n -e 's/^perf-\([^ 	]*\)[ 	].* audit*/\1/p' command-list.txt |
 sort |
 while read cmd
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index 895122d638dd..0ee7f568d60c 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -17,7 +17,7 @@
 #include <stdlib.h>
 #include <linux/compiler.h>
 
-#ifdef HAVE_SYSCALL_TABLE
+#ifdef HAVE_SYSCALL_TABLE_SUPPORT
 #include <string.h>
 #include "string2.h"
 #include "util.h"
@@ -139,7 +139,7 @@ int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_g
 	return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx);
 }
 
-#else /* HAVE_SYSCALL_TABLE */
+#else /* HAVE_SYSCALL_TABLE_SUPPORT */
 
 #include <libaudit.h>
 
@@ -176,4 +176,4 @@ int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_g
 {
 	return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx);
 }
-#endif /* HAVE_SYSCALL_TABLE */
+#endif /* HAVE_SYSCALL_TABLE_SUPPORT */

From 8a812bf552d98f6f887f860d3910f201b4a97b26 Mon Sep 17 00:00:00 2001
From: Jin Yao <yao.jin@linux.intel.com>
Date: Mon, 9 Apr 2018 18:26:49 +0800
Subject: [PATCH 045/288] perf version: Print status for syscall_table

This patch doesn't print "libaudit" line if HAVE_SYSCALL_TABLE_SUPPORT
is available and add a line for HAVE_SYSCALL_TABLE_SUPPORT.

For example,

$ ./perf -vv
perf version 4.13.rc5.gc2f8af9
                 dwarf: [ on  ]  # HAVE_DWARF_SUPPORT
    dwarf_getlocations: [ on  ]  # HAVE_DWARF_GETLOCATIONS_SUPPORT
                 glibc: [ on  ]  # HAVE_GLIBC_SUPPORT
                  gtk2: [ on  ]  # HAVE_GTK2_SUPPORT
         syscall_table: [ on  ]  # HAVE_SYSCALL_TABLE_SUPPORT
                libbfd: [ on  ]  # HAVE_LIBBFD_SUPPORT
                libelf: [ on  ]  # HAVE_LIBELF_SUPPORT
               libnuma: [ on  ]  # HAVE_LIBNUMA_SUPPORT
numa_num_possible_cpus: [ on  ]  # HAVE_LIBNUMA_SUPPORT
               libperl: [ on  ]  # HAVE_LIBPERL_SUPPORT
             libpython: [ on  ]  # HAVE_LIBPYTHON_SUPPORT
              libslang: [ on  ]  # HAVE_SLANG_SUPPORT
             libcrypto: [ on  ]  # HAVE_LIBCRYPTO_SUPPORT
             libunwind: [ on  ]  # HAVE_LIBUNWIND_SUPPORT
    libdw-dwarf-unwind: [ on  ]  # HAVE_DWARF_SUPPORT
                  zlib: [ on  ]  # HAVE_ZLIB_SUPPORT
                  lzma: [ on  ]  # HAVE_LZMA_SUPPORT
             get_cpuid: [ on  ]  # HAVE_AUXTRACE_SUPPORT
                   bpf: [ on  ]  # HAVE_LIBBPF_SUPPORT

The line "syscall_table: [ on  ]  # HAVE_SYSCALL_TABLE_SUPPORT" is
new created.

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1523269609-28824-4-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-version.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c
index 2abe3910d6b6..50df168be326 100644
--- a/tools/perf/builtin-version.c
+++ b/tools/perf/builtin-version.c
@@ -60,7 +60,10 @@ static void library_status(void)
 	STATUS(HAVE_DWARF_GETLOCATIONS_SUPPORT, dwarf_getlocations);
 	STATUS(HAVE_GLIBC_SUPPORT, glibc);
 	STATUS(HAVE_GTK2_SUPPORT, gtk2);
+#ifndef HAVE_SYSCALL_TABLE_SUPPORT
 	STATUS(HAVE_LIBAUDIT_SUPPORT, libaudit);
+#endif
+	STATUS(HAVE_SYSCALL_TABLE_SUPPORT, syscall_table);
 	STATUS(HAVE_LIBBFD_SUPPORT, libbfd);
 	STATUS(HAVE_LIBELF_SUPPORT, libelf);
 	STATUS(HAVE_LIBNUMA_SUPPORT, libnuma);

From e8103e44cebe2ef891bc3a5c6c4b74854846968b Mon Sep 17 00:00:00 2001
From: Takuya Yamamoto <tkydevel@gmail.com>
Date: Tue, 10 Apr 2018 23:35:39 +0900
Subject: [PATCH 046/288] perf sched: Fix documentation for timehist

Fixed a incorrect option and usage to those shown by "perf sched timehist -h",
i.e. the default is really --call-graph, which is equivalent to -g.

Signed-off-by: Takuya Yamamoto <tkydevel@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/n/tip-8fzo0dlsi1mku5aqx8brep5s@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-sched.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index bb33601a823b..63f938b887dd 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -104,8 +104,8 @@ OPTIONS for 'perf sched timehist'
     kallsyms pathname
 
 -g::
---no-call-graph::
-	Do not display call chains if present.
+--call-graph::
+	Display call chains if present (default on).
 
 --max-stack::
 	Maximum number of functions to display in backtrace, default 5.

From f0f4cf5b306620282db0c59ff963012e1973e025 Mon Sep 17 00:00:00 2001
From: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Date: Wed, 11 Apr 2018 01:10:16 -0400
Subject: [PATCH 047/288] x86: Add check for APIC access address for vmentry of
 L2 guests

According to the sub-section titled 'VM-Execution Control Fields' in the
section titled 'Basic VM-Entry Checks' in Intel SDM vol. 3C, the following
vmentry check must be enforced:

    If the 'virtualize APIC-accesses' VM-execution control is 1, the
    APIC-access address must satisfy the following checks:

	- Bits 11:0 of the address must be 0.
	- The address should not set any bits beyond the processor's
	  physical-address width.

This patch adds the necessary check to conform to this rule. If the check
fails, we cause the L2 VMENTRY to fail which is what the associated unit
test (following patch) expects.

Reviewed-by: Mihai Carabas <mihai.carabas@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe3e13edf201..a13c603bdefb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10602,6 +10602,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
+		return -EINVAL;
+	else
+		return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 					   struct vmcs12 *vmcs12)
 {
@@ -11293,6 +11303,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 

From 4e1acd7b31a03f24cc6108d37d005e6b1d48c5d3 Mon Sep 17 00:00:00 2001
From: Peng Hao <peng.hao2@zte.com.cn>
Date: Fri, 13 Apr 2018 08:36:30 +0800
Subject: [PATCH 048/288] kvm: selftests: add -std=gnu99 cflags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lib/kvm_util.c: In function ‘kvm_memcmp_hva_gva’:
lib/kvm_util.c:332:2: error: ‘for’ loop initial declarations are only allowed in C99 mode

So add -std=gnu99 to CFLAGS

Signed-off-by: Peng Hao <peng.hao2@zte.com.cn>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index dc44de904797..1da541e1ab75 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -14,7 +14,7 @@ LIBKVM += $(LIBKVM_$(UNAME_M))
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
-CFLAGS += -O2 -g -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
+CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
 
 # After inclusion, $(OUTPUT) is defined and
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/

From 4b163ca343b45855b03114ef2ab47c454989d55c Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 12 Apr 2018 21:51:40 +0530
Subject: [PATCH 049/288] perf tests: Disable breakpoint accounting test for
 powerpc

We disable this test as instruction breakpoints (HW_BREAKPOINT_X) are
not available for powerpc.

Before applying patch:

  21: Breakpoint accounting                                 :
  --- start ---
  test child forked, pid 3635
  failed opening event 0
  failed opening event 0
  watchpoints count 1, breakpoints count 0, has_ioctl 1, share 0
  test child finished with -2
  ---- end ----
  Breakpoint accounting: Skip

After applying patch:

  21: Breakpoint accounting                                 : Disabled

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/20180412162140.2992-1-sandipan@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/builtin-test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 625f5a6772af..cac8f8889bc3 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -118,6 +118,7 @@ static struct test generic_tests[] = {
 	{
 		.desc = "Breakpoint accounting",
 		.func = test__bp_accounting,
+		.is_supported = test__bp_signal_is_supported,
 	},
 	{
 		.desc = "Number of exit events of a simple workload",

From f6b7aeee8f167409195fbf1364d02988fecad1d0 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Tue, 3 Apr 2018 08:55:03 -0400
Subject: [PATCH 050/288] MIPS: io: Prevent compiler reordering writeX()

writeX() has strong ordering semantics with respect to memory updates.
In the absence of a write barrier or a compiler barrier, the compiler
can reorder register and memory update instructions. This breaks the
writeX() API.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/18997/
[jhogan@kernel.org: Tidy commit message]
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/include/asm/io.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h
index 0cbf3af37eca..fd00ddafb425 100644
--- a/arch/mips/include/asm/io.h
+++ b/arch/mips/include/asm/io.h
@@ -307,7 +307,7 @@ static inline void iounmap(const volatile void __iomem *addr)
 #if defined(CONFIG_CPU_CAVIUM_OCTEON) || defined(CONFIG_LOONGSON3_ENHANCEMENT)
 #define war_io_reorder_wmb()		wmb()
 #else
-#define war_io_reorder_wmb()		do { } while (0)
+#define war_io_reorder_wmb()		barrier()
 #endif
 
 #define __BUILD_MEMORY_SINGLE(pfx, bwlq, type, irq)			\

From 08ea556e14b56e9a49b19abd8e39f0c9e05582f2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Apr 2018 15:26:43 -0700
Subject: [PATCH 051/288] ibmvnic: Define vnic_login_client_data name field as
 unsized array

The "name" field of struct vnic_login_client_data is a char array of
undefined length. This should be written as "char name[]" so the compiler
can make better decisions about the field (for example, not assuming
it's a single character). This was noticed while trying to tighten the
CONFIG_FORTIFY_SOURCE checking.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index aad5658d79d5..35fbb41cd2d4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3170,7 +3170,7 @@ static int send_version_xchg(struct ibmvnic_adapter *adapter)
 struct vnic_login_client_data {
 	u8	type;
 	__be16	len;
-	char	name;
+	char	name[];
 } __packed;
 
 static int vnic_client_data_len(struct ibmvnic_adapter *adapter)
@@ -3199,21 +3199,21 @@ static void vnic_add_client_data(struct ibmvnic_adapter *adapter,
 	vlcd->type = 1;
 	len = strlen(os_name) + 1;
 	vlcd->len = cpu_to_be16(len);
-	strncpy(&vlcd->name, os_name, len);
-	vlcd = (struct vnic_login_client_data *)((char *)&vlcd->name + len);
+	strncpy(vlcd->name, os_name, len);
+	vlcd = (struct vnic_login_client_data *)(vlcd->name + len);
 
 	/* Type 2 - LPAR name */
 	vlcd->type = 2;
 	len = strlen(utsname()->nodename) + 1;
 	vlcd->len = cpu_to_be16(len);
-	strncpy(&vlcd->name, utsname()->nodename, len);
-	vlcd = (struct vnic_login_client_data *)((char *)&vlcd->name + len);
+	strncpy(vlcd->name, utsname()->nodename, len);
+	vlcd = (struct vnic_login_client_data *)(vlcd->name + len);
 
 	/* Type 3 - device name */
 	vlcd->type = 3;
 	len = strlen(adapter->netdev->name) + 1;
 	vlcd->len = cpu_to_be16(len);
-	strncpy(&vlcd->name, adapter->netdev->name, len);
+	strncpy(vlcd->name, adapter->netdev->name, len);
 }
 
 static int send_login(struct ibmvnic_adapter *adapter)

From b16520f7493d06d8ef6d4255bdfcf7a803d7874a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Apr 2018 17:52:34 -0700
Subject: [PATCH 052/288] net/tls: Remove VLA usage

In the quest to remove VLAs from the kernel[1], this replaces the VLA
size with the only possible size used in the code, and adds a mechanism
to double-check future IV sizes.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4dc766b03f00..71e79597f940 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -41,6 +41,8 @@
 #include <net/strparser.h>
 #include <net/tls.h>
 
+#define MAX_IV_SIZE	TLS_CIPHER_AES_GCM_128_IV_SIZE
+
 static int tls_do_decryption(struct sock *sk,
 			     struct scatterlist *sgin,
 			     struct scatterlist *sgout,
@@ -673,7 +675,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
-	char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size];
+	char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
 	struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
 	struct scatterlist *sgin = &sgin_arr[0];
 	struct strp_msg *rxm = strp_msg(skb);
@@ -1094,6 +1096,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_priv;
 	}
 
+	/* Sanity-check the IV size for stack allocations. */
+	if (iv_size > MAX_IV_SIZE) {
+		rc = -EINVAL;
+		goto free_priv;
+	}
+
 	cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
 	cctx->tag_size = tag_size;
 	cctx->overhead_size = cctx->prepend_size + cctx->tag_size;

From 9a4381618262157586051f5ba0db42df3c6ab4b5 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 10 Apr 2018 18:04:29 -0700
Subject: [PATCH 053/288] mISDN: Remove VLAs

There's an ongoing effort to remove VLAs[1] from the kernel to eventually
turn on -Wvla. Remove the VLAs from the mISDN code by switching to using
kstrdup in one place and using an upper bound in another.

Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/mISDN/dsp_hwec.c   |  8 +++++---
 drivers/isdn/mISDN/l1oip_core.c | 14 +++++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/isdn/mISDN/dsp_hwec.c b/drivers/isdn/mISDN/dsp_hwec.c
index a6e87076acc2..5336bbdbfdc5 100644
--- a/drivers/isdn/mISDN/dsp_hwec.c
+++ b/drivers/isdn/mISDN/dsp_hwec.c
@@ -68,12 +68,12 @@ void dsp_hwec_enable(struct dsp *dsp, const char *arg)
 		goto _do;
 
 	{
-		char _dup[len + 1];
 		char *dup, *tok, *name, *val;
 		int tmp;
 
-		strcpy(_dup, arg);
-		dup = _dup;
+		dup = kstrdup(arg, GFP_ATOMIC);
+		if (!dup)
+			return;
 
 		while ((tok = strsep(&dup, ","))) {
 			if (!strlen(tok))
@@ -89,6 +89,8 @@ void dsp_hwec_enable(struct dsp *dsp, const char *arg)
 					deftaps = tmp;
 			}
 		}
+
+		kfree(dup);
 	}
 
 _do:
diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c
index 21d50e4cc5e1..b05022f94f18 100644
--- a/drivers/isdn/mISDN/l1oip_core.c
+++ b/drivers/isdn/mISDN/l1oip_core.c
@@ -279,7 +279,7 @@ l1oip_socket_send(struct l1oip *hc, u8 localcodec, u8 channel, u32 chanmask,
 		  u16 timebase, u8 *buf, int len)
 {
 	u8 *p;
-	u8 frame[len + 32];
+	u8 frame[MAX_DFRAME_LEN_L1 + 32];
 	struct socket *socket = NULL;
 
 	if (debug & DEBUG_L1OIP_MSG)
@@ -902,7 +902,11 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb)
 		p = skb->data;
 		l = skb->len;
 		while (l) {
-			ll = (l < L1OIP_MAX_PERFRAME) ? l : L1OIP_MAX_PERFRAME;
+			/*
+			 * This is technically bounded by L1OIP_MAX_PERFRAME but
+			 * MAX_DFRAME_LEN_L1 < L1OIP_MAX_PERFRAME
+			 */
+			ll = (l < MAX_DFRAME_LEN_L1) ? l : MAX_DFRAME_LEN_L1;
 			l1oip_socket_send(hc, 0, dch->slot, 0,
 					  hc->chan[dch->slot].tx_counter++, p, ll);
 			p += ll;
@@ -1140,7 +1144,11 @@ handle_bmsg(struct mISDNchannel *ch, struct sk_buff *skb)
 		p = skb->data;
 		l = skb->len;
 		while (l) {
-			ll = (l < L1OIP_MAX_PERFRAME) ? l : L1OIP_MAX_PERFRAME;
+			/*
+			 * This is technically bounded by L1OIP_MAX_PERFRAME but
+			 * MAX_DFRAME_LEN_L1 < L1OIP_MAX_PERFRAME
+			 */
+			ll = (l < MAX_DFRAME_LEN_L1) ? l : MAX_DFRAME_LEN_L1;
 			l1oip_socket_send(hc, hc->codec, bch->slot, 0,
 					  hc->chan[bch->slot].tx_counter, p, ll);
 			hc->chan[bch->slot].tx_counter += ll;

From 1c2734b31d72316e3faaad88c0c9c46fa92a4b20 Mon Sep 17 00:00:00 2001
From: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Date: Wed, 11 Apr 2018 20:36:36 +0530
Subject: [PATCH 054/288] lan78xx: PHY DSP registers initialization to address
 EEE link drop issues with long cables

The patch is to configure DSP registers of PHY device
to handle Gbe-EEE failures with >40m cable length.

Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver")
Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/microchip.c  | 178 ++++++++++++++++++++++++++++++++++-
 include/linux/microchipphy.h |   8 ++
 2 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
index 0f293ef28935..a97ac8c12c4c 100644
--- a/drivers/net/phy/microchip.c
+++ b/drivers/net/phy/microchip.c
@@ -20,6 +20,7 @@
 #include <linux/ethtool.h>
 #include <linux/phy.h>
 #include <linux/microchipphy.h>
+#include <linux/delay.h>
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC	"Microchip LAN88XX PHY driver"
@@ -30,6 +31,16 @@ struct lan88xx_priv {
 	__u32	wolopts;
 };
 
+static int lan88xx_read_page(struct phy_device *phydev)
+{
+	return __phy_read(phydev, LAN88XX_EXT_PAGE_ACCESS);
+}
+
+static int lan88xx_write_page(struct phy_device *phydev, int page)
+{
+	return __phy_write(phydev, LAN88XX_EXT_PAGE_ACCESS, page);
+}
+
 static int lan88xx_phy_config_intr(struct phy_device *phydev)
 {
 	int rc;
@@ -66,6 +77,150 @@ static int lan88xx_suspend(struct phy_device *phydev)
 	return 0;
 }
 
+static int lan88xx_TR_reg_set(struct phy_device *phydev, u16 regaddr,
+			      u32 data)
+{
+	int val, save_page, ret = 0;
+	u16 buf;
+
+	/* Save current page */
+	save_page = phy_save_page(phydev);
+	if (save_page < 0) {
+		pr_warn("Failed to get current page\n");
+		goto err;
+	}
+
+	/* Switch to TR page */
+	lan88xx_write_page(phydev, LAN88XX_EXT_PAGE_ACCESS_TR);
+
+	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_LOW_DATA,
+			  (data & 0xFFFF));
+	if (ret < 0) {
+		pr_warn("Failed to write TR low data\n");
+		goto err;
+	}
+
+	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_HIGH_DATA,
+			  (data & 0x00FF0000) >> 16);
+	if (ret < 0) {
+		pr_warn("Failed to write TR high data\n");
+		goto err;
+	}
+
+	/* Config control bits [15:13] of register */
+	buf = (regaddr & ~(0x3 << 13));/* Clr [14:13] to write data in reg */
+	buf |= 0x8000; /* Set [15] to Packet transmit */
+
+	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_CR, buf);
+	if (ret < 0) {
+		pr_warn("Failed to write data in reg\n");
+		goto err;
+	}
+
+	usleep_range(1000, 2000);/* Wait for Data to be written */
+	val = __phy_read(phydev, LAN88XX_EXT_PAGE_TR_CR);
+	if (!(val & 0x8000))
+		pr_warn("TR Register[0x%X] configuration failed\n", regaddr);
+err:
+	return phy_restore_page(phydev, save_page, ret);
+}
+
+static void lan88xx_config_TR_regs(struct phy_device *phydev)
+{
+	int err;
+
+	/* Get access to Channel 0x1, Node 0xF , Register 0x01.
+	 * Write 24-bit value 0x12B00A to register. Setting MrvlTrFix1000Kf,
+	 * MrvlTrFix1000Kp, MasterEnableTR bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x0F82, 0x12B00A);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x0F82]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x06.
+	 * Write 24-bit value 0xD2C46F to register. Setting SSTrKf1000Slv,
+	 * SSTrKp1000Mas bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x168C, 0xD2C46F);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x168C]\n");
+
+	/* Get access to Channel b'10, Node b'1111, Register 0x11.
+	 * Write 24-bit value 0x620 to register. Setting rem_upd_done_thresh
+	 * bits
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x17A2, 0x620);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x17A2]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x10.
+	 * Write 24-bit value 0xEEFFDD to register. Setting
+	 * eee_TrKp1Long_1000, eee_TrKp2Long_1000, eee_TrKp3Long_1000,
+	 * eee_TrKp1Short_1000,eee_TrKp2Short_1000, eee_TrKp3Short_1000 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x16A0, 0xEEFFDD);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x16A0]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x13.
+	 * Write 24-bit value 0x071448 to register. Setting
+	 * slv_lpi_tr_tmr_val1, slv_lpi_tr_tmr_val2 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x16A6, 0x071448);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x16A6]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x12.
+	 * Write 24-bit value 0x13132F to register. Setting
+	 * slv_sigdet_timer_val1, slv_sigdet_timer_val2 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x16A4, 0x13132F);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x16A4]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x14.
+	 * Write 24-bit value 0x0 to register. Setting eee_3level_delay,
+	 * eee_TrKf_freeze_delay bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x16A8, 0x0);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x16A8]\n");
+
+	/* Get access to Channel b'01, Node b'1111, Register 0x34.
+	 * Write 24-bit value 0x91B06C to register. Setting
+	 * FastMseSearchThreshLong1000, FastMseSearchThreshShort1000,
+	 * FastMseSearchUpdGain1000 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x0FE8, 0x91B06C);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x0FE8]\n");
+
+	/* Get access to Channel b'01, Node b'1111, Register 0x3E.
+	 * Write 24-bit value 0xC0A028 to register. Setting
+	 * FastMseKp2ThreshLong1000, FastMseKp2ThreshShort1000,
+	 * FastMseKp2UpdGain1000, FastMseKp2ExitEn1000 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x0FFC, 0xC0A028);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x0FFC]\n");
+
+	/* Get access to Channel b'01, Node b'1111, Register 0x35.
+	 * Write 24-bit value 0x041600 to register. Setting
+	 * FastMseSearchPhShNum1000, FastMseSearchClksPerPh1000,
+	 * FastMsePhChangeDelay1000 bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x0FEA, 0x041600);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x0FEA]\n");
+
+	/* Get access to Channel b'10, Node b'1101, Register 0x03.
+	 * Write 24-bit value 0x000004 to register. Setting TrFreeze bits.
+	 */
+	err = lan88xx_TR_reg_set(phydev, 0x1686, 0x000004);
+	if (err < 0)
+		pr_warn("Failed to Set Register[0x1686]\n");
+}
+
 static int lan88xx_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
@@ -132,6 +287,25 @@ static void lan88xx_set_mdix(struct phy_device *phydev)
 	phy_write(phydev, LAN88XX_EXT_PAGE_ACCESS, LAN88XX_EXT_PAGE_SPACE_0);
 }
 
+static int lan88xx_config_init(struct phy_device *phydev)
+{
+	int val;
+
+	genphy_config_init(phydev);
+	/*Zerodetect delay enable */
+	val = phy_read_mmd(phydev, MDIO_MMD_PCS,
+			   PHY_ARDENNES_MMD_DEV_3_PHY_CFG);
+	val |= PHY_ARDENNES_MMD_DEV_3_PHY_CFG_ZD_DLY_EN_;
+
+	phy_write_mmd(phydev, MDIO_MMD_PCS, PHY_ARDENNES_MMD_DEV_3_PHY_CFG,
+		      val);
+
+	/* Config DSP registers */
+	lan88xx_config_TR_regs(phydev);
+
+	return 0;
+}
+
 static int lan88xx_config_aneg(struct phy_device *phydev)
 {
 	lan88xx_set_mdix(phydev);
@@ -151,7 +325,7 @@ static struct phy_driver microchip_phy_driver[] = {
 	.probe		= lan88xx_probe,
 	.remove		= lan88xx_remove,
 
-	.config_init	= genphy_config_init,
+	.config_init	= lan88xx_config_init,
 	.config_aneg	= lan88xx_config_aneg,
 
 	.ack_interrupt	= lan88xx_phy_ack_interrupt,
@@ -160,6 +334,8 @@ static struct phy_driver microchip_phy_driver[] = {
 	.suspend	= lan88xx_suspend,
 	.resume		= genphy_resume,
 	.set_wol	= lan88xx_set_wol,
+	.read_page	= lan88xx_read_page,
+	.write_page	= lan88xx_write_page,
 } };
 
 module_phy_driver(microchip_phy_driver);
diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h
index eb492d47f717..8f9c90379732 100644
--- a/include/linux/microchipphy.h
+++ b/include/linux/microchipphy.h
@@ -70,4 +70,12 @@
 #define	LAN88XX_MMD3_CHIP_ID			(32877)
 #define	LAN88XX_MMD3_CHIP_REV			(32878)
 
+/* DSP registers */
+#define PHY_ARDENNES_MMD_DEV_3_PHY_CFG		(0x806A)
+#define PHY_ARDENNES_MMD_DEV_3_PHY_CFG_ZD_DLY_EN_	(0x2000)
+#define LAN88XX_EXT_PAGE_ACCESS_TR		(0x52B5)
+#define LAN88XX_EXT_PAGE_TR_CR			16
+#define LAN88XX_EXT_PAGE_TR_LOW_DATA		17
+#define LAN88XX_EXT_PAGE_TR_HIGH_DATA		18
+
 #endif /* _MICROCHIPPHY_H */

From c3317f4db831b7564ff8d1670326456a7fbbbcb3 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 11 Apr 2018 22:52:09 +0200
Subject: [PATCH 055/288] tipc: fix unbalanced reference counter

When a topology subscription is created, we may encounter (or KASAN
may provoke) a failure to create a corresponding service instance in
the binding table. Instead of letting the tipc_nametbl_subscribe()
report the failure back to the caller, the function just makes a warning
printout and returns, without incrementing the subscription reference
counter as expected by the caller.

This makes the caller believe that the subscription was successful, so
it will at a later moment try to unsubscribe the item. This involves
a sub_put() call. Since the reference counter never was incremented
in the first place, we get a premature delete of the subscription item,
followed by a "use-after-free" warning.

We fix this by adding a return value to tipc_nametbl_subscribe() and
make the caller aware of the failure to subscribe.

This bug seems to always have been around, but this fix only applies
back to the commit shown below. Given the low risk of this happening
we believe this to be sufficient.

Fixes: commit 218527fe27ad ("tipc: replace name table service range
array with rb tree")
Reported-by: syzbot+aa245f26d42b8305d157@syzkaller.appspotmail.com

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 5 ++++-
 net/tipc/name_table.h | 2 +-
 net/tipc/subscr.c     | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index b1fe20972aa9..4068eaad61a6 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -665,13 +665,14 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
 /**
  * tipc_nametbl_subscribe - add a subscription object to the name table
  */
-void tipc_nametbl_subscribe(struct tipc_subscription *sub)
+bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
 {
 	struct name_table *nt = tipc_name_table(sub->net);
 	struct tipc_net *tn = tipc_net(sub->net);
 	struct tipc_subscr *s = &sub->evt.s;
 	u32 type = tipc_sub_read(s, seq.type);
 	struct tipc_service *sc;
+	bool res = true;
 
 	spin_lock_bh(&tn->nametbl_lock);
 	sc = tipc_service_find(sub->net, type);
@@ -685,8 +686,10 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
 		pr_warn("Failed to subscribe for {%u,%u,%u}\n", type,
 			tipc_sub_read(s, seq.lower),
 			tipc_sub_read(s, seq.upper));
+		res = false;
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
+	return res;
 }
 
 /**
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 4b14fc28d9e2..0febba41da86 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -126,7 +126,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 lower, u32 upper,
 					     u32 node, u32 key);
-void tipc_nametbl_subscribe(struct tipc_subscription *s);
+bool tipc_nametbl_subscribe(struct tipc_subscription *s);
 void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index b7d80bc5f4ab..f340e53da625 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -153,7 +153,10 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net,
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	spin_lock_init(&sub->lock);
 	kref_init(&sub->kref);
-	tipc_nametbl_subscribe(sub);
+	if (!tipc_nametbl_subscribe(sub)) {
+		kfree(sub);
+		return NULL;
+	}
 	timer_setup(&sub->timer, tipc_sub_timeout, 0);
 	timeout = tipc_sub_read(&sub->evt.s, timeout);
 	if (timeout != TIPC_WAIT_FOREVER)

From 7212303268918b9a203aebeacfdbd83b5e87b20d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Apr 2018 14:36:28 -0700
Subject: [PATCH 056/288] tcp: md5: reject TCP_MD5SIG or TCP_MD5SIG_EXT on
 established sockets

syzbot/KMSAN reported an uninit-value in tcp_parse_options() [1]

I believe this was caused by a TCP_MD5SIG being set on live
flow.

This is highly unexpected, since TCP option space is limited.

For instance, presence of TCP MD5 option automatically disables
TCP TimeStamp option at SYN/SYNACK time, which we can not do
once flow has been established.

Really, adding/deleting an MD5 key only makes sense on sockets
in CLOSE or LISTEN state.

[1]
BUG: KMSAN: uninit-value in tcp_parse_options+0xd74/0x1a30 net/ipv4/tcp_input.c:3720
CPU: 1 PID: 6177 Comm: syzkaller192004 Not tainted 4.16.0+ #83
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:53
 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676
 tcp_parse_options+0xd74/0x1a30 net/ipv4/tcp_input.c:3720
 tcp_fast_parse_options net/ipv4/tcp_input.c:3858 [inline]
 tcp_validate_incoming+0x4f1/0x2790 net/ipv4/tcp_input.c:5184
 tcp_rcv_established+0xf60/0x2bb0 net/ipv4/tcp_input.c:5453
 tcp_v4_do_rcv+0x6cd/0xd90 net/ipv4/tcp_ipv4.c:1469
 sk_backlog_rcv include/net/sock.h:908 [inline]
 __release_sock+0x2d6/0x680 net/core/sock.c:2271
 release_sock+0x97/0x2a0 net/core/sock.c:2786
 tcp_sendmsg+0xd6/0x100 net/ipv4/tcp.c:1464
 inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764
 sock_sendmsg_nosec net/socket.c:630 [inline]
 sock_sendmsg net/socket.c:640 [inline]
 SYSC_sendto+0x6c3/0x7e0 net/socket.c:1747
 SyS_sendto+0x8a/0xb0 net/socket.c:1715
 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x448fe9
RSP: 002b:00007fd472c64d38 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000006e5a30 RCX: 0000000000448fe9
RDX: 000000000000029f RSI: 0000000020a88f88 RDI: 0000000000000004
RBP: 00000000006e5a34 R08: 0000000020e68000 R09: 0000000000000010
R10: 00000000200007fd R11: 0000000000000216 R12: 0000000000000000
R13: 00007fff074899ef R14: 00007fd472c659c0 R15: 0000000000000009

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
 kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
 slab_post_alloc_hook mm/slab.h:445 [inline]
 slab_alloc_node mm/slub.c:2737 [inline]
 __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
 __kmalloc_reserve net/core/skbuff.c:138 [inline]
 __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
 alloc_skb include/linux/skbuff.h:984 [inline]
 tcp_send_ack+0x18c/0x910 net/ipv4/tcp_output.c:3624
 __tcp_ack_snd_check net/ipv4/tcp_input.c:5040 [inline]
 tcp_ack_snd_check net/ipv4/tcp_input.c:5053 [inline]
 tcp_rcv_established+0x2103/0x2bb0 net/ipv4/tcp_input.c:5469
 tcp_v4_do_rcv+0x6cd/0xd90 net/ipv4/tcp_ipv4.c:1469
 sk_backlog_rcv include/net/sock.h:908 [inline]
 __release_sock+0x2d6/0x680 net/core/sock.c:2271
 release_sock+0x97/0x2a0 net/core/sock.c:2786
 tcp_sendmsg+0xd6/0x100 net/ipv4/tcp.c:1464
 inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764
 sock_sendmsg_nosec net/socket.c:630 [inline]
 sock_sendmsg net/socket.c:640 [inline]
 SYSC_sendto+0x6c3/0x7e0 net/socket.c:1747
 SyS_sendto+0x8a/0xb0 net/socket.c:1715
 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bccc4c270087..4fa3f812b9ff 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2813,8 +2813,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 #ifdef CONFIG_TCP_MD5SIG
 	case TCP_MD5SIG:
 	case TCP_MD5SIG_EXT:
-		/* Read the IP->Key mappings from userspace */
-		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+			err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+		else
+			err = -EINVAL;
 		break;
 #endif
 	case TCP_USER_TIMEOUT:

From 7dd07c143a4b54d050e748bee4b4b9e94a7b1744 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Apr 2018 14:46:00 -0700
Subject: [PATCH 057/288] net: validate attribute sizes in neigh_dump_table()

Since neigh_dump_table() calls nlmsg_parse() without giving policy
constraints, attributes can have arbirary size that we must validate

Reported by syzbot/KMSAN :

BUG: KMSAN: uninit-value in neigh_master_filtered net/core/neighbour.c:2292 [inline]
BUG: KMSAN: uninit-value in neigh_dump_table net/core/neighbour.c:2348 [inline]
BUG: KMSAN: uninit-value in neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438
CPU: 1 PID: 3575 Comm: syzkaller268891 Not tainted 4.16.0+ #83
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:53
 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676
 neigh_master_filtered net/core/neighbour.c:2292 [inline]
 neigh_dump_table net/core/neighbour.c:2348 [inline]
 neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438
 netlink_dump+0x9ad/0x1540 net/netlink/af_netlink.c:2225
 __netlink_dump_start+0x1167/0x12a0 net/netlink/af_netlink.c:2322
 netlink_dump_start include/linux/netlink.h:214 [inline]
 rtnetlink_rcv_msg+0x1435/0x1560 net/core/rtnetlink.c:4598
 netlink_rcv_skb+0x355/0x5f0 net/netlink/af_netlink.c:2447
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4653
 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline]
 netlink_unicast+0x1672/0x1750 net/netlink/af_netlink.c:1337
 netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900
 sock_sendmsg_nosec net/socket.c:630 [inline]
 sock_sendmsg net/socket.c:640 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
 __sys_sendmsg net/socket.c:2080 [inline]
 SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091
 SyS_sendmsg+0x54/0x80 net/socket.c:2087
 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x43fed9
RSP: 002b:00007ffddbee2798 EFLAGS: 00000213 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fed9
RDX: 0000000000000000 RSI: 0000000020005000 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8
R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401800
R13: 0000000000401890 R14: 0000000000000000 R15: 0000000000000000

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
 kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
 slab_post_alloc_hook mm/slab.h:445 [inline]
 slab_alloc_node mm/slub.c:2737 [inline]
 __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
 __kmalloc_reserve net/core/skbuff.c:138 [inline]
 __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
 alloc_skb include/linux/skbuff.h:984 [inline]
 netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline]
 netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875
 sock_sendmsg_nosec net/socket.c:630 [inline]
 sock_sendmsg net/socket.c:640 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
 __sys_sendmsg net/socket.c:2080 [inline]
 SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091
 SyS_sendmsg+0x54/0x80 net/socket.c:2087
 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Fixes: 21fdd092acc7 ("net: Add support for filtering neigh dump by master device")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsa@cumulusnetworks.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7b7a14abba28..a8bc02bb339f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2323,12 +2323,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 
 	err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
 	if (!err) {
-		if (tb[NDA_IFINDEX])
+		if (tb[NDA_IFINDEX]) {
+			if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
+				return -EINVAL;
 			filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
-
-		if (tb[NDA_MASTER])
+		}
+		if (tb[NDA_MASTER]) {
+			if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
+				return -EINVAL;
 			filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
-
+		}
 		if (filter_idx || filter_master_idx)
 			flags |= NLM_F_DUMP_FILTERED;
 	}

From 64d92aa2c9fe490ceffc440d7648ce369cd6cc3c Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 11 Apr 2018 10:09:32 -0500
Subject: [PATCH 058/288] ibmvnic: Handle all login error conditions

There is a bug in handling the possible return codes from sending the
login CRQ. The current code treats any non-success return value,
minus failure to send the crq and a timeout waiting for a login response,
as a need to re-send the login CRQ. This can put the drive in an
infinite loop of trying to login when getting return values other
that a partial success such as a return code of aborted. For these
scenarios the login will not ever succeed at this point and the
driver would need to be reset again.

To resolve this loop trying to login is updated to only retry the
login if the driver gets a return code of a partial success. Other
return codes are treated as an error and the driver returns an error
from ibmvnic_login().

To avoid infinite looping in the partial success return cases, the
number of retries is capped at the maximum number of supported
queues. This value was chosen because the driver does a renegotiation
of capabilities which sets the number of queues possible and allows
the driver to attempt a login for possible value for the number
of queues supported.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 55 +++++++++++++++++++-----------
 drivers/net/ethernet/ibm/ibmvnic.h |  1 -
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 35fbb41cd2d4..d35f29d7af91 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -794,46 +794,61 @@ static int ibmvnic_login(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	unsigned long timeout = msecs_to_jiffies(30000);
-	struct device *dev = &adapter->vdev->dev;
+	int retry_count = 0;
 	int rc;
 
 	do {
-		if (adapter->renegotiate) {
-			adapter->renegotiate = false;
+		if (retry_count > IBMVNIC_MAX_QUEUES) {
+			netdev_warn(netdev, "Login attempts exceeded\n");
+			return -1;
+		}
+
+		adapter->init_done_rc = 0;
+		reinit_completion(&adapter->init_done);
+		rc = send_login(adapter);
+		if (rc) {
+			netdev_warn(netdev, "Unable to login\n");
+			return rc;
+		}
+
+		if (!wait_for_completion_timeout(&adapter->init_done,
+						 timeout)) {
+			netdev_warn(netdev, "Login timed out\n");
+			return -1;
+		}
+
+		if (adapter->init_done_rc == PARTIALSUCCESS) {
+			retry_count++;
 			release_sub_crqs(adapter, 1);
 
+			adapter->init_done_rc = 0;
 			reinit_completion(&adapter->init_done);
 			send_cap_queries(adapter);
 			if (!wait_for_completion_timeout(&adapter->init_done,
 							 timeout)) {
-				dev_err(dev, "Capabilities query timeout\n");
+				netdev_warn(netdev,
+					    "Capabilities query timed out\n");
 				return -1;
 			}
+
 			rc = init_sub_crqs(adapter);
 			if (rc) {
-				dev_err(dev,
-					"Initialization of SCRQ's failed\n");
+				netdev_warn(netdev,
+					    "SCRQ initialization failed\n");
 				return -1;
 			}
+
 			rc = init_sub_crq_irqs(adapter);
 			if (rc) {
-				dev_err(dev,
-					"Initialization of SCRQ's irqs failed\n");
+				netdev_warn(netdev,
+					    "SCRQ irq initialization failed\n");
 				return -1;
 			}
-		}
-
-		reinit_completion(&adapter->init_done);
-		rc = send_login(adapter);
-		if (rc) {
-			dev_err(dev, "Unable to attempt device login\n");
-			return rc;
-		} else if (!wait_for_completion_timeout(&adapter->init_done,
-						 timeout)) {
-			dev_err(dev, "Login timeout\n");
+		} else if (adapter->init_done_rc) {
+			netdev_warn(netdev, "Adapter login failed\n");
 			return -1;
 		}
-	} while (adapter->renegotiate);
+	} while (adapter->init_done_rc == PARTIALSUCCESS);
 
 	/* handle pending MAC address changes after successful login */
 	if (adapter->mac_change_pending) {
@@ -3942,7 +3957,7 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq,
 	 * to resend the login buffer with fewer queues requested.
 	 */
 	if (login_rsp_crq->generic.rc.code) {
-		adapter->renegotiate = true;
+		adapter->init_done_rc = login_rsp_crq->generic.rc.code;
 		complete(&adapter->init_done);
 		return 0;
 	}
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 99c0b58c2c39..22391e8805f6 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -1035,7 +1035,6 @@ struct ibmvnic_adapter {
 
 	struct ibmvnic_sub_crq_queue **tx_scrq;
 	struct ibmvnic_sub_crq_queue **rx_scrq;
-	bool renegotiate;
 
 	/* rx structs */
 	struct napi_struct *napi;

From ebc701b796a67a5785399dcbc83d90e3b5f1e02f Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 11 Apr 2018 10:09:38 -0500
Subject: [PATCH 059/288] ibmvnic: Do not notify peers on parameter change
 resets

When attempting to change the driver parameters, such as the MTU
value or number of queues, do not call netdev_notify_peers().
Doing so will deadlock on the rtnl_lock.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index d35f29d7af91..f4a56a3c07ad 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1843,7 +1843,8 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 	for (i = 0; i < adapter->req_rx_queues; i++)
 		napi_schedule(&adapter->napi[i]);
 
-	if (adapter->reset_reason != VNIC_RESET_FAILOVER)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
+	    adapter->reset_reason != VNIC_RESET_CHANGE_PARAM)
 		netdev_notify_peers(netdev);
 
 	netif_carrier_on(netdev);

From 5ff9c1a3dd92d2d8eeea6bb15b3502cfcc0e26fa Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Wed, 11 Apr 2018 17:17:34 +0200
Subject: [PATCH 060/288] selftests: net: add in_netns.sh to TEST_PROGS

Script in_netns.sh isn't installed.
--------------------
running psock_fanout test
--------------------
./run_afpackettests: line 12: ./in_netns.sh: No such file or directory
[FAIL]
--------------------
running psock_tpacket test
--------------------
./run_afpackettests: line 22: ./in_netns.sh: No such file or directory
[FAIL]

In current code added in_netns.sh to be installed.

Fixes: cc30c93fa020 ("selftests/net: ignore background traffic in psock_fanout")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 785fc18a16b4..8f1e13d2e547 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,7 +5,7 @@ CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa

From 9d0c75bf6e03d9bf80c55b0f677dc9b982958fd5 Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes <doronrk@fb.com>
Date: Wed, 11 Apr 2018 15:05:16 -0700
Subject: [PATCH 061/288] strparser: Fix incorrect strp->need_bytes value.

strp_data_ready resets strp->need_bytes to 0 if strp_peek_len indicates
that the remainder of the message has been received. However,
do_strp_work does not reset strp->need_bytes to 0. If do_strp_work
completes a partial message, the value of strp->need_bytes will continue
to reflect the needed bytes of the previous message, causing
future invocations of strp_data_ready to return early if
strp->need_bytes is less than strp_peek_len. Resetting strp->need_bytes
to 0 in __strp_recv on handing a full message to the upper layer solves
this problem.

__strp_recv also calculates strp->need_bytes using stm->accum_len before
stm->accum_len has been incremented by cand_len. This can cause
strp->need_bytes to be equal to the full length of the message instead
of the full length minus the accumulated length. This, in turn, causes
strp_data_ready to return early, even when there is sufficient data to
complete the partial message. Incrementing stm->accum_len before using
it to calculate strp->need_bytes solves this problem.

Found while testing net/tls_sw recv path.

Fixes: 43a0c6751a322847 ("strparser: Stream parser for messages")
Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/strparser/strparser.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index b9283ce5cd85..805b139756db 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -296,9 +296,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 					strp_start_timer(strp, timeo);
 				}
 
+				stm->accum_len += cand_len;
 				strp->need_bytes = stm->strp.full_len -
 						       stm->accum_len;
-				stm->accum_len += cand_len;
 				stm->early_eaten = cand_len;
 				STRP_STATS_ADD(strp->stats.bytes, cand_len);
 				desc->count = 0; /* Stop reading socket */
@@ -321,6 +321,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 		/* Hurray, we have a new message! */
 		cancel_delayed_work(&strp->msg_timer_work);
 		strp->skb_head = NULL;
+		strp->need_bytes = 0;
 		STRP_STATS_INCR(strp->stats.msgs);
 
 		/* Give skb to upper layer */
@@ -410,9 +411,7 @@ void strp_data_ready(struct strparser *strp)
 		return;
 
 	if (strp->need_bytes) {
-		if (strp_peek_len(strp) >= strp->need_bytes)
-			strp->need_bytes = 0;
-		else
+		if (strp_peek_len(strp) < strp->need_bytes)
 			return;
 	}
 

From 335b929b28aeb5bfc0698adb21deaf685b2982d1 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 12 Apr 2018 01:15:48 +0200
Subject: [PATCH 062/288] tipc: fix missing initializer in tipc_sendmsg()

The stack variable 'dnode' in __tipc_sendmsg() may theoretically
end up tipc_node_get_mtu() as an unitilalized variable.

We fix this by intializing the variable at declaration. We also add
a default else clause to the two conditional ones already there, so
that we never end up in the named function if the given address
type is illegal.

Reported-by: syzbot+b0975ce9355b347c1546@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1fd1c8b5ce03..252a52ae0893 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1278,7 +1278,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq *seq;
 	struct sk_buff_head pkts;
-	u32 dnode, dport;
+	u32 dport, dnode = 0;
 	u32 type, inst;
 	int mtu, rc;
 
@@ -1348,6 +1348,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 		msg_set_destnode(hdr, dnode);
 		msg_set_destport(hdr, dest->addr.id.ref);
 		msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+	} else {
+		return -EINVAL;
 	}
 
 	/* Block or return if destination link is congested */

From 5496295aefe86995e41398b0f76de601308fc3f5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Apr 2018 16:47:35 -0700
Subject: [PATCH 063/288] nfp: ignore signals when communicating with
 management FW

We currently allow signals to interrupt the wait for management FW
commands.  Exiting the wait should not cause trouble, the FW will
just finish executing the command in the background and new commands
will wait for the old one to finish.

However, this may not be what users expect (Ctrl-C not actually stopping
the command).  Moreover some systems routinely request link information
with signals pending (Ubuntu 14.04 runs a landscape-sysinfo python tool
from MOTD) worrying users with errors like these:

nfp 0000:04:00.0: nfp_nsp: Error -512 waiting for code 0x0007 to start
nfp 0000:04:00.0: nfp: reading port table failed -512

Make the wait for management FW responses non-interruptible.

Fixes: 1a64821c6af7 ("nfp: add support for service processor access")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index 99bb679a9801..2abee0fe3a7c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -281,8 +281,7 @@ nfp_nsp_wait_reg(struct nfp_cpp *cpp, u64 *reg, u32 nsp_cpp, u64 addr,
 		if ((*reg & mask) == val)
 			return 0;
 
-		if (msleep_interruptible(25))
-			return -ERESTARTSYS;
+		msleep(25);
 
 		if (time_after(start_time, wait_until))
 			return -ETIMEDOUT;

From bc05f9bcd8cb62f935625850e535da183b4a07c0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Apr 2018 16:47:36 -0700
Subject: [PATCH 064/288] nfp: print a message when mutex wait is interrupted

When waiting for an NFP mutex is interrupted print a message
to make root causing later error messages easier.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
index f7b958181126..cb28ac03e4ca 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
@@ -211,8 +211,11 @@ int nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex)
 			break;
 
 		err = msleep_interruptible(timeout_ms);
-		if (err != 0)
+		if (err != 0) {
+			nfp_info(mutex->cpp,
+				 "interrupted waiting for NFP mutex\n");
 			return -ERESTARTSYS;
+		}
 
 		if (time_is_before_eq_jiffies(warn_at)) {
 			warn_at = jiffies + NFP_MUTEX_WAIT_NEXT_WARN * HZ;

From 0b1a989ef5a751b5992842d1934e22de861a848e Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Wed, 11 Apr 2018 16:47:37 -0700
Subject: [PATCH 065/288] nfp: flower: move route ack control messages out of
 the workqueue

Previously we processed the route ack control messages in the workqueue,
this unnecessarily loads the workqueue. We can deal with these messages
sooner as we know we are going to drop them.

Fixes: 8e6a9046b66a ("nfp: flower vxlan neighbour offload")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 3735c09d2112..8ad857eb89c6 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -258,9 +258,6 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS:
 		nfp_tunnel_keep_alive(app, skb);
 		break;
-	case NFP_FLOWER_CMSG_TYPE_TUN_NEIGH:
-		/* Acks from the NFP that the route is added - ignore. */
-		break;
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
 				     type);
@@ -306,6 +303,9 @@ void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb)
 		   nfp_flower_process_mtu_ack(app, skb)) {
 		/* Handle MTU acks outside wq to prevent RTNL conflict. */
 		dev_consume_skb_any(skb);
+	} else if (cmsg_hdr->type == NFP_FLOWER_CMSG_TYPE_TUN_NEIGH) {
+		/* Acks from the NFP that the route is added - ignore. */
+		dev_consume_skb_any(skb);
 	} else {
 		skb_queue_tail(&priv->cmsg_skbs, skb);
 		schedule_work(&priv->cmsg_work);

From cf2cbadc20f5651c3dde9f5ac2ee52fb43aa4ddd Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Wed, 11 Apr 2018 16:47:38 -0700
Subject: [PATCH 066/288] nfp: flower: split and limit cmsg skb lists

Introduce a second skb list for handling control messages and limit the
number of allowed messages. Some control messages are considered more
crucial than others, resulting in the need for a second skb list. By
splitting the list into a separate high and low priority list we can
ensure that messages on the high list get added to the head of the list
that gets processed, this however has no functional impact. Previously
there was no limit on the number of messages allowed on the queue, this
could result in the queue growing boundlessly and eventually the host
running out of memory.

Fixes: b985f870a5f0 ("nfp: process control messages in workqueue in flower app")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/netronome/nfp/flower/cmsg.c  | 38 +++++++++++++++++--
 .../net/ethernet/netronome/nfp/flower/cmsg.h  |  2 +
 .../net/ethernet/netronome/nfp/flower/main.c  |  6 ++-
 .../net/ethernet/netronome/nfp/flower/main.h  |  8 +++-
 4 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 8ad857eb89c6..577659f332e4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -272,18 +272,49 @@ out:
 
 void nfp_flower_cmsg_process_rx(struct work_struct *work)
 {
+	struct sk_buff_head cmsg_joined;
 	struct nfp_flower_priv *priv;
 	struct sk_buff *skb;
 
 	priv = container_of(work, struct nfp_flower_priv, cmsg_work);
+	skb_queue_head_init(&cmsg_joined);
 
-	while ((skb = skb_dequeue(&priv->cmsg_skbs)))
+	spin_lock_bh(&priv->cmsg_skbs_high.lock);
+	skb_queue_splice_tail_init(&priv->cmsg_skbs_high, &cmsg_joined);
+	spin_unlock_bh(&priv->cmsg_skbs_high.lock);
+
+	spin_lock_bh(&priv->cmsg_skbs_low.lock);
+	skb_queue_splice_tail_init(&priv->cmsg_skbs_low, &cmsg_joined);
+	spin_unlock_bh(&priv->cmsg_skbs_low.lock);
+
+	while ((skb = __skb_dequeue(&cmsg_joined)))
 		nfp_flower_cmsg_process_one_rx(priv->app, skb);
 }
 
+static void
+nfp_flower_queue_ctl_msg(struct nfp_app *app, struct sk_buff *skb, int type)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct sk_buff_head *skb_head;
+
+	if (type == NFP_FLOWER_CMSG_TYPE_PORT_REIFY ||
+	    type == NFP_FLOWER_CMSG_TYPE_PORT_MOD)
+		skb_head = &priv->cmsg_skbs_high;
+	else
+		skb_head = &priv->cmsg_skbs_low;
+
+	if (skb_queue_len(skb_head) >= NFP_FLOWER_WORKQ_MAX_SKBS) {
+		nfp_flower_cmsg_warn(app, "Dropping queued control messages\n");
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	skb_queue_tail(skb_head, skb);
+	schedule_work(&priv->cmsg_work);
+}
+
 void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb)
 {
-	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_flower_cmsg_hdr *cmsg_hdr;
 
 	cmsg_hdr = nfp_flower_cmsg_get_hdr(skb);
@@ -307,7 +338,6 @@ void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb)
 		/* Acks from the NFP that the route is added - ignore. */
 		dev_consume_skb_any(skb);
 	} else {
-		skb_queue_tail(&priv->cmsg_skbs, skb);
-		schedule_work(&priv->cmsg_work);
+		nfp_flower_queue_ctl_msg(app, skb, cmsg_hdr->type);
 	}
 }
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 96bc0e33980c..b6c0fd053a50 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -108,6 +108,8 @@
 #define NFP_FL_IPV4_TUNNEL_TYPE		GENMASK(7, 4)
 #define NFP_FL_IPV4_PRE_TUN_INDEX	GENMASK(2, 0)
 
+#define NFP_FLOWER_WORKQ_MAX_SKBS	30000
+
 #define nfp_flower_cmsg_warn(app, fmt, args...)                         \
 	do {                                                            \
 		if (net_ratelimit())                                    \
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 6357e0720f43..ad02592a82b7 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -519,7 +519,8 @@ static int nfp_flower_init(struct nfp_app *app)
 
 	app->priv = app_priv;
 	app_priv->app = app;
-	skb_queue_head_init(&app_priv->cmsg_skbs);
+	skb_queue_head_init(&app_priv->cmsg_skbs_high);
+	skb_queue_head_init(&app_priv->cmsg_skbs_low);
 	INIT_WORK(&app_priv->cmsg_work, nfp_flower_cmsg_process_rx);
 	init_waitqueue_head(&app_priv->reify_wait_queue);
 
@@ -549,7 +550,8 @@ static void nfp_flower_clean(struct nfp_app *app)
 {
 	struct nfp_flower_priv *app_priv = app->priv;
 
-	skb_queue_purge(&app_priv->cmsg_skbs);
+	skb_queue_purge(&app_priv->cmsg_skbs_high);
+	skb_queue_purge(&app_priv->cmsg_skbs_low);
 	flush_work(&app_priv->cmsg_work);
 
 	nfp_flower_metadata_cleanup(app);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index e030b3ce4510..c67e1b54c614 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -107,7 +107,10 @@ struct nfp_mtu_conf {
  * @mask_table:		Hash table used to store masks
  * @flow_table:		Hash table used to store flower rules
  * @cmsg_work:		Workqueue for control messages processing
- * @cmsg_skbs:		List of skbs for control message processing
+ * @cmsg_skbs_high:	List of higher priority skbs for control message
+ *			processing
+ * @cmsg_skbs_low:	List of lower priority skbs for control message
+ *			processing
  * @nfp_mac_off_list:	List of MAC addresses to offload
  * @nfp_mac_index_list:	List of unique 8-bit indexes for non NFP netdevs
  * @nfp_ipv4_off_list:	List of IPv4 addresses to offload
@@ -136,7 +139,8 @@ struct nfp_flower_priv {
 	DECLARE_HASHTABLE(mask_table, NFP_FLOWER_MASK_HASH_BITS);
 	DECLARE_HASHTABLE(flow_table, NFP_FLOWER_HASH_BITS);
 	struct work_struct cmsg_work;
-	struct sk_buff_head cmsg_skbs;
+	struct sk_buff_head cmsg_skbs_high;
+	struct sk_buff_head cmsg_skbs_low;
 	struct list_head nfp_mac_off_list;
 	struct list_head nfp_mac_index_list;
 	struct list_head nfp_ipv4_off_list;

From 1071ec9d453a38023579714b64a951a2fb982071 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 12 Apr 2018 14:24:31 +0800
Subject: [PATCH 067/288] sctp: do not check port in sctp_inet6_cmp_addr

pf->cmp_addr() is called before binding a v6 address to the sock. It
should not check ports, like in sctp_inet_cmp_addr.

But sctp_inet6_cmp_addr checks the addr by invoking af(6)->cmp_addr,
sctp_v6_cmp_addr where it also compares the ports.

This would cause that setsockopt(SCTP_SOCKOPT_BINDX_ADD) could bind
multiple duplicated IPv6 addresses after Commit 40b4f0fd74e4 ("sctp:
lack the check for ports in sctp_v6_cmp_addr").

This patch is to remove af->cmp_addr called in sctp_inet6_cmp_addr,
but do the proper check for both v6 addrs and v4mapped addrs.

v1->v2:
  - define __sctp_v6_cmp_addr to do the common address comparison
    used for both pf and af v6 cmp_addr.

Fixes: 40b4f0fd74e4 ("sctp: lack the check for ports in sctp_v6_cmp_addr")
Reported-by: Jianwen Ji <jiji@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 74 ++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 31083b5035ec..2e3f7b75a8ec 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -556,44 +556,47 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
 	addr->v6.sin6_scope_id = 0;
 }
 
+static int __sctp_v6_cmp_addr(const union sctp_addr *addr1,
+			      const union sctp_addr *addr2)
+{
+	if (addr1->sa.sa_family != addr2->sa.sa_family) {
+		if (addr1->sa.sa_family == AF_INET &&
+		    addr2->sa.sa_family == AF_INET6 &&
+		    ipv6_addr_v4mapped(&addr2->v6.sin6_addr) &&
+		    addr2->v6.sin6_addr.s6_addr32[3] ==
+		    addr1->v4.sin_addr.s_addr)
+			return 1;
+
+		if (addr2->sa.sa_family == AF_INET &&
+		    addr1->sa.sa_family == AF_INET6 &&
+		    ipv6_addr_v4mapped(&addr1->v6.sin6_addr) &&
+		    addr1->v6.sin6_addr.s6_addr32[3] ==
+		    addr2->v4.sin_addr.s_addr)
+			return 1;
+
+		return 0;
+	}
+
+	if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
+		return 0;
+
+	/* If this is a linklocal address, compare the scope_id. */
+	if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+	    addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
+	    addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)
+		return 0;
+
+	return 1;
+}
+
 /* Compare addresses exactly.
  * v4-mapped-v6 is also in consideration.
  */
 static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
 			    const union sctp_addr *addr2)
 {
-	if (addr1->sa.sa_family != addr2->sa.sa_family) {
-		if (addr1->sa.sa_family == AF_INET &&
-		    addr2->sa.sa_family == AF_INET6 &&
-		    ipv6_addr_v4mapped(&addr2->v6.sin6_addr)) {
-			if (addr2->v6.sin6_port == addr1->v4.sin_port &&
-			    addr2->v6.sin6_addr.s6_addr32[3] ==
-			    addr1->v4.sin_addr.s_addr)
-				return 1;
-		}
-		if (addr2->sa.sa_family == AF_INET &&
-		    addr1->sa.sa_family == AF_INET6 &&
-		    ipv6_addr_v4mapped(&addr1->v6.sin6_addr)) {
-			if (addr1->v6.sin6_port == addr2->v4.sin_port &&
-			    addr1->v6.sin6_addr.s6_addr32[3] ==
-			    addr2->v4.sin_addr.s_addr)
-				return 1;
-		}
-		return 0;
-	}
-	if (addr1->v6.sin6_port != addr2->v6.sin6_port)
-		return 0;
-	if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
-		return 0;
-	/* If this is a linklocal address, compare the scope_id. */
-	if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
-		if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
-		    (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) {
-			return 0;
-		}
-	}
-
-	return 1;
+	return __sctp_v6_cmp_addr(addr1, addr2) &&
+	       addr1->v6.sin6_port == addr2->v6.sin6_port;
 }
 
 /* Initialize addr struct to INADDR_ANY. */
@@ -875,8 +878,8 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
 			       const union sctp_addr *addr2,
 			       struct sctp_sock *opt)
 {
-	struct sctp_af *af1, *af2;
 	struct sock *sk = sctp_opt2sk(opt);
+	struct sctp_af *af1, *af2;
 
 	af1 = sctp_get_af_specific(addr1->sa.sa_family);
 	af2 = sctp_get_af_specific(addr2->sa.sa_family);
@@ -892,10 +895,7 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
 	if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
 		return 1;
 
-	if (addr1->sa.sa_family != addr2->sa.sa_family)
-		return 0;
-
-	return af1->cmp_addr(addr1, addr2);
+	return __sctp_v6_cmp_addr(addr1, addr2);
 }
 
 /* Verify that the provided sockaddr looks bindable.   Common verification,

From 53b76cdf7e8fecec1d09e38aad2f8579882591a8 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Thu, 12 Apr 2018 10:46:55 +0200
Subject: [PATCH 068/288] net: fix deadlock while clearing neighbor proxy table

When coming from ndisc_netdev_event() in net/ipv6/ndisc.c,
neigh_ifdown() is called with &nd_tbl, locking this while
clearing the proxy neighbor entries when eg. deleting an
interface. Calling the table's pndisc_destructor() with the
lock still held, however, can cause a deadlock: When a
multicast listener is available an IGMP packet of type
ICMPV6_MGM_REDUCTION may be sent out. When reaching
ip6_finish_output2(), if no neighbor entry for the target
address is found, __neigh_create() is called with &nd_tbl,
which it'll want to lock.

Move the elements into their own list, then unlock the table
and perform the destruction.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199289
Fixes: 6fd6ce2056de ("ipv6: Do not depend on rt->n in ip6_finish_output2().")
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a8bc02bb339f..ce519861be59 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -55,7 +55,8 @@ static void neigh_timer_handler(struct timer_list *t);
 static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+				    struct net_device *dev);
 
 #ifdef CONFIG_PROC_FS
 static const struct file_operations neigh_stat_seq_fops;
@@ -291,8 +292,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 {
 	write_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev);
-	pneigh_ifdown(tbl, dev);
-	write_unlock_bh(&tbl->lock);
+	pneigh_ifdown_and_unlock(tbl, dev);
 
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
@@ -681,9 +681,10 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 	return -ENOENT;
 }
 
-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+				    struct net_device *dev)
 {
-	struct pneigh_entry *n, **np;
+	struct pneigh_entry *n, **np, *freelist = NULL;
 	u32 h;
 
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
@@ -691,16 +692,23 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 		while ((n = *np) != NULL) {
 			if (!dev || n->dev == dev) {
 				*np = n->next;
-				if (tbl->pdestructor)
-					tbl->pdestructor(n);
-				if (n->dev)
-					dev_put(n->dev);
-				kfree(n);
+				n->next = freelist;
+				freelist = n;
 				continue;
 			}
 			np = &n->next;
 		}
 	}
+	write_unlock_bh(&tbl->lock);
+	while ((n = freelist)) {
+		freelist = n->next;
+		n->next = NULL;
+		if (tbl->pdestructor)
+			tbl->pdestructor(n);
+		if (n->dev)
+			dev_put(n->dev);
+		kfree(n);
+	}
 	return -ENOENT;
 }
 

From 2290482379278e0254e6edfdb681d88359143fd1 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Mon, 9 Apr 2018 00:03:14 -0700
Subject: [PATCH 069/288] net: dsa: mv88e6xxx: Fix receive time stamp race
 condition.

The DSA stack passes received PTP frames to this driver via
mv88e6xxx_port_rxtstamp() for deferred delivery.  The driver then
queues the frame and kicks the worker thread.  The work callback reads
out the latched receive time stamp and then works through the queue,
delivering any non-matching frames without a time stamp.

If a new frame arrives after the worker thread has read out the time
stamp register but enters the queue before the worker finishes
processing the queue, that frame will be delivered without a time
stamp.

This patch fixes the race by moving the queue onto a list on the stack
before reading out the latched time stamp value.

Fixes: c6fe0ad2c3499 ("net: dsa: mv88e6xxx: add rx/tx timestamping support")
Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/hwtstamp.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
index ac7694c71266..a036c490b7ce 100644
--- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c
+++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c
@@ -285,10 +285,18 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip,
 			       struct sk_buff_head *rxq)
 {
 	u16 buf[4] = { 0 }, status, seq_id;
-	u64 ns, timelo, timehi;
 	struct skb_shared_hwtstamps *shwt;
+	struct sk_buff_head received;
+	u64 ns, timelo, timehi;
+	unsigned long flags;
 	int err;
 
+	/* The latched timestamp belongs to one of the received frames. */
+	__skb_queue_head_init(&received);
+	spin_lock_irqsave(&rxq->lock, flags);
+	skb_queue_splice_tail_init(rxq, &received);
+	spin_unlock_irqrestore(&rxq->lock, flags);
+
 	mutex_lock(&chip->reg_lock);
 	err = mv88e6xxx_port_ptp_read(chip, ps->port_id,
 				      reg, buf, ARRAY_SIZE(buf));
@@ -311,7 +319,7 @@ static void mv88e6xxx_get_rxts(struct mv88e6xxx_chip *chip,
 	/* Since the device can only handle one time stamp at a time,
 	 * we purge any extra frames from the queue.
 	 */
-	for ( ; skb; skb = skb_dequeue(rxq)) {
+	for ( ; skb; skb = __skb_dequeue(&received)) {
 		if (mv88e6xxx_ts_valid(status) && seq_match(skb, seq_id)) {
 			ns = timehi << 16 | timelo;
 

From 4f75f1cbf95e2f0853cd229d042b203931b899af Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Thu, 12 Apr 2018 15:32:46 +0200
Subject: [PATCH 070/288] perf record: Change warning for missing sysfs entry
 to debug

Using perf on 4.16.0 kernel on s390 shows this warning:

   failed: can't open node sysfs data

each time I run command perf record ... for example:

  [root@s35lp76 perf]# ./perf record -e rB0000 -- sleep 1
  [ perf record: Woken up 1 times to write data ]
  failed: can't open node sysfs data
  [ perf record: Captured and wrote 0.001 MB perf.data (4 samples) ]
  [root@s35lp76 perf]#

It turns out commit e2091cedd51bf ("perf tools: Add MEM_TOPOLOGY feature
to perf data file") tries to open directory named /sys/devices/system/node/
which does not exist on s390.

This is the call stack:
 __cmd_record
 +---> perf_session__write_header
       +---> perf_header__adds_write
             +---> do_write_feat
	           +---> write_mem_topology
		         +---> build_mem_topology
			       prints warning

The issue starts in do_write_feat() which unconditionally loops over all
features and now includes HEADER_MEM_TOPOLOGY and calls write_mem_topology().

Function record__init_features() at the beginning of __cmd_record() sets
all features and then turns off some of them.

Fix this by changing the warning to a level 2 debug output statement.

So it is only shown when debug level 2 or higher is set.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Link: http://lkml.kernel.org/r/20180412133246.92801-1-tmricht@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 121df1683c36..a8bff2178fbc 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1320,7 +1320,8 @@ static int build_mem_topology(struct memory_node *nodes, u64 size, u64 *cntp)
 
 	dir = opendir(path);
 	if (!dir) {
-		pr_warning("failed: can't open node sysfs data\n");
+		pr_debug2("%s: could't read %s, does this arch have topology information?\n",
+			  __func__, path);
 		return -1;
 	}
 

From 7b366142a50ad79e48de8e67c5b3e8cfb9fa82dd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 Apr 2018 14:58:24 -0300
Subject: [PATCH 071/288] perf report: Fix switching to another perf.data file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the TUI the 's' hotkey can be used to switch to another perf.data
file in the current directory, but that got broken in Fixes:
b01141f4f59c ("perf annotate: Initialize the priv are in symbol__new()"),
that would show this once another file was chosen:

    ┌─Fatal Error─────────────────────────────────────┐
    │Annotation needs to be init before symbol__init()│
    │                                                 │
    │                                                 │
    │Press any key...                                 │
    └─────────────────────────────────────────────────┘

Fix it by just silently bailing out if symbol__annotation_init() was already
called, just like is done with symbol__init(), i.e. they are done just once at
session start, not when switching to a new perf.data file.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Martin Liška <mliska@suse.cz>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: b01141f4f59c ("perf annotate: Initialize the priv are in symbol__new()")
Link: https://lkml.kernel.org/n/tip-ogppdtpzfax7y1h6gjdv5s6u@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 62b2dd2253eb..1466814ebada 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -2091,16 +2091,14 @@ static bool symbol__read_kptr_restrict(void)
 
 int symbol__annotation_init(void)
 {
+	if (symbol_conf.init_annotation)
+		return 0;
+
 	if (symbol_conf.initialized) {
 		pr_err("Annotation needs to be init before symbol__init()\n");
 		return -1;
 	}
 
-	if (symbol_conf.init_annotation) {
-		pr_warning("Annotation being initialized multiple times\n");
-		return 0;
-	}
-
 	symbol_conf.priv_size += sizeof(struct annotation);
 	symbol_conf.init_annotation = true;
 	return 0;

From 43c4023152a9c5742948ac919e58ade127fa4e2e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 Apr 2018 15:23:02 -0300
Subject: [PATCH 072/288] perf annotate: Allow setting the offset level in
 .perfconfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default is 1 (jump_target):

  # perf annotate --ignore-vmlinux --stdio2 _raw_spin_lock_irqsave
  Samples: 3K of event 'cycles:ppp', 3000 Hz, Event count (approx.): 2766398574
  _raw_spin_lock_irqsave() /proc/kcore
    0.26        nop
    4.61        push   %rbx
   19.33        pushfq
    7.97        pop    %rax
    0.32        nop
    0.06        mov    %rax,%rbx
   14.63        cli
    0.06        nop
                xor    %eax,%eax
                mov    $0x1,%edx
   49.94        lock   cmpxchg %edx,(%rdi)
    0.16        test   %eax,%eax
              ↓ jne    2b
    2.66        mov    %rbx,%rax
                pop    %rbx
              ← retq
          2b:   mov    %eax,%esi
              → callq  *ffffffffb30eaed0
                mov    %rbx,%rax
                pop    %rbx
              ← retq
  #

But one can ask for showing offsets for call instructions by setting
this:

  # perf annotate --ignore-vmlinux --stdio2 _raw_spin_lock_irqsave
  Samples: 3K of event 'cycles:ppp', 3000 Hz, Event count (approx.): 2766398574
  _raw_spin_lock_irqsave() /proc/kcore
    0.26        nop
    4.61        push   %rbx
   19.33        pushfq
    7.97        pop    %rax
    0.32        nop
    0.06        mov    %rax,%rbx
   14.63        cli
    0.06        nop
                xor    %eax,%eax
                mov    $0x1,%edx
   49.94        lock   cmpxchg %edx,(%rdi)
    0.16        test   %eax,%eax
              ↓ jne    2b
    2.66        mov    %rbx,%rax
                pop    %rbx
              ← retq
          2b:   mov    %eax,%esi
          2d: → callq  *ffffffffb30eaed0
                mov    %rbx,%rax
                pop    %rbx
              ← retq
  #

Or using a big value to ask for all offsets to be shown:

  # cat ~/.perfconfig
  [annotate]

	offset_level = 100

	hide_src_code = true
  # perf annotate --ignore-vmlinux --stdio2 _raw_spin_lock_irqsave
  Samples: 3K of event 'cycles:ppp', 3000 Hz, Event count (approx.): 2766398574
  _raw_spin_lock_irqsave() /proc/kcore
    0.26   0:   nop
    4.61   5:   push   %rbx
   19.33   6:   pushfq
    7.97   7:   pop    %rax
    0.32   8:   nop
    0.06   d:   mov    %rax,%rbx
   14.63  10:   cli
    0.06  11:   nop
          17:   xor    %eax,%eax
          19:   mov    $0x1,%edx
   49.94  1e:   lock   cmpxchg %edx,(%rdi)
    0.16  22:   test   %eax,%eax
          24: ↓ jne    2b
    2.66  26:   mov    %rbx,%rax
          29:   pop    %rbx
          2a: ← retq
          2b:   mov    %eax,%esi
          2d: → callq  *ffffffffb30eaed0
          32:   mov    %rbx,%rax
          35:   pop    %rbx
          36: ← retq
   #

This also affects the TUI, i.e. the default 'perf annotate' and 'perf
top/report' -> A hotkey -> annotate interfaces, when slang-devel is present
in the build, i.e.:

  # perf version --build-options | grep slang
              libslang: [ on  ]  # HAVE_SLANG_SUPPORT
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Martin Liška <mliska@suse.cz>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-venm6x5zrt40eu8hxdsmqxz6@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-config.txt |  5 +++++
 tools/perf/util/annotate.c               | 15 ++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 5b4fff3adc4b..32f4a898e3f2 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -334,6 +334,11 @@ annotate.*::
 
 		99.93 │      mov    %eax,%eax
 
+	annotate.offset_level::
+		Default is '1', meaning just jump targets will have offsets show right beside
+		the instruction. When set to '2' 'call' instructions will also have its offsets
+		shown, 3 or higher will show offsets for all instructions.
+
 hist.*::
 	hist.percentage::
 		This option control the way to calculate overhead of filtered entries -
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 5edc565d86c4..536ee148bff8 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2649,10 +2649,11 @@ int __annotation__scnprintf_samples_period(struct annotation *notes,
  */
 static struct annotation_config {
 	const char *name;
-	bool *value;
+	void *value;
 } annotation__configs[] = {
 	ANNOTATION__CFG(hide_src_code),
 	ANNOTATION__CFG(jump_arrows),
+	ANNOTATION__CFG(offset_level),
 	ANNOTATION__CFG(show_linenr),
 	ANNOTATION__CFG(show_nr_jumps),
 	ANNOTATION__CFG(show_nr_samples),
@@ -2684,8 +2685,16 @@ static int annotation__config(const char *var, const char *value,
 
 	if (cfg == NULL)
 		pr_debug("%s variable unknown, ignoring...", var);
-	else
-		*cfg->value = perf_config_bool(name, value);
+	else if (strcmp(var, "annotate.offset_level") == 0) {
+		perf_config_int(cfg->value, name, value);
+
+		if (*(int *)cfg->value > ANNOTATION__MAX_OFFSET_LEVEL)
+			*(int *)cfg->value = ANNOTATION__MAX_OFFSET_LEVEL;
+		else if (*(int *)cfg->value < ANNOTATION__MIN_OFFSET_LEVEL)
+			*(int *)cfg->value = ANNOTATION__MIN_OFFSET_LEVEL;
+	} else {
+		*(bool *)cfg->value = perf_config_bool(name, value);
+	}
 	return 0;
 }
 

From b0d5c81e872ed21de1e56feb0fa6e4161da7be61 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 Apr 2018 16:28:18 -0300
Subject: [PATCH 073/288] perf annotate: Handle variables in 'sub', 'or' and
 many other instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Just like is done for 'mov' and others that can have as source or
targets variables resolved by objdump, to make them more compact:

-               orb    $0x4,0x224d71(%rip)        # 226ca4 <_rtld_global+0xca4>
+               orb    $0x4,_rtld_global+0xca4

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Martin Liška <mliska@suse.cz>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-efex7746id4w4wa03nqxvh3m@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/x86/annotate/instructions.c | 67 ++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c
index 5bd1ba8c0282..44f5aba78210 100644
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/arch/x86/annotate/instructions.c
@@ -1,21 +1,43 @@
 // SPDX-License-Identifier: GPL-2.0
 static struct ins x86__instructions[] = {
+	{ .name = "adc",	.ops = &mov_ops,  },
+	{ .name = "adcb",	.ops = &mov_ops,  },
+	{ .name = "adcl",	.ops = &mov_ops,  },
 	{ .name = "add",	.ops = &mov_ops,  },
 	{ .name = "addl",	.ops = &mov_ops,  },
 	{ .name = "addq",	.ops = &mov_ops,  },
+	{ .name = "addsd",	.ops = &mov_ops,  },
 	{ .name = "addw",	.ops = &mov_ops,  },
 	{ .name = "and",	.ops = &mov_ops,  },
+	{ .name = "andb",	.ops = &mov_ops,  },
+	{ .name = "andl",	.ops = &mov_ops,  },
+	{ .name = "andpd",	.ops = &mov_ops,  },
+	{ .name = "andps",	.ops = &mov_ops,  },
+	{ .name = "andq",	.ops = &mov_ops,  },
+	{ .name = "andw",	.ops = &mov_ops,  },
+	{ .name = "bsr",	.ops = &mov_ops,  },
+	{ .name = "bt",		.ops = &mov_ops,  },
+	{ .name = "btr",	.ops = &mov_ops,  },
 	{ .name = "bts",	.ops = &mov_ops,  },
+	{ .name = "btsq",	.ops = &mov_ops,  },
 	{ .name = "call",	.ops = &call_ops, },
 	{ .name = "callq",	.ops = &call_ops, },
+	{ .name = "cmovbe",	.ops = &mov_ops,  },
+	{ .name = "cmove",	.ops = &mov_ops,  },
+	{ .name = "cmovae",	.ops = &mov_ops,  },
 	{ .name = "cmp",	.ops = &mov_ops,  },
 	{ .name = "cmpb",	.ops = &mov_ops,  },
 	{ .name = "cmpl",	.ops = &mov_ops,  },
 	{ .name = "cmpq",	.ops = &mov_ops,  },
 	{ .name = "cmpw",	.ops = &mov_ops,  },
 	{ .name = "cmpxch",	.ops = &mov_ops,  },
+	{ .name = "cmpxchg",	.ops = &mov_ops,  },
+	{ .name = "cs",		.ops = &mov_ops,  },
 	{ .name = "dec",	.ops = &dec_ops,  },
 	{ .name = "decl",	.ops = &dec_ops,  },
+	{ .name = "divsd",	.ops = &mov_ops,  },
+	{ .name = "divss",	.ops = &mov_ops,  },
+	{ .name = "gs",		.ops = &mov_ops,  },
 	{ .name = "imul",	.ops = &mov_ops,  },
 	{ .name = "inc",	.ops = &dec_ops,  },
 	{ .name = "incl",	.ops = &dec_ops,  },
@@ -57,25 +79,68 @@ static struct ins x86__instructions[] = {
 	{ .name = "lea",	.ops = &mov_ops,  },
 	{ .name = "lock",	.ops = &lock_ops, },
 	{ .name = "mov",	.ops = &mov_ops,  },
+	{ .name = "movapd",	.ops = &mov_ops,  },
+	{ .name = "movaps",	.ops = &mov_ops,  },
 	{ .name = "movb",	.ops = &mov_ops,  },
 	{ .name = "movdqa",	.ops = &mov_ops,  },
+	{ .name = "movdqu",	.ops = &mov_ops,  },
 	{ .name = "movl",	.ops = &mov_ops,  },
 	{ .name = "movq",	.ops = &mov_ops,  },
+	{ .name = "movsd",	.ops = &mov_ops,  },
 	{ .name = "movslq",	.ops = &mov_ops,  },
+	{ .name = "movss",	.ops = &mov_ops,  },
+	{ .name = "movupd",	.ops = &mov_ops,  },
+	{ .name = "movups",	.ops = &mov_ops,  },
+	{ .name = "movw",	.ops = &mov_ops,  },
 	{ .name = "movzbl",	.ops = &mov_ops,  },
 	{ .name = "movzwl",	.ops = &mov_ops,  },
+	{ .name = "mulsd",	.ops = &mov_ops,  },
+	{ .name = "mulss",	.ops = &mov_ops,  },
 	{ .name = "nop",	.ops = &nop_ops,  },
 	{ .name = "nopl",	.ops = &nop_ops,  },
 	{ .name = "nopw",	.ops = &nop_ops,  },
 	{ .name = "or",		.ops = &mov_ops,  },
+	{ .name = "orb",	.ops = &mov_ops,  },
 	{ .name = "orl",	.ops = &mov_ops,  },
+	{ .name = "orps",	.ops = &mov_ops,  },
+	{ .name = "orq",	.ops = &mov_ops,  },
+	{ .name = "pand",	.ops = &mov_ops,  },
+	{ .name = "paddq",	.ops = &mov_ops,  },
+	{ .name = "pcmpeqb",	.ops = &mov_ops,  },
+	{ .name = "por",	.ops = &mov_ops,  },
+	{ .name = "rclb",	.ops = &mov_ops,  },
+	{ .name = "rcll",	.ops = &mov_ops,  },
+	{ .name = "retq",	.ops = &ret_ops,  },
+	{ .name = "sbb",	.ops = &mov_ops,  },
+	{ .name = "sbbl",	.ops = &mov_ops,  },
+	{ .name = "sete",	.ops = &mov_ops,  },
+	{ .name = "sub",	.ops = &mov_ops,  },
+	{ .name = "subl",	.ops = &mov_ops,  },
+	{ .name = "subq",	.ops = &mov_ops,  },
+	{ .name = "subsd",	.ops = &mov_ops,  },
+	{ .name = "subw",	.ops = &mov_ops,  },
 	{ .name = "test",	.ops = &mov_ops,  },
 	{ .name = "testb",	.ops = &mov_ops,  },
 	{ .name = "testl",	.ops = &mov_ops,  },
+	{ .name = "ucomisd",	.ops = &mov_ops,  },
+	{ .name = "ucomiss",	.ops = &mov_ops,  },
+	{ .name = "vaddsd",	.ops = &mov_ops,  },
+	{ .name = "vandpd",	.ops = &mov_ops,  },
+	{ .name = "vmovdqa",	.ops = &mov_ops,  },
+	{ .name = "vmovq",	.ops = &mov_ops,  },
+	{ .name = "vmovsd",	.ops = &mov_ops,  },
+	{ .name = "vmulsd",	.ops = &mov_ops,  },
+	{ .name = "vorpd",	.ops = &mov_ops,  },
+	{ .name = "vsubsd",	.ops = &mov_ops,  },
+	{ .name = "vucomisd",	.ops = &mov_ops,  },
 	{ .name = "xadd",	.ops = &mov_ops,  },
 	{ .name = "xbeginl",	.ops = &jump_ops, },
 	{ .name = "xbeginq",	.ops = &jump_ops, },
-	{ .name = "retq",	.ops = &ret_ops,  },
+	{ .name = "xchg",	.ops = &mov_ops,  },
+	{ .name = "xor",	.ops = &mov_ops, },
+	{ .name = "xorb",	.ops = &mov_ops, },
+	{ .name = "xorpd",	.ops = &mov_ops, },
+	{ .name = "xorps",	.ops = &mov_ops, },
 };
 
 static bool x86__ins_is_fused(struct arch *arch, const char *ins1,

From 92183a42898dc400b89da35685d1814ac6acd3d8 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 5 Apr 2018 16:18:03 +0300
Subject: [PATCH 074/288] fsnotify: fix ignore mask logic in send_to_group()

The ignore mask logic in send_to_group() does not match the logic
in fanotify_should_send_event(). In the latter, a vfsmount mark ignore
mask precedes an inode mark mask and in the former, it does not.

That difference may cause events to be sent to fanotify backend for no
reason. Fix the logic in send_to_group() to match that of
fanotify_should_send_event().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fsnotify.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 219b269c737e..613ec7e5a465 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -192,8 +192,9 @@ static int send_to_group(struct inode *to_tell,
 			 struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_group *group = NULL;
-	__u32 inode_test_mask = 0;
-	__u32 vfsmount_test_mask = 0;
+	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+	__u32 marks_mask = 0;
+	__u32 marks_ignored_mask = 0;
 
 	if (unlikely(!inode_mark && !vfsmount_mark)) {
 		BUG();
@@ -213,29 +214,25 @@ static int send_to_group(struct inode *to_tell,
 	/* does the inode mark tell us to do something? */
 	if (inode_mark) {
 		group = inode_mark->group;
-		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
-		inode_test_mask &= inode_mark->mask;
-		inode_test_mask &= ~inode_mark->ignored_mask;
+		marks_mask |= inode_mark->mask;
+		marks_ignored_mask |= inode_mark->ignored_mask;
 	}
 
 	/* does the vfsmount_mark tell us to do something? */
 	if (vfsmount_mark) {
-		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 		group = vfsmount_mark->group;
-		vfsmount_test_mask &= vfsmount_mark->mask;
-		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
-		if (inode_mark)
-			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+		marks_mask |= vfsmount_mark->mask;
+		marks_ignored_mask |= vfsmount_mark->ignored_mask;
 	}
 
 	pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
-		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+		 " vfsmount_mark=%p marks_mask=%x marks_ignored_mask=%x"
 		 " data=%p data_is=%d cookie=%d\n",
-		 __func__, group, to_tell, mask, inode_mark,
-		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+		 __func__, group, to_tell, mask, inode_mark, vfsmount_mark,
+		 marks_mask, marks_ignored_mask, data,
 		 data_is, cookie);
 
-	if (!inode_test_mask && !vfsmount_test_mask)
+	if (!(test_mask & marks_mask & ~marks_ignored_mask))
 		return 0;
 
 	return group->ops->handle_event(group, to_tell, inode_mark,

From 8e984f8667ff4225092af734eef28a3d7bae8626 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 5 Apr 2018 16:18:04 +0300
Subject: [PATCH 075/288] fsnotify: fix typo in a comment about mark->g_list

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fsnotify_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9f1edb92c97e..e0c95c9f1e29 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -248,7 +248,7 @@ struct fsnotify_mark {
 	/* Group this mark is for. Set on mark creation, stable until last ref
 	 * is dropped */
 	struct fsnotify_group *group;
-	/* List of marks by group->i_fsnotify_marks. Also reused for queueing
+	/* List of marks by group->marks_list. Also reused for queueing
 	 * mark into destroy_list when it's waiting for the end of SRCU period
 	 * before it can be freed. [group->mark_mutex] */
 	struct list_head g_list;

From 96348e49366c6e2a5a2e62ba0350f66ef5d67ea7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 5 Apr 2018 16:18:05 +0300
Subject: [PATCH 076/288] MAINTAINERS: add an entry for FSNOTIFY infrastructure

There is alreay an entry for all the backends, but those entries do
not cover all the fsnotify files.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 MAINTAINERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4c3c17e1e163..e3e0a79e2069 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5780,6 +5780,14 @@ F:	fs/crypto/
 F:	include/linux/fscrypt*.h
 F:	Documentation/filesystems/fscrypt.rst
 
+FSNOTIFY: FILESYSTEM NOTIFICATION INFRASTRUCTURE
+M:	Jan Kara <jack@suse.cz>
+R:	Amir Goldstein <amir73il@gmail.com>
+L:	linux-fsdevel@vger.kernel.org
+S:	Maintained
+F:	fs/notify/
+F:	include/linux/fsnotify*.h
+
 FUJITSU LAPTOP EXTRAS
 M:	Jonathan Woithe <jwoithe@just42.net>
 L:	platform-driver-x86@vger.kernel.org

From 9267c430c6b6f4c0120e3c6bb847313d633f02a6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 13 Apr 2018 14:58:25 +0800
Subject: [PATCH 077/288] virtio-net: add missing virtqueue kick when flushing
 packets

We tends to batch submitting packets during XDP_TX. This requires to
kick virtqueue after a batch, we tried to do it through
xdp_do_flush_map() which only makes sense for devmap not XDP_TX. So
explicitly kick the virtqueue in this case.

Reported-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
Tested-by: Kimitoshi Takahashi <ktaka@nii.ac.jp>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec7411e..dfe78308044d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1269,7 +1269,9 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct receive_queue *rq =
 		container_of(napi, struct receive_queue, napi);
-	unsigned int received;
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	struct send_queue *sq;
+	unsigned int received, qp;
 	bool xdp_xmit = false;
 
 	virtnet_poll_cleantx(rq);
@@ -1280,8 +1282,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 	if (received < budget)
 		virtqueue_napi_complete(napi, rq->vq, received);
 
-	if (xdp_xmit)
+	if (xdp_xmit) {
+		qp = vi->curr_queue_pairs - vi->xdp_queue_pairs +
+		     smp_processor_id();
+		sq = &vi->sq[qp];
+		virtqueue_kick(sq->vq);
 		xdp_do_flush_map();
+	}
 
 	return received;
 }

From 5846c131c39b6d0add36ec19dc8650700690f930 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Thu, 12 Apr 2018 20:50:33 +0200
Subject: [PATCH 078/288] l2tp: hold reference on tunnels in netlink dumps

l2tp_tunnel_find_nth() is unsafe: no reference is held on the returned
tunnel, therefore it can be freed whenever the caller uses it.
This patch defines l2tp_tunnel_get_nth() which works similarly, but
also takes a reference on the returned tunnel. The caller then has to
drop it after it stops using the tunnel.

Convert netlink dumps to make them safe against concurrent tunnel
deletion.

Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c    | 20 ++++++++++++++++++++
 net/l2tp/l2tp_core.h    |  2 ++
 net/l2tp/l2tp_netlink.c | 11 ++++++++---
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 0fbd3ee26165..c8c4183f0f37 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -183,6 +183,26 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
 }
 EXPORT_SYMBOL_GPL(l2tp_tunnel_get);
 
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
+{
+	const struct l2tp_net *pn = l2tp_pernet(net);
+	struct l2tp_tunnel *tunnel;
+	int count = 0;
+
+	rcu_read_lock_bh();
+	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+		if (++count > nth) {
+			l2tp_tunnel_inc_refcount(tunnel);
+			rcu_read_unlock_bh();
+			return tunnel;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
+
 /* Lookup a session. A new reference is held on the returned session. */
 struct l2tp_session *l2tp_session_get(const struct net *net,
 				      struct l2tp_tunnel *tunnel,
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index ba33cbec71eb..e4896413b2b6 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -212,6 +212,8 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
 }
 
 struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
+struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
+
 void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
 
 struct l2tp_session *l2tp_session_get(const struct net *net,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index b05dbd9ffcb2..6616c9fd292f 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -487,14 +487,17 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback
 	struct net *net = sock_net(skb->sk);
 
 	for (;;) {
-		tunnel = l2tp_tunnel_find_nth(net, ti);
+		tunnel = l2tp_tunnel_get_nth(net, ti);
 		if (tunnel == NULL)
 			goto out;
 
 		if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
 					cb->nlh->nlmsg_seq, NLM_F_MULTI,
-					tunnel, L2TP_CMD_TUNNEL_GET) < 0)
+					tunnel, L2TP_CMD_TUNNEL_GET) < 0) {
+			l2tp_tunnel_dec_refcount(tunnel);
 			goto out;
+		}
+		l2tp_tunnel_dec_refcount(tunnel);
 
 		ti++;
 	}
@@ -848,7 +851,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
 
 	for (;;) {
 		if (tunnel == NULL) {
-			tunnel = l2tp_tunnel_find_nth(net, ti);
+			tunnel = l2tp_tunnel_get_nth(net, ti);
 			if (tunnel == NULL)
 				goto out;
 		}
@@ -856,6 +859,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
 		session = l2tp_session_get_nth(tunnel, si);
 		if (session == NULL) {
 			ti++;
+			l2tp_tunnel_dec_refcount(tunnel);
 			tunnel = NULL;
 			si = 0;
 			continue;
@@ -865,6 +869,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					 session, L2TP_CMD_SESSION_GET) < 0) {
 			l2tp_session_dec_refcount(session);
+			l2tp_tunnel_dec_refcount(tunnel);
 			break;
 		}
 		l2tp_session_dec_refcount(session);

From 0e0c3fee3a59a387aeecc4fca6f3a2e9615a5443 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Thu, 12 Apr 2018 20:50:34 +0200
Subject: [PATCH 079/288] l2tp: hold reference on tunnels printed in pppol2tp
 proc file

Use l2tp_tunnel_get_nth() instead of l2tp_tunnel_find_nth(), to be safe
against concurrent tunnel deletion.

Unlike sessions, we can't drop the reference held on tunnels in
pppol2tp_seq_show(). Tunnels are reused across several calls to
pppol2tp_seq_start() when iterating over sessions. These iterations
need the tunnel for accessing the next session. Therefore the only safe
moment for dropping the reference is just before searching for the next
tunnel.

Normally, the last invocation of pppol2tp_next_tunnel() doesn't find
any new tunnel, so it drops the last tunnel without taking any new
reference. However, in case of error, pppol2tp_seq_stop() is called
directly, so we have to drop the reference there.

Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 896bbca9bdaa..7d0c963680e6 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1551,16 +1551,19 @@ struct pppol2tp_seq_data {
 
 static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
 {
+	/* Drop reference taken during previous invocation */
+	if (pd->tunnel)
+		l2tp_tunnel_dec_refcount(pd->tunnel);
+
 	for (;;) {
-		pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx);
+		pd->tunnel = l2tp_tunnel_get_nth(net, pd->tunnel_idx);
 		pd->tunnel_idx++;
 
-		if (pd->tunnel == NULL)
-			break;
+		/* Only accept L2TPv2 tunnels */
+		if (!pd->tunnel || pd->tunnel->version == 2)
+			return;
 
-		/* Ignore L2TPv3 tunnels */
-		if (pd->tunnel->version < 3)
-			break;
+		l2tp_tunnel_dec_refcount(pd->tunnel);
 	}
 }
 
@@ -1609,7 +1612,14 @@ static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void pppol2tp_seq_stop(struct seq_file *p, void *v)
 {
-	/* nothing to do */
+	struct pppol2tp_seq_data *pd = v;
+
+	if (!pd || pd == SEQ_START_TOKEN)
+		return;
+
+	/* Drop reference taken by last invocation of pppol2tp_next_tunnel() */
+	if (pd->tunnel)
+		l2tp_tunnel_dec_refcount(pd->tunnel);
 }
 
 static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)

From f726214d9b23e5fce8c11937577a289a3202498f Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Thu, 12 Apr 2018 20:50:35 +0200
Subject: [PATCH 080/288] l2tp: hold reference on tunnels printed in
 l2tp/tunnels debugfs file

Use l2tp_tunnel_get_nth() instead of l2tp_tunnel_find_nth(), to be safe
against concurrent tunnel deletion.

Use the same mechanism as in l2tp_ppp.c for dropping the reference
taken by l2tp_tunnel_get_nth(). That is, drop the reference just
before looking up the next tunnel. In case of error, drop the last
accessed tunnel in l2tp_dfs_seq_stop().

That was the last use of l2tp_tunnel_find_nth().

Fixes: 0ad6614048cf ("l2tp: Add debugfs files for dumping l2tp debug info")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c    | 20 --------------------
 net/l2tp/l2tp_core.h    |  1 -
 net/l2tp/l2tp_debugfs.c | 15 +++++++++++++--
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index c8c4183f0f37..40261cb68e83 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -355,26 +355,6 @@ err_tlock:
 }
 EXPORT_SYMBOL_GPL(l2tp_session_register);
 
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth)
-{
-	struct l2tp_net *pn = l2tp_pernet(net);
-	struct l2tp_tunnel *tunnel;
-	int count = 0;
-
-	rcu_read_lock_bh();
-	list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
-		if (++count > nth) {
-			rcu_read_unlock_bh();
-			return tunnel;
-		}
-	}
-
-	rcu_read_unlock_bh();
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
-
 /*****************************************************************************
  * Receive data handling
  *****************************************************************************/
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index e4896413b2b6..c199020f8a8a 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -222,7 +222,6 @@ struct l2tp_session *l2tp_session_get(const struct net *net,
 struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
 struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
 						const char *ifname);
-struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth);
 
 int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
 		       u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 72e713da4733..b8f9d45bfeb1 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -47,7 +47,11 @@ struct l2tp_dfs_seq_data {
 
 static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
 {
-	pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx);
+	/* Drop reference taken during previous invocation */
+	if (pd->tunnel)
+		l2tp_tunnel_dec_refcount(pd->tunnel);
+
+	pd->tunnel = l2tp_tunnel_get_nth(pd->net, pd->tunnel_idx);
 	pd->tunnel_idx++;
 }
 
@@ -96,7 +100,14 @@ static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
 {
-	/* nothing to do */
+	struct l2tp_dfs_seq_data *pd = v;
+
+	if (!pd || pd == SEQ_START_TOKEN)
+		return;
+
+	/* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */
+	if (pd->tunnel)
+		l2tp_tunnel_dec_refcount(pd->tunnel);
 }
 
 static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)

From a1cc7034e33d12dc17d13fbcd7d597d552889097 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Thu, 12 Apr 2018 22:30:44 -0400
Subject: [PATCH 081/288] MIPS: io: Add barrier after register read in readX()

While a barrier is present in the writeX() functions before the register
write, a similar barrier is missing in the readX() functions after the
register read. This could allow memory accesses following readX() to
observe stale data.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/19069/
[jhogan@kernel.org: Tidy commit message]
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/include/asm/io.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h
index fd00ddafb425..a7d0b836f2f7 100644
--- a/arch/mips/include/asm/io.h
+++ b/arch/mips/include/asm/io.h
@@ -377,6 +377,8 @@ static inline type pfx##read##bwlq(const volatile void __iomem *mem)	\
 		BUG();							\
 	}								\
 									\
+	/* prevent prefetching of coherent DMA data prematurely */	\
+	rmb();								\
 	return pfx##ioswab##bwlq(__mem, __val);				\
 }
 

From c7cd882469fc5042a5c84122b4062d7f53076db7 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Fri, 13 Apr 2018 21:54:37 +0200
Subject: [PATCH 082/288] parisc: Fix missing binfmt_elf32.o build error

Commit 71d577db01a5 ("parisc: Switch to generic COMPAT_BINFMT_ELF")
removed the binfmt_elf32.c source file, but missed to drop the object
file from the list of object files the Makefile, which then results in a
build error.

Fixes: 71d577db01a5 ("parisc: Switch to generic COMPAT_BINFMT_ELF")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 arch/parisc/kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index eafd06ab59ef..e5de34d00b1a 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PA11)	+= pci-dma.o
 obj-$(CONFIG_PCI)	+= pci.o
 obj-$(CONFIG_MODULES)	+= module.o
-obj-$(CONFIG_64BIT)	+= binfmt_elf32.o sys_parisc32.o signal32.o
+obj-$(CONFIG_64BIT)	+= sys_parisc32.o signal32.o
 obj-$(CONFIG_STACKTRACE)+= stacktrace.o
 obj-$(CONFIG_AUDIT)	+= audit.o
 obj64-$(CONFIG_AUDIT)	+= compat_audit.o

From 43838a23a05fbd13e47d750d3dfd77001536dd33 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Apr 2018 13:27:52 -0400
Subject: [PATCH 083/288] random: fix crng_ready() test

The crng_init variable has three states:

0: The CRNG is not initialized at all
1: The CRNG has a small amount of entropy, hopefully good enough for
   early-boot, non-cryptographical use cases
2: The CRNG is fully initialized and we are sure it is safe for
   cryptographic use cases.

The crng_ready() function should only return true once we are in the
last state.  This addresses CVE-2018-1108.

Reported-by: Jann Horn <jannh@google.com>
Fixes: e192be9d9a30 ("random: replace non-blocking pool...")
Cc: stable@kernel.org # 4.8+
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jann Horn <jannh@google.com>
---
 drivers/char/random.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index e027e7fa1472..c8ec1e70abde 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -427,7 +427,7 @@ struct crng_state primary_crng = {
  * its value (from 0->1->2).
  */
 static int crng_init = 0;
-#define crng_ready() (likely(crng_init > 0))
+#define crng_ready() (likely(crng_init > 1))
 static int crng_init_cnt = 0;
 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
 static void _extract_crng(struct crng_state *crng,
@@ -794,7 +794,7 @@ static int crng_fast_load(const char *cp, size_t len)
 
 	if (!spin_trylock_irqsave(&primary_crng.lock, flags))
 		return 0;
-	if (crng_ready()) {
+	if (crng_init != 0) {
 		spin_unlock_irqrestore(&primary_crng.lock, flags);
 		return 0;
 	}
@@ -856,7 +856,7 @@ static void _extract_crng(struct crng_state *crng,
 {
 	unsigned long v, flags;
 
-	if (crng_init > 1 &&
+	if (crng_ready() &&
 	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
 		crng_reseed(crng, crng == &primary_crng ? &input_pool : NULL);
 	spin_lock_irqsave(&crng->lock, flags);
@@ -1139,7 +1139,7 @@ void add_interrupt_randomness(int irq, int irq_flags)
 	fast_mix(fast_pool);
 	add_interrupt_bench(cycles);
 
-	if (!crng_ready()) {
+	if (unlikely(crng_init == 0)) {
 		if ((fast_pool->count >= 64) &&
 		    crng_fast_load((char *) fast_pool->pool,
 				   sizeof(fast_pool->pool))) {
@@ -2212,7 +2212,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count,
 {
 	struct entropy_store *poolp = &input_pool;
 
-	if (!crng_ready()) {
+	if (unlikely(crng_init == 0)) {
 		crng_fast_load(buffer, count);
 		return;
 	}

From dc12baacb95f205948f64dc936a47d89ee110117 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Apr 2018 14:58:27 -0400
Subject: [PATCH 084/288] random: use a different mixing algorithm for
 add_device_randomness()

add_device_randomness() use of crng_fast_load() was highly
problematic.  Some callers of add_device_randomness() can pass in a
large amount of static information.  This would immediately promote
the crng_init state from 0 to 1, without really doing much to
initialize the primary_crng's internal state with something even
vaguely unpredictable.

Since we don't have the speed constraints of add_interrupt_randomness(),
we can do a better job mixing in the what unpredictability a device
driver or architecture maintainer might see fit to give us, and do it
in a way which does not bump the crng_init_cnt variable.

Also, since add_device_randomness() doesn't bump any entropy
accounting in crng_init state 0, mix the device randomness into the
input_pool entropy pool as well.  This is related to CVE-2018-1108.

Reported-by: Jann Horn <jannh@google.com>
Fixes: ee7998c50c26 ("random: do not ignore early device randomness")
Cc: stable@kernel.org # 4.13+
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c | 55 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index c8ec1e70abde..6baa828c0493 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -787,6 +787,10 @@ static void crng_initialize(struct crng_state *crng)
 	crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
 }
 
+/*
+ * crng_fast_load() can be called by code in the interrupt service
+ * path.  So we can't afford to dilly-dally.
+ */
 static int crng_fast_load(const char *cp, size_t len)
 {
 	unsigned long flags;
@@ -813,6 +817,51 @@ static int crng_fast_load(const char *cp, size_t len)
 	return 1;
 }
 
+/*
+ * crng_slow_load() is called by add_device_randomness, which has two
+ * attributes.  (1) We can't trust the buffer passed to it is
+ * guaranteed to be unpredictable (so it might not have any entropy at
+ * all), and (2) it doesn't have the performance constraints of
+ * crng_fast_load().
+ *
+ * So we do something more comprehensive which is guaranteed to touch
+ * all of the primary_crng's state, and which uses a LFSR with a
+ * period of 255 as part of the mixing algorithm.  Finally, we do
+ * *not* advance crng_init_cnt since buffer we may get may be something
+ * like a fixed DMI table (for example), which might very well be
+ * unique to the machine, but is otherwise unvarying.
+ */
+static int crng_slow_load(const char *cp, size_t len)
+{
+	unsigned long		flags;
+	static unsigned char	lfsr = 1;
+	unsigned char		tmp;
+	unsigned		i, max = CHACHA20_KEY_SIZE;
+	const char *		src_buf = cp;
+	char *			dest_buf = (char *) &primary_crng.state[4];
+
+	if (!spin_trylock_irqsave(&primary_crng.lock, flags))
+		return 0;
+	if (crng_init != 0) {
+		spin_unlock_irqrestore(&primary_crng.lock, flags);
+		return 0;
+	}
+	if (len > max)
+		max = len;
+
+	for (i = 0; i < max ; i++) {
+		tmp = lfsr;
+		lfsr >>= 1;
+		if (tmp & 1)
+			lfsr ^= 0xE1;
+		tmp = dest_buf[i % CHACHA20_KEY_SIZE];
+		dest_buf[i % CHACHA20_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
+		lfsr += (tmp << 3) | (tmp >> 5);
+	}
+	spin_unlock_irqrestore(&primary_crng.lock, flags);
+	return 1;
+}
+
 static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 {
 	unsigned long	flags;
@@ -981,10 +1030,8 @@ void add_device_randomness(const void *buf, unsigned int size)
 	unsigned long time = random_get_entropy() ^ jiffies;
 	unsigned long flags;
 
-	if (!crng_ready()) {
-		crng_fast_load(buf, size);
-		return;
-	}
+	if (!crng_ready() && size)
+		crng_slow_load(buf, size);
 
 	trace_add_device_randomness(size, _RET_IP_);
 	spin_lock_irqsave(&input_pool.lock, flags);

From 8ef35c866f8862df074a49a93b0309725812dea8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Apr 2018 15:23:56 -0400
Subject: [PATCH 085/288] random: set up the NUMA crng instances after the CRNG
 is fully initialized

Until the primary_crng is fully initialized, don't initialize the NUMA
crng nodes.  Otherwise users of /dev/urandom on NUMA systems before
the CRNG is fully initialized can get very bad quality randomness.  Of
course everyone should move to getrandom(2) where this won't be an
issue, but there's a lot of legacy code out there.  This related to
CVE-2018-1108.

Reported-by: Jann Horn <jannh@google.com>
Fixes: 1e7f583af67b ("random: make /dev/urandom scalable for silly...")
Cc: stable@kernel.org # 4.8+
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c | 46 +++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6baa828c0493..02d792f7933f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -787,6 +787,32 @@ static void crng_initialize(struct crng_state *crng)
 	crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
 }
 
+#ifdef CONFIG_NUMA
+static void numa_crng_init(void)
+{
+	int i;
+	struct crng_state *crng;
+	struct crng_state **pool;
+
+	pool = kcalloc(nr_node_ids, sizeof(*pool), GFP_KERNEL|__GFP_NOFAIL);
+	for_each_online_node(i) {
+		crng = kmalloc_node(sizeof(struct crng_state),
+				    GFP_KERNEL | __GFP_NOFAIL, i);
+		spin_lock_init(&crng->lock);
+		crng_initialize(crng);
+		pool[i] = crng;
+	}
+	mb();
+	if (cmpxchg(&crng_node_pool, NULL, pool)) {
+		for_each_node(i)
+			kfree(pool[i]);
+		kfree(pool);
+	}
+}
+#else
+static void numa_crng_init(void) {}
+#endif
+
 /*
  * crng_fast_load() can be called by code in the interrupt service
  * path.  So we can't afford to dilly-dally.
@@ -893,6 +919,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	spin_unlock_irqrestore(&primary_crng.lock, flags);
 	if (crng == &primary_crng && crng_init < 2) {
 		invalidate_batched_entropy();
+		numa_crng_init();
 		crng_init = 2;
 		process_random_ready_list();
 		wake_up_interruptible(&crng_init_wait);
@@ -1727,28 +1754,9 @@ static void init_std_data(struct entropy_store *r)
  */
 static int rand_initialize(void)
 {
-#ifdef CONFIG_NUMA
-	int i;
-	struct crng_state *crng;
-	struct crng_state **pool;
-#endif
-
 	init_std_data(&input_pool);
 	init_std_data(&blocking_pool);
 	crng_initialize(&primary_crng);
-
-#ifdef CONFIG_NUMA
-	pool = kcalloc(nr_node_ids, sizeof(*pool), GFP_KERNEL|__GFP_NOFAIL);
-	for_each_online_node(i) {
-		crng = kmalloc_node(sizeof(struct crng_state),
-				    GFP_KERNEL | __GFP_NOFAIL, i);
-		spin_lock_init(&crng->lock);
-		crng_initialize(crng);
-		pool[i] = crng;
-	}
-	mb();
-	crng_node_pool = pool;
-#endif
 	return 0;
 }
 early_initcall(rand_initialize);

From 0bb29a849a6433b72e249eea7695477b02056e94 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 12 Apr 2018 00:50:45 -0400
Subject: [PATCH 086/288] random: crng_reseed() should lock the crng instance
 that it is modifying

Reported-by: Jann Horn <jannh@google.com>
Fixes: 1e7f583af67b ("random: make /dev/urandom scalable for silly...")
Cc: stable@kernel.org # 4.8+
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jann Horn <jannh@google.com>
---
 drivers/char/random.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 02d792f7933f..898233f594b4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -906,7 +906,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 		_crng_backtrack_protect(&primary_crng, buf.block,
 					CHACHA20_KEY_SIZE);
 	}
-	spin_lock_irqsave(&primary_crng.lock, flags);
+	spin_lock_irqsave(&crng->lock, flags);
 	for (i = 0; i < 8; i++) {
 		unsigned long	rv;
 		if (!arch_get_random_seed_long(&rv) &&
@@ -916,7 +916,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	}
 	memzero_explicit(&buf, sizeof(buf));
 	crng->init_time = jiffies;
-	spin_unlock_irqrestore(&primary_crng.lock, flags);
+	spin_unlock_irqrestore(&crng->lock, flags);
 	if (crng == &primary_crng && crng_init < 2) {
 		invalidate_batched_entropy();
 		numa_crng_init();

From d848e5f8e1ebdb227d045db55fe4f825e82965fa Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Apr 2018 16:32:17 -0400
Subject: [PATCH 087/288] random: add new ioctl RNDRESEEDCRNG

Add a new ioctl which forces the the crng to be reseeded.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 drivers/char/random.c       | 13 ++++++++++++-
 include/uapi/linux/random.h |  3 +++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 898233f594b4..3cd3aae24d6d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -429,6 +429,7 @@ struct crng_state primary_crng = {
 static int crng_init = 0;
 #define crng_ready() (likely(crng_init > 1))
 static int crng_init_cnt = 0;
+static unsigned long crng_global_init_time = 0;
 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
 static void _extract_crng(struct crng_state *crng,
 			  __u32 out[CHACHA20_BLOCK_WORDS]);
@@ -933,7 +934,8 @@ static void _extract_crng(struct crng_state *crng,
 	unsigned long v, flags;
 
 	if (crng_ready() &&
-	    time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
+	    (time_after(crng_global_init_time, crng->init_time) ||
+	     time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)))
 		crng_reseed(crng, crng == &primary_crng ? &input_pool : NULL);
 	spin_lock_irqsave(&crng->lock, flags);
 	if (arch_get_random_long(&v))
@@ -1757,6 +1759,7 @@ static int rand_initialize(void)
 	init_std_data(&input_pool);
 	init_std_data(&blocking_pool);
 	crng_initialize(&primary_crng);
+	crng_global_init_time = jiffies;
 	return 0;
 }
 early_initcall(rand_initialize);
@@ -1930,6 +1933,14 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		input_pool.entropy_count = 0;
 		blocking_pool.entropy_count = 0;
 		return 0;
+	case RNDRESEEDCRNG:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (crng_init < 2)
+			return -ENODATA;
+		crng_reseed(&primary_crng, NULL);
+		crng_global_init_time = jiffies - 1;
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index c34f4490d025..26ee91300e3e 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -35,6 +35,9 @@
 /* Clear the entropy pool and associated counters.  (Superuser only.) */
 #define RNDCLEARPOOL	_IO( 'R', 0x06 )
 
+/* Reseed CRNG.  (Superuser only.) */
+#define RNDRESEEDCRNG	_IO( 'R', 0x07 )
+
 struct rand_pool_info {
 	int	entropy_count;
 	int	buf_size;

From 494bef4c2a087876e75f3e95f7f63b06d6a65921 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 13 Apr 2018 19:17:22 +0100
Subject: [PATCH 088/288] sfc: insert ARFS filters with replace_equal=true

Necessary to allow redirecting a flow when the application moves.

Fixes: 3af0f34290f6 ("sfc: replace asynchronous filter operations")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 95682831484e..13b0eb71dbf3 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -851,7 +851,7 @@ static void efx_filter_rfs_work(struct work_struct *data)
 	struct efx_channel *channel = efx_get_channel(efx, req->rxq_index);
 	int rc;
 
-	rc = efx->type->filter_insert(efx, &req->spec, false);
+	rc = efx->type->filter_insert(efx, &req->spec, true);
 	if (rc >= 0) {
 		/* Remember this so we can check whether to expire the filter
 		 * later.

From a7f80189e41c96c0c6210e9198a31859c91eb3e5 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 13 Apr 2018 19:17:49 +0100
Subject: [PATCH 089/288] sfc: pass the correctly bogus filter_id to
 rps_may_expire_flow()

When we inserted an ARFS filter for ndo_rx_flow_steer(), we didn't know
 what the filter ID would be, so we just returned 0.  Thus, we must also
 pass 0 as the filter ID when calling rps_may_expire_flow() for it, and
 rely on the flow_id to identify what we're talking about.

Fixes: 3af0f34290f6 ("sfc: replace asynchronous filter operations")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ef10.c  | 3 +--
 drivers/net/ethernet/sfc/farch.c | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 50daad0a1482..36f24c7e553a 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -4776,8 +4776,7 @@ static bool efx_ef10_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id,
 		goto out_unlock;
 	}
 
-	if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id,
-				 flow_id, filter_idx)) {
+	if (!rps_may_expire_flow(efx->net_dev, spec->dmaq_id, flow_id, 0)) {
 		ret = false;
 		goto out_unlock;
 	}
diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c
index 4a19c7efdf8d..7174ef5e5c5e 100644
--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -2912,7 +2912,7 @@ bool efx_farch_filter_rfs_expire_one(struct efx_nic *efx, u32 flow_id,
 	if (test_bit(index, table->used_bitmap) &&
 	    table->spec[index].priority == EFX_FILTER_PRI_HINT &&
 	    rps_may_expire_flow(efx->net_dev, table->spec[index].dmaq_id,
-				flow_id, index)) {
+				flow_id, 0)) {
 		efx_farch_filter_table_clear_entry(efx, table, index);
 		ret = true;
 	}

From f993740ee05821307eca03d23d468895740450f8 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 13 Apr 2018 19:18:09 +0100
Subject: [PATCH 090/288] sfc: limit ARFS workitems in flight per channel

A misconfigured system (e.g. with all interrupts affinitised to all CPUs)
 may produce a storm of ARFS steering events.  With the existing sfc ARFS
 implementation, that could create a backlog of workitems that grinds the
 system to a halt.  To prevent this, limit the number of workitems that
 may be in flight for a given SFC device to 8 (EFX_RPS_MAX_IN_FLIGHT), and
 return EBUSY from our ndo_rx_flow_steer method if the limit is reached.
Given this limit, also store the workitems in an array of slots within the
 struct efx_nic, rather than dynamically allocating for each request.
The limit should not negatively impact performance, because it is only
 likely to be hit in cases where ARFS will be ineffective anyway.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/net_driver.h | 25 ++++++++++++
 drivers/net/ethernet/sfc/rx.c         | 58 ++++++++++++++-------------
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 5e379a83c729..eea3808b3f25 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -733,6 +733,27 @@ struct efx_rss_context {
 	u32 rx_indir_table[128];
 };
 
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * struct efx_async_filter_insertion - Request to asynchronously insert a filter
+ * @net_dev: Reference to the netdevice
+ * @spec: The filter to insert
+ * @work: Workitem for this request
+ * @rxq_index: Identifies the channel for which this request was made
+ * @flow_id: Identifies the kernel-side flow for which this request was made
+ */
+struct efx_async_filter_insertion {
+	struct net_device *net_dev;
+	struct efx_filter_spec spec;
+	struct work_struct work;
+	u16 rxq_index;
+	u32 flow_id;
+};
+
+/* Maximum number of ARFS workitems that may be in flight on an efx_nic */
+#define EFX_RPS_MAX_IN_FLIGHT	8
+#endif /* CONFIG_RFS_ACCEL */
+
 /**
  * struct efx_nic - an Efx NIC
  * @name: Device name (net device name or bus id before net device registered)
@@ -850,6 +871,8 @@ struct efx_rss_context {
  * @rps_expire_channel: Next channel to check for expiry
  * @rps_expire_index: Next index to check for expiry in
  *	@rps_expire_channel's @rps_flow_id
+ * @rps_slot_map: bitmap of in-flight entries in @rps_slot
+ * @rps_slot: array of ARFS insertion requests for efx_filter_rfs_work()
  * @active_queues: Count of RX and TX queues that haven't been flushed and drained.
  * @rxq_flush_pending: Count of number of receive queues that need to be flushed.
  *	Decremented when the efx_flush_rx_queue() is called.
@@ -1004,6 +1027,8 @@ struct efx_nic {
 	struct mutex rps_mutex;
 	unsigned int rps_expire_channel;
 	unsigned int rps_expire_index;
+	unsigned long rps_slot_map;
+	struct efx_async_filter_insertion rps_slot[EFX_RPS_MAX_IN_FLIGHT];
 #endif
 
 	atomic_t active_queues;
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 13b0eb71dbf3..9c593c661cbf 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -827,28 +827,13 @@ MODULE_PARM_DESC(rx_refill_threshold,
 
 #ifdef CONFIG_RFS_ACCEL
 
-/**
- * struct efx_async_filter_insertion - Request to asynchronously insert a filter
- * @net_dev: Reference to the netdevice
- * @spec: The filter to insert
- * @work: Workitem for this request
- * @rxq_index: Identifies the channel for which this request was made
- * @flow_id: Identifies the kernel-side flow for which this request was made
- */
-struct efx_async_filter_insertion {
-	struct net_device *net_dev;
-	struct efx_filter_spec spec;
-	struct work_struct work;
-	u16 rxq_index;
-	u32 flow_id;
-};
-
 static void efx_filter_rfs_work(struct work_struct *data)
 {
 	struct efx_async_filter_insertion *req = container_of(data, struct efx_async_filter_insertion,
 							      work);
 	struct efx_nic *efx = netdev_priv(req->net_dev);
 	struct efx_channel *channel = efx_get_channel(efx, req->rxq_index);
+	int slot_idx = req - efx->rps_slot;
 	int rc;
 
 	rc = efx->type->filter_insert(efx, &req->spec, true);
@@ -878,8 +863,8 @@ static void efx_filter_rfs_work(struct work_struct *data)
 	}
 
 	/* Release references */
+	clear_bit(slot_idx, &efx->rps_slot_map);
 	dev_put(req->net_dev);
-	kfree(req);
 }
 
 int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
@@ -888,22 +873,36 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_async_filter_insertion *req;
 	struct flow_keys fk;
+	int slot_idx;
+	int rc;
 
-	if (flow_id == RPS_FLOW_ID_INVALID)
-		return -EINVAL;
+	/* find a free slot */
+	for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++)
+		if (!test_and_set_bit(slot_idx, &efx->rps_slot_map))
+			break;
+	if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT)
+		return -EBUSY;
 
-	if (!skb_flow_dissect_flow_keys(skb, &fk, 0))
-		return -EPROTONOSUPPORT;
+	if (flow_id == RPS_FLOW_ID_INVALID) {
+		rc = -EINVAL;
+		goto out_clear;
+	}
 
-	if (fk.basic.n_proto != htons(ETH_P_IP) && fk.basic.n_proto != htons(ETH_P_IPV6))
-		return -EPROTONOSUPPORT;
-	if (fk.control.flags & FLOW_DIS_IS_FRAGMENT)
-		return -EPROTONOSUPPORT;
+	if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
 
-	req = kmalloc(sizeof(*req), GFP_ATOMIC);
-	if (!req)
-		return -ENOMEM;
+	if (fk.basic.n_proto != htons(ETH_P_IP) && fk.basic.n_proto != htons(ETH_P_IPV6)) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
+	if (fk.control.flags & FLOW_DIS_IS_FRAGMENT) {
+		rc = -EPROTONOSUPPORT;
+		goto out_clear;
+	}
 
+	req = efx->rps_slot + slot_idx;
 	efx_filter_init_rx(&req->spec, EFX_FILTER_PRI_HINT,
 			   efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
 			   rxq_index);
@@ -933,6 +932,9 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	req->flow_id = flow_id;
 	schedule_work(&req->work);
 	return 0;
+out_clear:
+	clear_bit(slot_idx, &efx->rps_slot_map);
+	return rc;
 }
 
 bool __efx_filter_rfs_expire(struct efx_nic *efx, unsigned int quota)

From c246fd333f84e6a0a8572f991637aa102f5e1865 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@foxmail.com>
Date: Sun, 15 Apr 2018 16:07:12 +0800
Subject: [PATCH 091/288] filter.txt: update 'tools/net/' to 'tools/bpf/'

The tools are located at tootls/bpf/ instead of tools/net/.
Update the filter.txt doc.

Signed-off-by: Wang Sheng-Hui <shhuiw@foxmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/filter.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index a4508ec1816b..fd55c7de9991 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -169,7 +169,7 @@ access to BPF code as well.
 BPF engine and instruction set
 ------------------------------
 
-Under tools/net/ there's a small helper tool called bpf_asm which can
+Under tools/bpf/ there's a small helper tool called bpf_asm which can
 be used to write low-level filters for example scenarios mentioned in the
 previous section. Asm-like syntax mentioned here has been implemented in
 bpf_asm and will be used for further explanations (instead of dealing with
@@ -359,7 +359,7 @@ $ ./bpf_asm -c foo
 In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF
 filters that might not be obvious at first, it's good to test filters before
 attaching to a live system. For that purpose, there's a small tool called
-bpf_dbg under tools/net/ in the kernel source directory. This debugger allows
+bpf_dbg under tools/bpf/ in the kernel source directory. This debugger allows
 for testing BPF filters against given pcap files, single stepping through the
 BPF code on the pcap's packets and to do BPF machine register dumps.
 
@@ -483,7 +483,7 @@ Example output from dmesg:
 [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
 [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
 
-In the kernel source tree under tools/net/, there's bpf_jit_disasm for
+In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
 generating disassembly out of the kernel log's hexdump:
 
 # ./bpf_jit_disasm

From 60cc43fc888428bb2f18f08997432d426a243338 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 15 Apr 2018 18:24:20 -0700
Subject: [PATCH 092/288] Linux 4.17-rc1

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 74567b0ec2f0..e811e0c509c5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 4
-PATCHLEVEL = 16
+PATCHLEVEL = 17
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From a24cd490739586a7d2da3549a1844e1d7c4f4fc4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 2 Apr 2018 23:50:31 -0400
Subject: [PATCH 093/288] hypfs_kill_super(): deal with failed allocations

hypfs_fill_super() might fail to allocate sbi; hypfs_kill_super()
should not oops on that.

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/s390/hypfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 43bbe63e2992..06b513d192b9 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -320,7 +320,7 @@ static void hypfs_kill_super(struct super_block *sb)
 
 	if (sb->s_root)
 		hypfs_delete_tree(sb->s_root);
-	if (sb_info->update_file)
+	if (sb_info && sb_info->update_file)
 		hypfs_remove(sb_info->update_file);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;

From c66b23c2840446a82c389e4cb1a12eb2a71fa2e4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 2 Apr 2018 23:56:44 -0400
Subject: [PATCH 094/288] jffs2_kill_sb(): deal with failed allocations

jffs2_fill_super() might fail to allocate jffs2_sb_info;
jffs2_kill_sb() must survive that.

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f60dee7faf03..87bdf0f4cba1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -342,7 +342,7 @@ static void jffs2_put_super (struct super_block *sb)
 static void jffs2_kill_sb(struct super_block *sb)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	if (!sb_rdonly(sb))
+	if (c && !sb_rdonly(sb))
 		jffs2_stop_garbage_collect_thread(c);
 	kill_mtd_super(sb);
 	kfree(c);

From 659038428cb43a66e3eff71e2c845c9de3611a98 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Apr 2018 00:13:17 -0400
Subject: [PATCH 095/288] orangefs_kill_sb(): deal with allocation failures

orangefs_fill_sb() might've failed to allocate ORANGEFS_SB(s); don't
oops in that case.

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/orangefs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 3ae5fdba0225..10796d3fe27d 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -579,6 +579,11 @@ void orangefs_kill_sb(struct super_block *sb)
 	/* provided sb cleanup */
 	kill_anon_super(sb);
 
+	if (!ORANGEFS_SB(sb)) {
+		mutex_lock(&orangefs_request_mutex);
+		mutex_unlock(&orangefs_request_mutex);
+		return;
+	}
 	/*
 	 * issue the unmount to userspace to tell it to remove the
 	 * dynamic mount info it has for this superblock

From 4a3877c4cedd95543f8726b0a98743ed8db0c0fb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Apr 2018 01:15:46 -0400
Subject: [PATCH 096/288] rpc_pipefs: fix double-dput()

if we ever hit rpc_gssd_dummy_depopulate() dentry passed to
it has refcount equal to 1.  __rpc_rmpipe() drops it and
dput() done after that hits an already freed dentry.

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sunrpc/rpc_pipe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 0f08934b2cea..c81ef5e6c981 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1375,6 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
 	struct dentry *clnt_dir = pipe_dentry->d_parent;
 	struct dentry *gssd_dir = clnt_dir->d_parent;
 
+	dget(pipe_dentry);
 	__rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
 	__rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
 	__rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);

From 8e04944f0ea8b838399049bdcda920ab36ae3b04 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 4 Apr 2018 19:53:07 +0900
Subject: [PATCH 097/288] mm,vmscan: Allow preallocating memory for
 register_shrinker().

syzbot is catching so many bugs triggered by commit 9ee332d99e4d5a97
("sget(): handle failures of register_shrinker()"). That commit expected
that calling kill_sb() from deactivate_locked_super() without successful
fill_super() is safe, but the reality was different; some callers assign
attributes which are needed for kill_sb() after sget() succeeds.

For example, [1] is a report where sb->s_mode (which seems to be either
FMODE_READ | FMODE_EXCL | FMODE_WRITE or FMODE_READ | FMODE_EXCL) is not
assigned unless sget() succeeds. But it does not worth complicate sget()
so that register_shrinker() failure path can safely call
kill_block_super() via kill_sb(). Making alloc_super() fail if memory
allocation for register_shrinker() failed is much simpler. Let's avoid
calling deactivate_locked_super() from sget_userns() by preallocating
memory for the shrinker and making register_shrinker() in sget_userns()
never fail.

[1] https://syzkaller.appspot.com/bug?id=588996a25a2587be2e3a54e8646728fb9cae44e7

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+5a170e19c963a2e0df79@syzkaller.appspotmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c               |  9 ++++-----
 include/linux/shrinker.h |  7 +++++--
 mm/vmscan.c              | 21 ++++++++++++++++++++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 5fa9a8d8d865..122c402049a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -167,6 +167,7 @@ static void destroy_unused_super(struct super_block *s)
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
+	free_prealloced_shrinker(&s->s_shrink);
 	/* no delays needed */
 	destroy_super_work(&s->destroy_work);
 }
@@ -252,6 +253,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+	if (prealloc_shrinker(&s->s_shrink))
+		goto fail;
 	return s;
 
 fail:
@@ -518,11 +521,7 @@ retry:
 	hlist_add_head(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
-	err = register_shrinker(&s->s_shrink);
-	if (err) {
-		deactivate_locked_super(s);
-		s = ERR_PTR(err);
-	}
+	register_shrinker_prepared(&s->s_shrink);
 	return s;
 }
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 388ff2936a87..6794490f25b2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -75,6 +75,9 @@ struct shrinker {
 #define SHRINKER_NUMA_AWARE	(1 << 0)
 #define SHRINKER_MEMCG_AWARE	(1 << 1)
 
-extern int register_shrinker(struct shrinker *);
-extern void unregister_shrinker(struct shrinker *);
+extern int prealloc_shrinker(struct shrinker *shrinker);
+extern void register_shrinker_prepared(struct shrinker *shrinker);
+extern int register_shrinker(struct shrinker *shrinker);
+extern void unregister_shrinker(struct shrinker *shrinker);
+extern void free_prealloced_shrinker(struct shrinker *shrinker);
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8b920ce3ae02..9b697323a88c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -303,7 +303,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
 /*
  * Add a shrinker callback to be called from the vm.
  */
-int register_shrinker(struct shrinker *shrinker)
+int prealloc_shrinker(struct shrinker *shrinker)
 {
 	size_t size = sizeof(*shrinker->nr_deferred);
 
@@ -313,10 +313,29 @@ int register_shrinker(struct shrinker *shrinker)
 	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
 	if (!shrinker->nr_deferred)
 		return -ENOMEM;
+	return 0;
+}
 
+void free_prealloced_shrinker(struct shrinker *shrinker)
+{
+	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
+}
+
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+}
+
+int register_shrinker(struct shrinker *shrinker)
+{
+	int err = prealloc_shrinker(shrinker);
+
+	if (err)
+		return err;
+	register_shrinker_prepared(shrinker);
 	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);

From dccccd332d028f57358a8b64ca88e691fc8be5b7 Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Fri, 13 Apr 2018 18:22:14 +0200
Subject: [PATCH 098/288] s390/sclp: avoid potential usage of uninitialized
 value

sclp_early_printk could be used before .bss section is zeroed
(i.e. from als.c during the decompressor phase), therefore values used
by sclp_early_printk should be located in the .data section.

Another reason for that is to avoid potential initrd corruption, if some
code in future would use sclp_early_printk before initrd is moved from
possibly overlapping with .bss section region to a safe location.

Fixes: 0b0d1173d8ae ("s390/sclp: 32 bit event mask compatibility mode")
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/char/sclp_early_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/char/sclp_early_core.c b/drivers/s390/char/sclp_early_core.c
index 5f8d9ea69ebd..eceba3858cef 100644
--- a/drivers/s390/char/sclp_early_core.c
+++ b/drivers/s390/char/sclp_early_core.c
@@ -18,7 +18,7 @@ int sclp_init_state __section(.data) = sclp_init_state_uninitialized;
  * Used to keep track of the size of the event masks. Qemu until version 2.11
  * only supports 4 and needs a workaround.
  */
-bool sclp_mask_compat_mode;
+bool sclp_mask_compat_mode __section(.data);
 
 void sclp_early_wait_irq(void)
 {

From 760dd0eeaec1689430243ead14e5a429613d8c52 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 3 Apr 2018 11:08:52 +0200
Subject: [PATCH 099/288] s390/smsgiucv: disable SMSG on module unload

The module exit function of the smsgiucv module uses the incorrect CP
command to disable SMSG messages. The correct command is "SET SMSG OFF".
Use it.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/net/smsgiucv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/net/smsgiucv.c b/drivers/s390/net/smsgiucv.c
index 3b0c8b8a7634..066b5c3aaae6 100644
--- a/drivers/s390/net/smsgiucv.c
+++ b/drivers/s390/net/smsgiucv.c
@@ -176,7 +176,7 @@ static struct device_driver smsg_driver = {
 
 static void __exit smsg_exit(void)
 {
-	cpcmd("SET SMSG IUCV", NULL, 0, NULL);
+	cpcmd("SET SMSG OFF", NULL, 0, NULL);
 	device_unregister(smsg_dev);
 	iucv_unregister(&smsg_handler, 1);
 	driver_unregister(&smsg_driver);

From 15ceb8c936d13d940ca9e53996fbd05a26ce96db Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Tue, 27 Jun 2017 12:44:11 +0200
Subject: [PATCH 100/288] s390/kexec_file: Prepare setup.h for kexec_file_load

kexec_file_load needs to prepare the new kernels before they are loaded.
For that it has to know the offsets in head.S, e.g. to register the new
command line. Unfortunately there are no macros right now defining those
offsets. Define them now.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/setup.h | 40 +++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 124154fdfc97..9c30ebe046f3 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  *  S390 version
- *    Copyright IBM Corp. 1999, 2010
+ *    Copyright IBM Corp. 1999, 2017
  */
 #ifndef _ASM_S390_SETUP_H
 #define _ASM_S390_SETUP_H
@@ -37,17 +37,31 @@
 #define LPP_MAGIC		_BITUL(31)
 #define LPP_PID_MASK		_AC(0xffffffff, UL)
 
+/* Offsets to entry points in kernel/head.S  */
+
+#define STARTUP_NORMAL_OFFSET	0x10000
+#define STARTUP_KDUMP_OFFSET	0x10010
+
+/* Offsets to parameters in kernel/head.S  */
+
+#define IPL_DEVICE_OFFSET	0x10400
+#define INITRD_START_OFFSET	0x10408
+#define INITRD_SIZE_OFFSET	0x10410
+#define OLDMEM_BASE_OFFSET	0x10418
+#define OLDMEM_SIZE_OFFSET	0x10420
+#define COMMAND_LINE_OFFSET	0x10480
+
 #ifndef __ASSEMBLY__
 
 #include <asm/lowcore.h>
 #include <asm/types.h>
 
-#define IPL_DEVICE        (*(unsigned long *)  (0x10400))
-#define INITRD_START      (*(unsigned long *)  (0x10408))
-#define INITRD_SIZE       (*(unsigned long *)  (0x10410))
-#define OLDMEM_BASE	  (*(unsigned long *)  (0x10418))
-#define OLDMEM_SIZE	  (*(unsigned long *)  (0x10420))
-#define COMMAND_LINE      ((char *)            (0x10480))
+#define IPL_DEVICE	(*(unsigned long *)  (IPL_DEVICE_OFFSET))
+#define INITRD_START	(*(unsigned long *)  (INITRD_START_OFFSET))
+#define INITRD_SIZE	(*(unsigned long *)  (INITRD_SIZE_OFFSET))
+#define OLDMEM_BASE	(*(unsigned long *)  (OLDMEM_BASE_OFFSET))
+#define OLDMEM_SIZE	(*(unsigned long *)  (OLDMEM_SIZE_OFFSET))
+#define COMMAND_LINE	((char *)	     (COMMAND_LINE_OFFSET))
 
 extern int memory_end_set;
 extern unsigned long memory_end;
@@ -121,12 +135,12 @@ extern void (*_machine_power_off)(void);
 
 #else /* __ASSEMBLY__ */
 
-#define IPL_DEVICE        0x10400
-#define INITRD_START      0x10408
-#define INITRD_SIZE       0x10410
-#define OLDMEM_BASE	  0x10418
-#define OLDMEM_SIZE	  0x10420
-#define COMMAND_LINE      0x10480
+#define IPL_DEVICE	(IPL_DEVICE_OFFSET)
+#define INITRD_START	(INITRD_START_OFFSET)
+#define INITRD_SIZE	(INITRD_SIZE_OFFSET)
+#define OLDMEM_BASE	(OLDMEM_BASE_OFFSET)
+#define OLDMEM_SIZE	(OLDMEM_SIZE_OFFSET)
+#define COMMAND_LINE	(COMMAND_LINE_OFFSET)
 
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_SETUP_H */

From 840798a1f52994c172270893bd2ec6013cc92e40 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Mon, 28 Aug 2017 15:32:36 +0200
Subject: [PATCH 101/288] s390/kexec_file: Add purgatory

The common code expects the architecture to have a purgatory that runs
between the two kernels. Add it now. For simplicity first skip crash
support.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kbuild                  |  1 +
 arch/s390/Kconfig                 |  4 ++
 arch/s390/include/asm/purgatory.h | 17 ++++++
 arch/s390/kernel/asm-offsets.c    |  5 ++
 arch/s390/purgatory/Makefile      | 37 ++++++++++++
 arch/s390/purgatory/head.S        | 96 +++++++++++++++++++++++++++++++
 arch/s390/purgatory/purgatory.c   | 38 ++++++++++++
 7 files changed, 198 insertions(+)
 create mode 100644 arch/s390/include/asm/purgatory.h
 create mode 100644 arch/s390/purgatory/Makefile
 create mode 100644 arch/s390/purgatory/head.S
 create mode 100644 arch/s390/purgatory/purgatory.c

diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index 9fdff3fe1a42..e63940bb57cd 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -8,3 +8,4 @@ obj-$(CONFIG_APPLDATA_BASE)	+= appldata/
 obj-y				+= net/
 obj-$(CONFIG_PCI)		+= pci/
 obj-$(CONFIG_NUMA)		+= numa/
+obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 32a0d5b958bf..5b8d0859b317 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -51,6 +51,10 @@ config KEXEC
 	def_bool y
 	select KEXEC_CORE
 
+config ARCH_HAS_KEXEC_PURGATORY
+	def_bool y
+	depends on KEXEC_FILE
+
 config AUDIT_ARCH
 	def_bool y
 
diff --git a/arch/s390/include/asm/purgatory.h b/arch/s390/include/asm/purgatory.h
new file mode 100644
index 000000000000..e297bcfc476f
--- /dev/null
+++ b/arch/s390/include/asm/purgatory.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#ifndef _S390_PURGATORY_H_
+#define _S390_PURGATORY_H_
+#ifndef __ASSEMBLY__
+
+#include <linux/purgatory.h>
+
+int verify_sha256_digest(void);
+
+#endif	/* __ASSEMBLY__ */
+#endif /* _S390_PURGATORY_H_ */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index cfe2c45c5180..eb2a5c0443cd 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -10,6 +10,7 @@
 #include <linux/kbuild.h>
 #include <linux/kvm_host.h>
 #include <linux/sched.h>
+#include <linux/purgatory.h>
 #include <asm/idle.h>
 #include <asm/vdso.h>
 #include <asm/pgtable.h>
@@ -204,5 +205,9 @@ int main(void)
 	OFFSET(__GMAP_ASCE, gmap, asce);
 	OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
 	OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20);
+	/* kexec_sha_region */
+	OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start);
+	OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len);
+	DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region));
 	return 0;
 }
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
new file mode 100644
index 000000000000..e9525bc1b4a6
--- /dev/null
+++ b/arch/s390/purgatory/Makefile
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+
+OBJECT_FILES_NON_STANDARD := y
+
+purgatory-y := head.o purgatory.o string.o sha256.o mem.o
+
+targets += $(purgatory-y) purgatory.ro kexec-purgatory.c
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+$(obj)/sha256.o: $(srctree)/lib/sha256.c
+	$(call if_changed_rule,cc_o_c)
+
+$(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S
+	$(call if_changed_rule,as_o_S)
+
+$(obj)/string.o: $(srctree)/arch/s390/lib/string.c
+	$(call if_changed_rule,cc_o_c)
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib
+LDFLAGS_purgatory.ro += -z nodefaultlib
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes
+KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
+KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
+KBUILD_CFLAGS += -c -MD -Os -m64
+KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+		$(call if_changed,ld)
+
+CMD_BIN2C = $(objtree)/scripts/basic/bin2c
+quiet_cmd_bin2c = BIN2C   $@
+      cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+	$(call if_changed,bin2c)
+
+obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
new file mode 100644
index 000000000000..8735409d0280
--- /dev/null
+++ b/arch/s390/purgatory/head.S
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Purgatory setup code
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/sigp.h>
+
+/* The purgatory is the code running between two kernels. It's main purpose
+ * is to verify that the next kernel was not corrupted after load and to
+ * start it.
+ */
+
+.macro START_NEXT_KERNEL base
+	lg	%r4,kernel_entry-\base(%r13)
+	lg	%r5,load_psw_mask-\base(%r13)
+	ogr	%r4,%r5
+	stg	%r4,0(%r0)
+
+	xgr	%r0,%r0
+	diag	%r0,%r0,0x308
+.endm
+
+.text
+.align PAGE_SIZE
+ENTRY(purgatory_start)
+	/* The purgatory might be called after a diag308 so better set
+	 * architecture and addressing mode.
+	 */
+	lhi	%r1,1
+	sigp	%r1,%r0,SIGP_SET_ARCHITECTURE
+	sam64
+
+	larl	%r5,gprregs
+	stmg	%r6,%r15,0(%r5)
+
+	basr	%r13,0
+.base_crash:
+
+	/* Setup stack */
+	larl	%r15,purgatory_end
+	aghi	%r15,-160
+
+.do_checksum_verification:
+	brasl	%r14,verify_sha256_digest
+
+	cghi	%r2,0		/* checksum match */
+	jne	.disabled_wait
+
+	/* start normal kernel */
+	START_NEXT_KERNEL .base_crash
+
+.disabled_wait:
+	lpswe	disabled_wait_psw-.base_crash(%r13)
+
+
+load_psw_mask:
+	.long	0x00080000,0x80000000
+
+	.align	8
+disabled_wait_psw:
+	.quad	0x0002000180000000
+	.quad	0x0000000000000000 + .do_checksum_verification
+
+gprregs:
+	.rept	10
+	.quad	0
+	.endr
+
+purgatory_sha256_digest:
+	.global purgatory_sha256_digest
+	.rept	32	/* SHA256_DIGEST_SIZE */
+	.byte	0
+	.endr
+
+purgatory_sha_regions:
+	.global purgatory_sha_regions
+	.rept	16 * __KEXEC_SHA_REGION_SIZE	/* KEXEC_SEGMENTS_MAX */
+	.byte	0
+	.endr
+
+kernel_entry:
+	.global kernel_entry
+	.quad	0
+
+	.align	PAGE_SIZE
+stack:
+	.skip	PAGE_SIZE
+	.align	PAGE_SIZE
+purgatory_end:
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
new file mode 100644
index 000000000000..52b92f2bf0b9
--- /dev/null
+++ b/arch/s390/purgatory/purgatory.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Purgatory code running between two kernels.
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#include <linux/kexec.h>
+#include <linux/sha256.h>
+#include <linux/string.h>
+#include <asm/purgatory.h>
+
+struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX];
+u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE];
+
+u64 kernel_entry;
+
+int verify_sha256_digest(void)
+{
+	struct kexec_sha_region *ptr, *end;
+	u8 digest[SHA256_DIGEST_SIZE];
+	struct sha256_state sctx;
+
+	sha256_init(&sctx);
+	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
+
+	for (ptr = purgatory_sha_regions; ptr < end; ptr++)
+		sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+	sha256_final(&sctx, digest);
+
+	if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)))
+		return 1;
+
+	return 0;
+}

From 71406883fd35794d573b3085433c41d0a3bf6c21 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Mon, 19 Jun 2017 10:45:33 +0200
Subject: [PATCH 102/288] s390/kexec_file: Add kexec_file_load system call

This patch adds the kexec_file_load system call to s390 as well as the arch
specific functions common code requires to work. Loaders for the different
file types will be added later.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                     |  13 +++
 arch/s390/configs/default_defconfig   |   1 +
 arch/s390/kernel/Makefile             |   2 +
 arch/s390/kernel/compat_wrapper.c     |   1 +
 arch/s390/kernel/machine_kexec_file.c | 126 ++++++++++++++++++++++++++
 arch/s390/kernel/syscalls/syscall.tbl |   1 +
 6 files changed, 144 insertions(+)
 create mode 100644 arch/s390/kernel/machine_kexec_file.c

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5b8d0859b317..3223ce0680c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -51,6 +51,19 @@ config KEXEC
 	def_bool y
 	select KEXEC_CORE
 
+config KEXEC_FILE
+	bool "kexec file based system call"
+	select KEXEC_CORE
+	select BUILD_BIN2C
+	depends on CRYPTO
+	depends on CRYPTO_SHA256
+	depends on CRYPTO_SHA256_S390
+	---help---
+	  This is new version of kexec system call. This system call is
+	  file based and takes file descriptors as system call argument
+	  for kernel and initramfs as opposed to list of segments as
+	  accepted by previous system call.
+
 config ARCH_HAS_KEXEC_PURGATORY
 	def_bool y
 	depends on KEXEC_FILE
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index 5af8458951cf..2310315b0744 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -719,3 +719,4 @@ CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_KVM_S390_UCONTROL=y
 CONFIG_VHOST_NET=m
+CONFIG_KEXEC_FILE=y
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index b06a6f79c1ec..35bec1ab84e3 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -82,6 +82,8 @@ obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 
+obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o
+
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
 
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index 11e9d8b5c1b0..607c5e9fba3d 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -182,3 +182,4 @@ COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int,
 COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb);
 COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer);
 COMPAT_SYSCALL_WRAP4(s390_sthyi, unsigned long, code, void __user *, info, u64 __user *, rc, unsigned long, flags);
+COMPAT_SYSCALL_WRAP5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags)
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
new file mode 100644
index 000000000000..d9b4f9d23e9f
--- /dev/null
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 code for kexec_file_load system call
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <asm/setup.h>
+
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+	NULL,
+};
+
+/*
+ * The kernel is loaded to a fixed location. Turn off kexec_locate_mem_hole
+ * and provide kbuf->mem by hand.
+ */
+int arch_kexec_walk_mem(struct kexec_buf *kbuf,
+			int (*func)(struct resource *, void *))
+{
+	return 1;
+}
+
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+				     Elf_Shdr *section,
+				     const Elf_Shdr *relsec,
+				     const Elf_Shdr *symtab)
+{
+	Elf_Rela *relas;
+	int i;
+
+	relas = (void *)pi->ehdr + relsec->sh_offset;
+
+	for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) {
+		const Elf_Sym *sym;	/* symbol to relocate */
+		unsigned long addr;	/* final location after relocation */
+		unsigned long val;	/* relocated symbol value */
+		void *loc;		/* tmp location to modify */
+
+		sym = (void *)pi->ehdr + symtab->sh_offset;
+		sym += ELF64_R_SYM(relas[i].r_info);
+
+		if (sym->st_shndx == SHN_UNDEF)
+			return -ENOEXEC;
+
+		if (sym->st_shndx == SHN_COMMON)
+			return -ENOEXEC;
+
+		if (sym->st_shndx >= pi->ehdr->e_shnum &&
+		    sym->st_shndx != SHN_ABS)
+			return -ENOEXEC;
+
+		loc = pi->purgatory_buf;
+		loc += section->sh_offset;
+		loc += relas[i].r_offset;
+
+		val = sym->st_value;
+		if (sym->st_shndx != SHN_ABS)
+			val += pi->sechdrs[sym->st_shndx].sh_addr;
+		val += relas[i].r_addend;
+
+		addr = section->sh_addr + relas[i].r_offset;
+
+		switch (ELF64_R_TYPE(relas[i].r_info)) {
+		case R_390_8:		/* Direct 8 bit.   */
+			*(u8 *)loc = val;
+			break;
+		case R_390_12:		/* Direct 12 bit.  */
+			*(u16 *)loc &= 0xf000;
+			*(u16 *)loc |= val & 0xfff;
+			break;
+		case R_390_16:		/* Direct 16 bit.  */
+			*(u16 *)loc = val;
+			break;
+		case R_390_20:		/* Direct 20 bit.  */
+			*(u32 *)loc &= 0xf00000ff;
+			*(u32 *)loc |= (val & 0xfff) << 16;	/* DL */
+			*(u32 *)loc |= (val & 0xff000) >> 4;	/* DH */
+			break;
+		case R_390_32:		/* Direct 32 bit.  */
+			*(u32 *)loc = val;
+			break;
+		case R_390_64:		/* Direct 64 bit.  */
+			*(u64 *)loc = val;
+			break;
+		case R_390_PC16:	/* PC relative 16 bit.	*/
+			*(u16 *)loc = (val - addr);
+			break;
+		case R_390_PC16DBL:	/* PC relative 16 bit shifted by 1.  */
+			*(u16 *)loc = (val - addr) >> 1;
+			break;
+		case R_390_PC32DBL:	/* PC relative 32 bit shifted by 1.  */
+			*(u32 *)loc = (val - addr) >> 1;
+			break;
+		case R_390_PC32:	/* PC relative 32 bit.	*/
+			*(u32 *)loc = (val - addr);
+			break;
+		case R_390_PC64:	/* PC relative 64 bit.	*/
+			*(u64 *)loc = (val - addr);
+			break;
+		default:
+			break;
+		}
+	}
+	return 0;
+}
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+				  unsigned long buf_len)
+{
+	/* A kernel must be at least large enough to contain head.S. During
+	 * load memory in head.S will be accessed, e.g. to register the next
+	 * command line. If the next kernel were smaller the current kernel
+	 * will panic at load.
+	 *
+	 * 0x11000 = sizeof(head.S)
+	 */
+	if (buf_len < 0x11000)
+		return -ENOEXEC;
+
+	return kexec_image_probe_default(image, buf, buf_len);
+}
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index b38d48464368..8b210ead7956 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -388,3 +388,4 @@
 378  common	s390_guarded_storage	sys_s390_guarded_storage	compat_sys_s390_guarded_storage
 379  common	statx			sys_statx			compat_sys_statx
 380  common	s390_sthyi		sys_s390_sthyi			compat_sys_s390_sthyi
+381  common	kexec_file_load		sys_kexec_file_load		compat_sys_kexec_file_load

From e49bb0a27fa3c6ec45cc13e2102a6ec13c4ae697 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Wed, 30 Aug 2017 14:03:38 +0200
Subject: [PATCH 103/288] s390/kexec_file: Add image loader

Add an image loader for kexec_file_load. For simplicity first skip crash
support. The functions defined in machine_kexec_file will later be shared
with the ELF loader.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/kexec.h         | 22 ++++++++
 arch/s390/kernel/Makefile             |  2 +-
 arch/s390/kernel/kexec_image.c        | 78 +++++++++++++++++++++++++++
 arch/s390/kernel/machine_kexec_file.c | 75 ++++++++++++++++++++++++++
 4 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/kernel/kexec_image.c

diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index 1d708a419326..4c9da9974cf4 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -46,4 +46,26 @@
 static inline void crash_setup_regs(struct pt_regs *newregs,
 					struct pt_regs *oldregs) { }
 
+struct kimage;
+struct s390_load_data {
+	/* Pointer to the kernel buffer. Used to register cmdline etc.. */
+	void *kernel_buf;
+
+	/* Total size of loaded segments in memory. Used as an offset. */
+	size_t memsz;
+
+	/* Load address of initrd. Used to register INITRD_START in kernel. */
+	unsigned long initrd_load_addr;
+};
+
+int kexec_file_add_purgatory(struct kimage *image,
+			     struct s390_load_data *data);
+int kexec_file_add_initrd(struct kimage *image,
+			  struct s390_load_data *data,
+			  char *initrd, unsigned long initrd_len);
+int *kexec_file_update_kernel(struct kimage *iamge,
+			      struct s390_load_data *data);
+
+extern const struct kexec_file_ops s390_kexec_image_ops;
+
 #endif /*_S390_KEXEC_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 35bec1ab84e3..a84e9611c5c7 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 
-obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o
+obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o kexec_image.o
 
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c
new file mode 100644
index 000000000000..7f5021e6c242
--- /dev/null
+++ b/arch/s390/kernel/kexec_image.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Image loader for kexec_file_load system call.
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <asm/setup.h>
+
+static int kexec_file_add_image_kernel(struct kimage *image,
+				       struct s390_load_data *data,
+				       char *kernel, unsigned long kernel_len)
+{
+	struct kexec_buf buf;
+	int ret;
+
+	buf.image = image;
+
+	buf.buffer = kernel + STARTUP_NORMAL_OFFSET;
+	buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET;
+
+	buf.mem = STARTUP_NORMAL_OFFSET;
+	buf.memsz = buf.bufsz;
+
+	ret = kexec_add_buffer(&buf);
+
+	data->kernel_buf = kernel;
+	data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET;
+
+	return ret;
+}
+
+static void *s390_image_load(struct kimage *image,
+			     char *kernel, unsigned long kernel_len,
+			     char *initrd, unsigned long initrd_len,
+			     char *cmdline, unsigned long cmdline_len)
+{
+	struct s390_load_data data = {0};
+	int ret;
+
+	/* We don't support crash kernels yet. */
+	if (image->type == KEXEC_TYPE_CRASH)
+		return ERR_PTR(-ENOTSUPP);
+
+	ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (initrd) {
+		ret = kexec_file_add_initrd(image, &data, initrd, initrd_len);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	ret = kexec_file_add_purgatory(image, &data);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return kexec_file_update_kernel(image, &data);
+}
+
+static int s390_image_probe(const char *buf, unsigned long len)
+{
+	/* Can't reliably tell if an image is valid.  Therefore give the
+	 * user whatever he wants.
+	 */
+	return 0;
+}
+
+const struct kexec_file_ops s390_kexec_image_ops = {
+	.probe = s390_image_probe,
+	.load = s390_image_load,
+};
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index d9b4f9d23e9f..2a2ceece77b0 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -12,9 +12,84 @@
 #include <asm/setup.h>
 
 const struct kexec_file_ops * const kexec_file_loaders[] = {
+	&s390_kexec_image_ops,
 	NULL,
 };
 
+int *kexec_file_update_kernel(struct kimage *image,
+			      struct s390_load_data *data)
+{
+	unsigned long *loc;
+
+	if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	if (image->cmdline_buf_len)
+		memcpy(data->kernel_buf + COMMAND_LINE_OFFSET,
+		       image->cmdline_buf, image->cmdline_buf_len);
+
+	if (image->initrd_buf) {
+		loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET);
+		*loc = data->initrd_load_addr;
+
+		loc = (unsigned long *)(data->kernel_buf + INITRD_SIZE_OFFSET);
+		*loc = image->initrd_buf_len;
+	}
+
+	return NULL;
+}
+
+static int kexec_file_update_purgatory(struct kimage *image)
+{
+	u64 entry, type;
+	int ret;
+
+	entry = STARTUP_NORMAL_OFFSET;
+	ret = kexec_purgatory_get_set_symbol(image, "kernel_entry", &entry,
+					     sizeof(entry), false);
+	return ret;
+}
+
+int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data)
+{
+	struct kexec_buf buf;
+	int ret;
+
+	buf.image = image;
+
+	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
+	buf.mem = data->memsz;
+
+	ret = kexec_load_purgatory(image, &buf);
+	if (ret)
+		return ret;
+
+	ret = kexec_file_update_purgatory(image);
+	return ret;
+}
+
+int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data,
+			  char *initrd, unsigned long initrd_len)
+{
+	struct kexec_buf buf;
+	int ret;
+
+	buf.image = image;
+
+	buf.buffer = initrd;
+	buf.bufsz = initrd_len;
+
+	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
+	buf.mem = data->memsz;
+	buf.memsz = buf.bufsz;
+
+	data->initrd_load_addr = buf.mem;
+	data->memsz += buf.memsz;
+
+	ret = kexec_add_buffer(&buf);
+	return ret;
+}
+
 /*
  * The kernel is loaded to a fixed location. Turn off kexec_locate_mem_hole
  * and provide kbuf->mem by hand.

From ee337f5469fd67f22d231e520ec4189ce0589d92 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Tue, 5 Sep 2017 11:55:23 +0200
Subject: [PATCH 104/288] s390/kexec_file: Add crash support to image loader

Add support to load a crash kernel to the image loader. This requires
extending the purgatory.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/kexec_image.c        |   6 +-
 arch/s390/kernel/machine_kexec_file.c |  45 ++++++-
 arch/s390/purgatory/head.S            | 185 +++++++++++++++++++++++++-
 arch/s390/purgatory/purgatory.c       |   4 +
 4 files changed, 234 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c
index 7f5021e6c242..3800852595e8 100644
--- a/arch/s390/kernel/kexec_image.c
+++ b/arch/s390/kernel/kexec_image.c
@@ -25,6 +25,8 @@ static int kexec_file_add_image_kernel(struct kimage *image,
 	buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET;
 
 	buf.mem = STARTUP_NORMAL_OFFSET;
+	if (image->type == KEXEC_TYPE_CRASH)
+		buf.mem += crashk_res.start;
 	buf.memsz = buf.bufsz;
 
 	ret = kexec_add_buffer(&buf);
@@ -43,10 +45,6 @@ static void *s390_image_load(struct kimage *image,
 	struct s390_load_data data = {0};
 	int ret;
 
-	/* We don't support crash kernels yet. */
-	if (image->type == KEXEC_TYPE_CRASH)
-		return ERR_PTR(-ENOTSUPP);
-
 	ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len);
 	if (ret)
 		return ERR_PTR(ret);
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 2a2ceece77b0..75aea2c1d823 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -28,6 +28,14 @@ int *kexec_file_update_kernel(struct kimage *image,
 		memcpy(data->kernel_buf + COMMAND_LINE_OFFSET,
 		       image->cmdline_buf, image->cmdline_buf_len);
 
+	if (image->type == KEXEC_TYPE_CRASH) {
+		loc = (unsigned long *)(data->kernel_buf + OLDMEM_BASE_OFFSET);
+		*loc = crashk_res.start;
+
+		loc = (unsigned long *)(data->kernel_buf + OLDMEM_SIZE_OFFSET);
+		*loc = crashk_res.end - crashk_res.start + 1;
+	}
+
 	if (image->initrd_buf) {
 		loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET);
 		*loc = data->initrd_load_addr;
@@ -44,9 +52,40 @@ static int kexec_file_update_purgatory(struct kimage *image)
 	u64 entry, type;
 	int ret;
 
-	entry = STARTUP_NORMAL_OFFSET;
+	if (image->type == KEXEC_TYPE_CRASH) {
+		entry = STARTUP_KDUMP_OFFSET;
+		type = KEXEC_TYPE_CRASH;
+	} else {
+		entry = STARTUP_NORMAL_OFFSET;
+		type = KEXEC_TYPE_DEFAULT;
+	}
+
 	ret = kexec_purgatory_get_set_symbol(image, "kernel_entry", &entry,
 					     sizeof(entry), false);
+	if (ret)
+		return ret;
+
+	ret = kexec_purgatory_get_set_symbol(image, "kernel_type", &type,
+					     sizeof(type), false);
+	if (ret)
+		return ret;
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		u64 crash_size;
+
+		ret = kexec_purgatory_get_set_symbol(image, "crash_start",
+						     &crashk_res.start,
+						     sizeof(crashk_res.start),
+						     false);
+		if (ret)
+			return ret;
+
+		crash_size = crashk_res.end - crashk_res.start + 1;
+		ret = kexec_purgatory_get_set_symbol(image, "crash_size",
+						     &crash_size,
+						     sizeof(crash_size),
+						     false);
+	}
 	return ret;
 }
 
@@ -59,6 +98,8 @@ int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data)
 
 	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
 	buf.mem = data->memsz;
+	if (image->type == KEXEC_TYPE_CRASH)
+		buf.mem += crashk_res.start;
 
 	ret = kexec_load_purgatory(image, &buf);
 	if (ret)
@@ -81,6 +122,8 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data,
 
 	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
 	buf.mem = data->memsz;
+	if (image->type == KEXEC_TYPE_CRASH)
+		buf.mem += crashk_res.start;
 	buf.memsz = buf.bufsz;
 
 	data->initrd_load_addr = buf.mem;
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 8735409d0280..660c96a05a9b 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -15,8 +15,52 @@
 /* The purgatory is the code running between two kernels. It's main purpose
  * is to verify that the next kernel was not corrupted after load and to
  * start it.
+ *
+ * If the next kernel is a crash kernel there are some peculiarities to
+ * consider:
+ *
+ * First the purgatory is called twice. Once only to verify the
+ * sha digest. So if the crash kernel got corrupted the old kernel can try
+ * to trigger a stand-alone dumper. And once to actually load the crash kernel.
+ *
+ * Second the purgatory also has to swap the crash memory region with its
+ * destination at address 0. As the purgatory is part of crash memory this
+ * requires some finesse. The tactic here is that the purgatory first copies
+ * itself to the end of the destination and then swaps the rest of the
+ * memory running from there.
  */
 
+#define bufsz purgatory_end-stack
+
+.macro MEMCPY dst,src,len
+	lgr	%r0,\dst
+	lgr	%r1,\len
+	lgr	%r2,\src
+	lgr	%r3,\len
+
+20:	mvcle	%r0,%r2,0
+	jo	20b
+.endm
+
+.macro MEMSWAP dst,src,buf,len
+10:	cghi	\len,bufsz
+	jh	11f
+	lgr	%r4,\len
+	j	12f
+11:	lghi	%r4,bufsz
+
+12:	MEMCPY	\buf,\dst,%r4
+	MEMCPY	\dst,\src,%r4
+	MEMCPY	\src,\buf,%r4
+
+	agr	\dst,%r4
+	agr	\src,%r4
+	sgr	\len,%r4
+
+	cghi	\len,0
+	jh	10b
+.endm
+
 .macro START_NEXT_KERNEL base
 	lg	%r4,kernel_entry-\base(%r13)
 	lg	%r5,load_psw_mask-\base(%r13)
@@ -47,18 +91,144 @@ ENTRY(purgatory_start)
 	larl	%r15,purgatory_end
 	aghi	%r15,-160
 
+	/* If the next kernel is KEXEC_TYPE_CRASH the purgatory is called
+	 * directly with a flag passed in %r2 whether the purgatory shall do
+	 * checksum verification only (%r2 = 0 -> verification only).
+	 *
+	 * Check now and preserve over C function call by storing in
+	 * %r10 whith
+	 *	1 -> checksum verification only
+	 *	0 -> load new kernel
+	 */
+	lghi	%r10,0
+	lg	%r11,kernel_type-.base_crash(%r13)
+	cghi	%r11,1		/* KEXEC_TYPE_CRASH */
+	jne	.do_checksum_verification
+	cghi	%r2,0		/* checksum verification only */
+	jne	.do_checksum_verification
+	lghi	%r10,1
+
 .do_checksum_verification:
 	brasl	%r14,verify_sha256_digest
 
+	cghi	%r10,1		/* checksum verification only */
+	je	.return_old_kernel
 	cghi	%r2,0		/* checksum match */
 	jne	.disabled_wait
 
+	/* If the next kernel is a crash kernel the purgatory has to swap
+	 * the mem regions first.
+	 */
+	cghi	%r11,1 /* KEXEC_TYPE_CRASH */
+	je	.start_crash_kernel
+
 	/* start normal kernel */
 	START_NEXT_KERNEL .base_crash
 
+.return_old_kernel:
+	lmg	%r6,%r15,gprregs-.base_crash(%r13)
+	br	%r14
+
 .disabled_wait:
 	lpswe	disabled_wait_psw-.base_crash(%r13)
 
+.start_crash_kernel:
+	/* Location of purgatory_start in crash memory */
+	lgr	%r8,%r13
+	aghi	%r8,-(.base_crash-purgatory_start)
+
+	/* Destination for this code i.e. end of memory to be swapped. */
+	lg	%r9,crash_size-.base_crash(%r13)
+	aghi	%r9,-(purgatory_end-purgatory_start)
+
+	/* Destination in crash memory, i.e. same as r9 but in crash memory. */
+	lg	%r10,crash_start-.base_crash(%r13)
+	agr	%r10,%r9
+
+	/* Buffer location (in crash memory) and size. As the purgatory is
+	 * behind the point of no return it can re-use the stack as buffer.
+	 */
+	lghi	%r11,bufsz
+	larl	%r12,stack
+
+	MEMCPY	%r12,%r9,%r11	/* dst	-> (crash) buf */
+	MEMCPY	%r9,%r8,%r11	/* self -> dst */
+
+	/* Jump to new location. */
+	lgr	%r7,%r9
+	aghi	%r7,.jump_to_dst-purgatory_start
+	br	%r7
+
+.jump_to_dst:
+	basr	%r13,0
+.base_dst:
+
+	/* clear buffer */
+	MEMCPY	%r12,%r10,%r11	/* (crash) buf -> (crash) dst */
+
+	/* Load new buffer location after jump */
+	larl	%r7,stack
+	aghi	%r10,stack-purgatory_start
+	MEMCPY	%r10,%r7,%r11	/* (new) buf -> (crash) buf */
+
+	/* Now the code is set up to run from its designated location. Start
+	 * swapping the rest of crash memory now.
+	 *
+	 * The registers will be used as follow:
+	 *
+	 *	%r0-%r4	reserved for macros defined above
+	 *	%r5-%r6 tmp registers
+	 *	%r7	pointer to current struct sha region
+	 *	%r8	index to iterate over all sha regions
+	 *	%r9	pointer in crash memory
+	 *	%r10	pointer in old kernel
+	 *	%r11	total size (still) to be moved
+	 *	%r12	pointer to buffer
+	 */
+	lgr	%r12,%r7
+	lgr	%r11,%r9
+	lghi	%r10,0
+	lg	%r9,crash_start-.base_dst(%r13)
+	lghi	%r8,16	/* KEXEC_SEGMENTS_MAX */
+	larl	%r7,purgatory_sha_regions
+
+	j .loop_first
+
+	/* Loop over all purgatory_sha_regions. */
+.loop_next:
+	aghi	%r8,-1
+	cghi	%r8,0
+	je	.loop_out
+
+	aghi	%r7,__KEXEC_SHA_REGION_SIZE
+
+.loop_first:
+	lg	%r5,__KEXEC_SHA_REGION_START(%r7)
+	cghi	%r5,0
+	je	.loop_next
+
+	/* Copy [end last sha region, start current sha region) */
+	/* Note: kexec_sha_region->start points in crash memory */
+	sgr	%r5,%r9
+	MEMCPY	%r9,%r10,%r5
+
+	agr	%r9,%r5
+	agr	%r10,%r5
+	sgr	%r11,%r5
+
+	/* Swap sha region */
+	lg	%r6,__KEXEC_SHA_REGION_LEN(%r7)
+	MEMSWAP	%r9,%r10,%r12,%r6
+	sg	%r11,__KEXEC_SHA_REGION_LEN(%r7)
+	j	.loop_next
+
+.loop_out:
+	/* Copy rest of crash memory */
+	MEMCPY	%r9,%r10,%r11
+
+	/* start crash kernel */
+	START_NEXT_KERNEL .base_dst
+
 
 load_psw_mask:
 	.long	0x00080000,0x80000000
@@ -89,8 +259,21 @@ kernel_entry:
 	.global kernel_entry
 	.quad	0
 
+kernel_type:
+	.global kernel_type
+	.quad	0
+
+crash_start:
+	.global crash_start
+	.quad	0
+
+crash_size:
+	.global crash_size
+	.quad	0
+
 	.align	PAGE_SIZE
 stack:
-	.skip	PAGE_SIZE
+	/* The buffer to move this code must be as big as the code. */
+	.skip	stack-purgatory_start
 	.align	PAGE_SIZE
 purgatory_end:
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 52b92f2bf0b9..4e2beb3c29b7 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -16,6 +16,10 @@ struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX];
 u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE];
 
 u64 kernel_entry;
+u64 kernel_type;
+
+u64 crash_start;
+u64 crash_size;
 
 int verify_sha256_digest(void)
 {

From 8be018827154666d1fe5904cb7a43b6706e01c87 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Mon, 11 Sep 2017 15:15:29 +0200
Subject: [PATCH 105/288] s390/kexec_file: Add ELF loader

Add an ELF loader for kexec_file. The main task here is to do proper sanity
checks on the ELF file. Basically all other functionality was already
implemented for the image loader.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/kexec.h         |   1 +
 arch/s390/kernel/Makefile             |   1 +
 arch/s390/kernel/kexec_elf.c          | 147 ++++++++++++++++++++++++++
 arch/s390/kernel/machine_kexec_file.c |   1 +
 4 files changed, 150 insertions(+)
 create mode 100644 arch/s390/kernel/kexec_elf.c

diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index 4c9da9974cf4..825dd0f7f221 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -67,5 +67,6 @@ int *kexec_file_update_kernel(struct kimage *iamge,
 			      struct s390_load_data *data);
 
 extern const struct kexec_file_ops s390_kexec_image_ops;
+extern const struct kexec_file_ops s390_kexec_elf_ops;
 
 #endif /*_S390_KEXEC_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index a84e9611c5c7..84ea6225efb4 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 
 obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o kexec_image.o
+obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o
 
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c
new file mode 100644
index 000000000000..5a286b012043
--- /dev/null
+++ b/arch/s390/kernel/kexec_elf.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ELF loader for kexec_file_load system call.
+ *
+ * Copyright IBM Corp. 2018
+ *
+ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <asm/setup.h>
+
+static int kexec_file_add_elf_kernel(struct kimage *image,
+				     struct s390_load_data *data,
+				     char *kernel, unsigned long kernel_len)
+{
+	struct kexec_buf buf;
+	const Elf_Ehdr *ehdr;
+	const Elf_Phdr *phdr;
+	int i, ret;
+
+	ehdr = (Elf_Ehdr *)kernel;
+	buf.image = image;
+
+	phdr = (void *)ehdr + ehdr->e_phoff;
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		buf.buffer = kernel + phdr->p_offset;
+		buf.bufsz = phdr->p_filesz;
+
+		buf.mem = ALIGN(phdr->p_paddr, phdr->p_align);
+		buf.memsz = phdr->p_memsz;
+
+		if (phdr->p_paddr == 0) {
+			data->kernel_buf = buf.buffer;
+			data->memsz += STARTUP_NORMAL_OFFSET;
+
+			buf.buffer += STARTUP_NORMAL_OFFSET;
+			buf.bufsz -= STARTUP_NORMAL_OFFSET;
+
+			buf.mem += STARTUP_NORMAL_OFFSET;
+			buf.memsz -= STARTUP_NORMAL_OFFSET;
+		}
+
+		if (image->type == KEXEC_TYPE_CRASH)
+			buf.mem += crashk_res.start;
+
+		ret = kexec_add_buffer(&buf);
+		if (ret)
+			return ret;
+
+		data->memsz += buf.memsz;
+	}
+
+	return 0;
+}
+
+static void *s390_elf_load(struct kimage *image,
+			   char *kernel, unsigned long kernel_len,
+			   char *initrd, unsigned long initrd_len,
+			   char *cmdline, unsigned long cmdline_len)
+{
+	struct s390_load_data data = {0};
+	const Elf_Ehdr *ehdr;
+	const Elf_Phdr *phdr;
+	size_t size;
+	int i, ret;
+
+	/* image->fobs->probe already checked for valid ELF magic number. */
+	ehdr = (Elf_Ehdr *)kernel;
+
+	if (ehdr->e_type != ET_EXEC ||
+	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
+	    !elf_check_arch(ehdr))
+		return ERR_PTR(-EINVAL);
+
+	if (!ehdr->e_phnum || ehdr->e_phentsize != sizeof(Elf_Phdr))
+		return ERR_PTR(-EINVAL);
+
+	size = ehdr->e_ehsize + ehdr->e_phoff;
+	size += ehdr->e_phentsize * ehdr->e_phnum;
+	if (size > kernel_len)
+		return ERR_PTR(-EINVAL);
+
+	phdr = (void *)ehdr + ehdr->e_phoff;
+	size = ALIGN(size, phdr->p_align);
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (phdr->p_type == PT_INTERP)
+			return ERR_PTR(-EINVAL);
+
+		if (phdr->p_offset > kernel_len)
+			return ERR_PTR(-EINVAL);
+
+		size += ALIGN(phdr->p_filesz, phdr->p_align);
+	}
+
+	if (size > kernel_len)
+		return ERR_PTR(-EINVAL);
+
+	ret = kexec_file_add_elf_kernel(image, &data, kernel, kernel_len);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!data.memsz)
+		return ERR_PTR(-EINVAL);
+
+	if (initrd) {
+		ret = kexec_file_add_initrd(image, &data, initrd, initrd_len);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	ret = kexec_file_add_purgatory(image, &data);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return kexec_file_update_kernel(image, &data);
+}
+
+static int s390_elf_probe(const char *buf, unsigned long len)
+{
+	const Elf_Ehdr *ehdr;
+
+	if (len < sizeof(Elf_Ehdr))
+		return -ENOEXEC;
+
+	ehdr = (Elf_Ehdr *)buf;
+
+	/* Only check the ELF magic number here and do proper validity check
+	 * in the loader. Any check here that fails would send the erroneous
+	 * ELF file to the image loader that does not care what it gets.
+	 * (Most likely) causing behavior not intended by the user.
+	 */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		return -ENOEXEC;
+
+	return 0;
+}
+
+const struct kexec_file_ops s390_kexec_elf_ops = {
+	.probe = s390_elf_probe,
+	.load = s390_elf_load,
+};
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 75aea2c1d823..f413f57f8d20 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -12,6 +12,7 @@
 #include <asm/setup.h>
 
 const struct kexec_file_ops * const kexec_file_loaders[] = {
+	&s390_kexec_elf_ops,
 	&s390_kexec_image_ops,
 	NULL,
 };

From bdea9f6f7a707301878573a5c35e39e4fe817378 Mon Sep 17 00:00:00 2001
From: Philipp Rudo <prudo@linux.vnet.ibm.com>
Date: Tue, 27 Mar 2018 13:14:12 +0200
Subject: [PATCH 106/288] s390/Kconfig: Move kexec config options to "Processor
 type and features"

The config options for kexec are currently not under any menu directory. Up
until now this was not a problem as standard kexec is always compiled in
and thus does not create a menu entry. This changed when kexec_file_load
was enabled. Its config option requires a menu entry which, when added
beneath standard kexec option, appears on the main directory above "General
Setup". Thus move the whole block further down such that the entry in now
in "Processor type and features".

While at it also update the help text for kexec file.

Signed-off-by: Philipp Rudo <prudo@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 3223ce0680c0..beccb58a82e5 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -47,27 +47,6 @@ config PGSTE
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
-config KEXEC
-	def_bool y
-	select KEXEC_CORE
-
-config KEXEC_FILE
-	bool "kexec file based system call"
-	select KEXEC_CORE
-	select BUILD_BIN2C
-	depends on CRYPTO
-	depends on CRYPTO_SHA256
-	depends on CRYPTO_SHA256_S390
-	---help---
-	  This is new version of kexec system call. This system call is
-	  file based and takes file descriptors as system call argument
-	  for kernel and initramfs as opposed to list of segments as
-	  accepted by previous system call.
-
-config ARCH_HAS_KEXEC_PURGATORY
-	def_bool y
-	depends on KEXEC_FILE
-
 config AUDIT_ARCH
 	def_bool y
 
@@ -542,6 +521,26 @@ source kernel/Kconfig.preempt
 
 source kernel/Kconfig.hz
 
+config KEXEC
+	def_bool y
+	select KEXEC_CORE
+
+config KEXEC_FILE
+	bool "kexec file based system call"
+	select KEXEC_CORE
+	select BUILD_BIN2C
+	depends on CRYPTO
+	depends on CRYPTO_SHA256
+	depends on CRYPTO_SHA256_S390
+	help
+	  Enable the kexec file based system call. In contrast to the normal
+	  kexec system call this system call takes file descriptors for the
+	  kernel and initramfs as arguments.
+
+config ARCH_HAS_KEXEC_PURGATORY
+	def_bool y
+	depends on KEXEC_FILE
+
 config ARCH_RANDOM
 	def_bool y
 	prompt "s390 architectural random number generation API"

From de66b2429100c85b72db5c409526351d3ffc5faa Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Apr 2018 13:45:52 +0200
Subject: [PATCH 107/288] s390/kexec_file: add generated files to .gitignore

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/purgatory/.gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 arch/s390/purgatory/.gitignore

diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore
new file mode 100644
index 000000000000..e9e66f178a6d
--- /dev/null
+++ b/arch/s390/purgatory/.gitignore
@@ -0,0 +1,2 @@
+kexec-purgatory.c
+purgatory.ro

From 701e188c6560d6abeba508f530c4224b4e830fb5 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Thu, 12 Apr 2018 08:42:48 +0100
Subject: [PATCH 108/288] s390/decompressor: Ignore file vmlinux.bin.full

Commit 81796a3c6a4a ("s390/decompressor: trim uncompressed
image head during the build") introduced a new
file named vmlinux.bin.full in directory
arch/s390/boot/compressed.

Add this file to the list of ignored files so it does
not show up on git status.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/boot/compressed/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore
index ae06b9b4c02f..2088cc140629 100644
--- a/arch/s390/boot/compressed/.gitignore
+++ b/arch/s390/boot/compressed/.gitignore
@@ -1,3 +1,4 @@
 sizes.h
 vmlinux
 vmlinux.lds
+vmlinux.bin.full

From 232acdff21fb02f0ccd538cd29c9ee7e028b6101 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.ibm.com>
Date: Tue, 10 Apr 2018 12:39:34 +0200
Subject: [PATCH 109/288] s390/nospec: include cpu.h

Fix the following sparse warnings:
symbol 'cpu_show_spectre_v1' was not declared. Should it be static?
symbol 'cpu_show_spectre_v2' was not declared. Should it be static?

Signed-off-by: Sebastian Ott <sebott@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/nospec-branch.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index f236ce8757e8..46d49a11663f 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/cpu.h>
 #include <asm/nospec-branch.h>
 
 static int __init nobp_setup_early(char *str)

From c65bbb51c6e98a1956c08faab81941ec558ef0ba Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Wed, 11 Apr 2018 10:24:29 +0200
Subject: [PATCH 110/288] s390/boot: remove unused COMPILE_VERSION and
 ccflags-y

ccflags-y has no effect (no code is built in that directory,
arch/s390/boot/compressed/Makefile defines its own KBUILD_CFLAGS).
Removing ccflags-y together with COMPILE_VERSION.

Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/boot/Makefile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index da9dad35c28e..d1fa37fcce83 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -3,12 +3,6 @@
 # Makefile for the linux s390-specific parts of the memory manager.
 #
 
-COMPILE_VERSION := __linux_compile_version_id__`hostname |  \
-			tr -c '[0-9A-Za-z]' '_'`__`date | \
-			tr -c '[0-9A-Za-z]' '_'`_t
-
-ccflags-y  := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I.
-
 targets := image
 targets += bzImage
 subdir- := compressed

From f43c426a581f04272a852f0486ae431acff6d87e Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Fri, 13 Apr 2018 10:57:27 +0200
Subject: [PATCH 111/288] s390: remove couple of duplicate includes

Removing couple of duplicate includes, found by "make includecheck".
That leaves 1 duplicate include in arch/s390/kernel/entry.S, which is
there for a reason (it includes generated asm/syscall_table.h twice).

Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd_diag.c  | 1 -
 drivers/s390/net/qeth_l2_main.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index f035c2f25d35..131f1989f6f3 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -27,7 +27,6 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/vtoc.h>
-#include <asm/diag.h>
 
 #include "dasd_int.h"
 #include "dasd_diag.h"
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 50a313806dde..2ad6f12f3d49 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -21,7 +21,6 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/hashtable.h>
-#include <linux/string.h>
 #include <asm/setup.h>
 #include "qeth_core.h"
 #include "qeth_l2.h"

From 451239eb3d397bd197a79cc3aab943da41ba0905 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 13 Apr 2018 14:04:24 +0200
Subject: [PATCH 112/288] s390: add support for IBM z14 Model ZR1

Just add the new machine type number to the two places that matter.

Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                      | 8 ++++----
 arch/s390/kernel/perf_cpum_cf_events.c | 1 +
 arch/s390/kernel/setup.c               | 1 +
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index beccb58a82e5..199ac3e4da1d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -286,12 +286,12 @@ config MARCH_Z13
 	  older machines.
 
 config MARCH_Z14
-	bool "IBM z14"
+	bool "IBM z14 ZR1 and z14"
 	select HAVE_MARCH_Z14_FEATURES
 	help
-	  Select this to enable optimizations for IBM z14 (3906 series).
-	  The kernel will be slightly faster but will not work on older
-	  machines.
+	  Select this to enable optimizations for IBM z14 ZR1 and z14 (3907
+	  and 3906 series). The kernel will be slightly faster but will not
+	  work on older machines.
 
 endchoice
 
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index c5bc3f209652..5ee27dc9a10c 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -583,6 +583,7 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
 		model = cpumcf_z13_pmu_event_attr;
 		break;
 	case 0x3906:
+	case 0x3907:
 		model = cpumcf_z14_pmu_event_attr;
 		break;
 	default:
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index fc3b4aa185cc..d82a9ec64ea9 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -821,6 +821,7 @@ static int __init setup_hwcaps(void)
 		strcpy(elf_platform, "z13");
 		break;
 	case 0x3906:
+	case 0x3907:
 		strcpy(elf_platform, "z14");
 		break;
 	}

From 2f18d46683cb3047c41229d57cf7c6e2ee48676f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 4 Apr 2018 10:15:38 +0200
Subject: [PATCH 113/288] rbd: refactor rbd_wait_state_locked()

In preparation for lock_timeout option, make rbd_wait_state_locked()
return error codes.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 07dc5419bd63..f4b1b91e6d4d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3533,9 +3533,21 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 /*
  * lock_rwsem must be held for read
  */
-static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
+static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 {
 	DEFINE_WAIT(wait);
+	int ret = 0;
+
+	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
+		return -EBLACKLISTED;
+
+	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
+		return 0;
+
+	if (!may_acquire) {
+		rbd_warn(rbd_dev, "exclusive lock required");
+		return -EROFS;
+	}
 
 	do {
 		/*
@@ -3549,10 +3561,14 @@ static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
 		up_read(&rbd_dev->lock_rwsem);
 		schedule();
 		down_read(&rbd_dev->lock_rwsem);
-	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
-		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
+		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
+			ret = -EBLACKLISTED;
+			break;
+		}
+	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
 
 	finish_wait(&rbd_dev->lock_waitq, &wait);
+	return ret;
 }
 
 static void rbd_queue_workfn(struct work_struct *work)
@@ -3638,19 +3654,10 @@ static void rbd_queue_workfn(struct work_struct *work)
 	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
 	if (must_be_locked) {
 		down_read(&rbd_dev->lock_rwsem);
-		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
-		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
-			if (rbd_dev->opts->exclusive) {
-				rbd_warn(rbd_dev, "exclusive lock required");
-				result = -EROFS;
-				goto err_unlock;
-			}
-			rbd_wait_state_locked(rbd_dev);
-		}
-		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
-			result = -EBLACKLISTED;
+		result = rbd_wait_state_locked(rbd_dev,
+					       !rbd_dev->opts->exclusive);
+		if (result)
 			goto err_unlock;
-		}
 	}
 
 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
@@ -5216,6 +5223,8 @@ static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
 
 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
 {
+	int ret;
+
 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
 		return -EINVAL;
@@ -5223,9 +5232,9 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
 
 	/* FIXME: "rbd map --exclusive" should be in interruptible */
 	down_read(&rbd_dev->lock_rwsem);
-	rbd_wait_state_locked(rbd_dev);
+	ret = rbd_wait_state_locked(rbd_dev, true);
 	up_read(&rbd_dev->lock_rwsem);
-	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
+	if (ret) {
 		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
 		return -EROFS;
 	}

From 34f55d0b3a0a39c95134c0c89173893b846d4c80 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <dongsheng.yang@easystack.cn>
Date: Mon, 26 Mar 2018 10:22:55 -0400
Subject: [PATCH 114/288] rbd: support timeout in rbd_wait_state_locked()

currently, the rbd_wait_state_locked() will wait forever if we
can't get our state locked. Example:

rbd map --exclusive test1  --> /dev/rbd0
rbd map test1  --> /dev/rbd1
dd if=/dev/zero of=/dev/rbd1 bs=1M count=1 --> IO blocked

To avoid this problem, this patch introduce a timeout design
in rbd_wait_state_locked(). Then rbd_wait_state_locked() will
return error when we reach a timeout.

This patch allow user to set the lock_timeout in rbd mapping.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f4b1b91e6d4d..d5a51493e8b5 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -732,6 +732,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
  */
 enum {
 	Opt_queue_depth,
+	Opt_lock_timeout,
 	Opt_last_int,
 	/* int args above */
 	Opt_last_string,
@@ -745,6 +746,7 @@ enum {
 
 static match_table_t rbd_opts_tokens = {
 	{Opt_queue_depth, "queue_depth=%d"},
+	{Opt_lock_timeout, "lock_timeout=%d"},
 	/* int args above */
 	/* string args above */
 	{Opt_read_only, "read_only"},
@@ -758,12 +760,14 @@ static match_table_t rbd_opts_tokens = {
 
 struct rbd_options {
 	int	queue_depth;
+	unsigned long	lock_timeout;
 	bool	read_only;
 	bool	lock_on_read;
 	bool	exclusive;
 };
 
 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
+#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT	false
 #define RBD_LOCK_ON_READ_DEFAULT false
 #define RBD_EXCLUSIVE_DEFAULT	false
@@ -796,6 +800,14 @@ static int parse_rbd_opts_token(char *c, void *private)
 		}
 		rbd_opts->queue_depth = intval;
 		break;
+	case Opt_lock_timeout:
+		/* 0 is "wait forever" (i.e. infinite timeout) */
+		if (intval < 0 || intval > INT_MAX / 1000) {
+			pr_err("lock_timeout out of range\n");
+			return -EINVAL;
+		}
+		rbd_opts->lock_timeout = msecs_to_jiffies(intval * 1000);
+		break;
 	case Opt_read_only:
 		rbd_opts->read_only = true;
 		break;
@@ -3536,6 +3548,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 {
 	DEFINE_WAIT(wait);
+	unsigned long timeout;
 	int ret = 0;
 
 	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
@@ -3559,12 +3572,18 @@ static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
 					  TASK_UNINTERRUPTIBLE);
 		up_read(&rbd_dev->lock_rwsem);
-		schedule();
+		timeout = schedule_timeout(ceph_timeout_jiffies(
+						rbd_dev->opts->lock_timeout));
 		down_read(&rbd_dev->lock_rwsem);
 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
 			ret = -EBLACKLISTED;
 			break;
 		}
+		if (!timeout) {
+			rbd_warn(rbd_dev, "timed out waiting for lock");
+			ret = -ETIMEDOUT;
+			break;
+		}
 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
 
 	finish_wait(&rbd_dev->lock_waitq, &wait);
@@ -5186,6 +5205,7 @@ static int rbd_add_parse_args(const char *buf,
 
 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
+	rbd_opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
 	rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
 

From ffdeec7aa41aa61ca4ee68fddf4669df9ce661d1 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 26 Mar 2018 16:46:39 +0800
Subject: [PATCH 115/288] ceph: always update atime/mtime/ctime for new inode

For new inode, atime/mtime/ctime are uninitialized.  Don't compare
against them.

Cc: stable@kernel.org
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8bf60250309e..ae056927080d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -669,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 		      CEPH_CAP_FILE_BUFFER|
 		      CEPH_CAP_AUTH_EXCL|
 		      CEPH_CAP_XATTR_EXCL)) {
-		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+		if (ci->i_version == 0 ||
+		    timespec_compare(ctime, &inode->i_ctime) > 0) {
 			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
 			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
 			     ctime->tv_sec, ctime->tv_nsec);
 			inode->i_ctime = *ctime;
 		}
-		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+		if (ci->i_version == 0 ||
+		    ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
 			/* the MDS did a utimes() */
 			dout("mtime %ld.%09ld -> %ld.%09ld "
 			     "tw %d -> %d\n",
@@ -795,7 +797,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	new_issued = ~issued & le32_to_cpu(info->cap.caps);
 
 	/* update inode */
-	ci->i_version = le64_to_cpu(info->version);
 	inode->i_rdev = le32_to_cpu(info->rdev);
 	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
 
@@ -868,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		xattr_blob = NULL;
 	}
 
+	/* finally update i_version */
+	ci->i_version = le64_to_cpu(info->version);
+
 	inode->i_mapping->a_ops = &ceph_aops;
 
 	switch (inode->i_mode & S_IFMT) {

From c6244b3b23771b258656445dcd212be759265b84 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 4 Apr 2018 14:53:39 +0200
Subject: [PATCH 116/288] rbd: avoid Wreturn-type warnings

In some configurations gcc cannot see that rbd_assert(0) leads to an
unreachable code path:

  drivers/block/rbd.c: In function 'rbd_img_is_write':
  drivers/block/rbd.c:1397:1: error: control reaches end of non-void function [-Werror=return-type]
  drivers/block/rbd.c: In function '__rbd_obj_handle_request':
  drivers/block/rbd.c:2499:1: error: control reaches end of non-void function [-Werror=return-type]
  drivers/block/rbd.c: In function 'rbd_obj_handle_write':
  drivers/block/rbd.c:2471:1: error: control reaches end of non-void function [-Werror=return-type]

As the rbd_assert() here shows has no extra information beyond the verbose
BUG(), we can simply use BUG() directly in its place.  This is reliably
detected as not returning on any architecture, since it doesn't depend
on the unlikely() comparison that confused gcc.

Fixes: 3da691bf4366 ("rbd: new request handling code")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d5a51493e8b5..e40e490ff967 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1404,7 +1404,7 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
 	case OBJ_OP_DISCARD:
 		return true;
 	default:
-		rbd_assert(0);
+		BUG();
 	}
 }
 
@@ -2478,7 +2478,7 @@ again:
 		}
 		return false;
 	default:
-		rbd_assert(0);
+		BUG();
 	}
 }
 
@@ -2506,7 +2506,7 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
 		}
 		return false;
 	default:
-		rbd_assert(0);
+		BUG();
 	}
 }
 

From 420efbdf4d2358dc12913298ad44d041c6ac0ed6 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Apr 2018 09:32:18 +0200
Subject: [PATCH 117/288] rbd: adjust queue limits for "fancy" striping

In order to take full advantage of merging in ceph_file_to_extents(),
allow object set sized I/Os.  If the layout is not "fancy", an object
set consists of just one object.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e40e490ff967..6a1805858b79 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3928,7 +3928,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
-	u64 segment_size;
+	unsigned int objset_bytes =
+	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
 	int err;
 
 	/* create gendisk info */
@@ -3968,20 +3969,18 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
 
-	/* set io sizes to object size */
-	segment_size = rbd_obj_bytes(&rbd_dev->header);
-	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
 	q->limits.max_sectors = queue_max_hw_sectors(q);
 	blk_queue_max_segments(q, USHRT_MAX);
 	blk_queue_max_segment_size(q, UINT_MAX);
-	blk_queue_io_min(q, segment_size);
-	blk_queue_io_opt(q, segment_size);
+	blk_queue_io_min(q, objset_bytes);
+	blk_queue_io_opt(q, objset_bytes);
 
 	/* enable the discard support */
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
-	q->limits.discard_granularity = segment_size;
-	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
-	blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
+	q->limits.discard_granularity = objset_bytes;
+	blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
+	blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
 
 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

From d93605407af34eb0b7eb8aff6b1eae2cde3cdd22 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 23 Mar 2018 06:14:47 +0100
Subject: [PATCH 118/288] rbd: notrim map option

Add an option to turn off discard and write zeroes offload support to
avoid deprovisioning a fully provisioned image.  When enabled, discard
requests will fail with -EOPNOTSUPP, write zeroes requests will fall
back to manually zeroing.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Tested-by: Hitoshi Kamei <hitoshi.kamei.xm@hitachi.com>
---
 drivers/block/rbd.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6a1805858b79..8e8b04cc569a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -741,6 +741,7 @@ enum {
 	Opt_read_write,
 	Opt_lock_on_read,
 	Opt_exclusive,
+	Opt_notrim,
 	Opt_err
 };
 
@@ -755,6 +756,7 @@ static match_table_t rbd_opts_tokens = {
 	{Opt_read_write, "rw"},		/* Alternate spelling */
 	{Opt_lock_on_read, "lock_on_read"},
 	{Opt_exclusive, "exclusive"},
+	{Opt_notrim, "notrim"},
 	{Opt_err, NULL}
 };
 
@@ -764,6 +766,7 @@ struct rbd_options {
 	bool	read_only;
 	bool	lock_on_read;
 	bool	exclusive;
+	bool	trim;
 };
 
 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
@@ -771,6 +774,7 @@ struct rbd_options {
 #define RBD_READ_ONLY_DEFAULT	false
 #define RBD_LOCK_ON_READ_DEFAULT false
 #define RBD_EXCLUSIVE_DEFAULT	false
+#define RBD_TRIM_DEFAULT	true
 
 static int parse_rbd_opts_token(char *c, void *private)
 {
@@ -820,6 +824,9 @@ static int parse_rbd_opts_token(char *c, void *private)
 	case Opt_exclusive:
 		rbd_opts->exclusive = true;
 		break;
+	case Opt_notrim:
+		rbd_opts->trim = false;
+		break;
 	default:
 		/* libceph prints "bad option" msg */
 		return -EINVAL;
@@ -3976,11 +3983,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	blk_queue_io_min(q, objset_bytes);
 	blk_queue_io_opt(q, objset_bytes);
 
-	/* enable the discard support */
-	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
-	q->limits.discard_granularity = objset_bytes;
-	blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
-	blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
+	if (rbd_dev->opts->trim) {
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+		q->limits.discard_granularity = objset_bytes;
+		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
+		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
+	}
 
 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
@@ -5207,6 +5215,7 @@ static int rbd_add_parse_args(const char *buf,
 	rbd_opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
 	rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
+	rbd_opts->trim = RBD_TRIM_DEFAULT;
 
 	copts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,

From 4f34a5130a471f32f2fe7750769ab4057dc3eaa0 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Sat, 14 Apr 2018 20:16:06 +0800
Subject: [PATCH 119/288] isofs: fix potential memory leak in mount option
 parsing

When specifying string type mount option (e.g., iocharset)
several times in a mount, current option parsing may
cause memory leak. Hence, call kfree for previous one
in this case. Meanwhile, check memory allocation result
for it.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/isofs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bc258a4402f6..ec3fba7d492f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -394,7 +394,10 @@ static int parse_options(char *options, struct iso9660_options *popt)
 			break;
 #ifdef CONFIG_JOLIET
 		case Opt_iocharset:
+			kfree(popt->iocharset);
 			popt->iocharset = match_strdup(&args[0]);
+			if (!popt->iocharset)
+				return 0;
 			break;
 #endif
 		case Opt_map_a:

From 06856938112b84ff3c6b0594d017f59cfda2a43d Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Sun, 15 Apr 2018 01:03:42 +0530
Subject: [PATCH 120/288] fs: ext2: Adding new return type vm_fault_t

Use new return type vm_fault_t for page_mkwrite,
pfn_mkwrite and fault handler.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 09640220fda8..047c327a6b23 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -88,11 +88,11 @@ out_unlock:
  * The default page_lock and i_size verification done by non-DAX fault paths
  * is sufficient because ext2 doesn't support hole punching.
  */
-static int ext2_dax_fault(struct vm_fault *vmf)
+static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret;
+	vm_fault_t ret;
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);

From 5f20407a2415181ce3af06fc84a86280a2708dd4 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Mon, 9 Oct 2017 12:56:56 -0600
Subject: [PATCH 121/288] watchdog: hpwdt: change maintainer.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Jimmy Vance <jimmy.vance@hpe.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..2a8a542a2cdb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6256,7 +6256,7 @@ S:	Odd Fixes
 F:	drivers/media/usb/hdpvr/
 
 HEWLETT PACKARD ENTERPRISE ILO NMI WATCHDOG DRIVER
-M:	Jimmy Vance <jimmy.vance@hpe.com>
+M:	Jerry Hoemann <jerry.hoemann@hpe.com>
 S:	Supported
 F:	Documentation/watchdog/hpwdt.txt
 F:	drivers/watchdog/hpwdt.c

From a03cc689e5c9d89d500f4a4dae1a81ea512dbb25 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 27 Mar 2018 14:25:40 -0500
Subject: [PATCH 122/288] watchdog: sch311x_wdt: Mark expected switch
 fall-through

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Notice that in this particular case I replaced "Fall" with a proper
"Fall through" comment, which is what GCC is expecting to find.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/sch311x_wdt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c
index 43d0cbb7ba0b..814cdf539b0f 100644
--- a/drivers/watchdog/sch311x_wdt.c
+++ b/drivers/watchdog/sch311x_wdt.c
@@ -299,7 +299,7 @@ static long sch311x_wdt_ioctl(struct file *file, unsigned int cmd,
 		if (sch311x_wdt_set_heartbeat(new_timeout))
 			return -EINVAL;
 		sch311x_wdt_keepalive();
-		/* Fall */
+		/* Fall through */
 	case WDIOC_GETTIMEOUT:
 		return put_user(timeout, p);
 	default:

From 3ff0751fa04208459b74cfbe492df79468b8425c Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 27 Mar 2018 14:30:41 -0500
Subject: [PATCH 123/288] watchdog: w83977f_wdt: Mark expected switch
 fall-through

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Notice that in this particular case I replaced "Fall" with a proper
"Fall through" comment, which is what GCC is expecting to find.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/w83977f_wdt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/w83977f_wdt.c b/drivers/watchdog/w83977f_wdt.c
index 20e2bba10400..672b61a7f9a3 100644
--- a/drivers/watchdog/w83977f_wdt.c
+++ b/drivers/watchdog/w83977f_wdt.c
@@ -427,7 +427,7 @@ static long wdt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 
 		wdt_keepalive();
-		/* Fall */
+		/* Fall through */
 
 	case WDIOC_GETTIMEOUT:
 		return put_user(timeout, uarg.i);

From e30d69df78fb2667dc58e906cabd0f70ed0af95d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 27 Mar 2018 14:33:49 -0500
Subject: [PATCH 124/288] watchdog: wafer5823wdt: Mark expected switch
 fall-through

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Notice that in this particular case I replaced "Fall" with a proper
"Fall through" comment, which is what GCC is expecting to find.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/wafer5823wdt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/wafer5823wdt.c b/drivers/watchdog/wafer5823wdt.c
index db0da7ea4fd8..93c5b610e264 100644
--- a/drivers/watchdog/wafer5823wdt.c
+++ b/drivers/watchdog/wafer5823wdt.c
@@ -178,7 +178,7 @@ static long wafwdt_ioctl(struct file *file, unsigned int cmd,
 		timeout = new_timeout;
 		wafwdt_stop();
 		wafwdt_start();
-		/* Fall */
+		/* Fall through */
 	case WDIOC_GETTIMEOUT:
 		return put_user(timeout, p);
 

From fdac6a90d2d151abdbb7e5ec14bb9ab64e2931ec Mon Sep 17 00:00:00 2001
From: Veeraiyan Chidambaram <veeraiyan.chidambaram@in.bosch.com>
Date: Fri, 13 Apr 2018 16:19:24 +0200
Subject: [PATCH 125/288] watchdog: renesas-wdt: Add support for
 WDIOF_CARDRESET

This patch adds the WDIOF_CARDRESET support for the Renesas platform
watchdog, to know if the board reboot is due to a watchdog reset.

This is done via the WOVF bit (bit 4) of the RWTCSRA register, which
indicates if RWTCNT overflowed, triggering the reset in last boot.

Signed-off-by: Veeraiyan Chidambaram <veeraiyan.chidambaram@in.bosch.com>
[takeshi.kihara.df: changed to read the RWTCSRA register while clock is
 enabled]
Signed-off-by: Takeshi Kihara <takeshi.kihara.df@renesas.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Vladimir Zapolskiy <vz@mleia.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/renesas_wdt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/watchdog/renesas_wdt.c b/drivers/watchdog/renesas_wdt.c
index 6b8c6ddfe30b..514db5cc1595 100644
--- a/drivers/watchdog/renesas_wdt.c
+++ b/drivers/watchdog/renesas_wdt.c
@@ -121,7 +121,8 @@ static int rwdt_restart(struct watchdog_device *wdev, unsigned long action,
 }
 
 static const struct watchdog_info rwdt_ident = {
-	.options = WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT,
+	.options = WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT |
+		WDIOF_CARDRESET,
 	.identity = "Renesas WDT Watchdog",
 };
 
@@ -197,9 +198,10 @@ static int rwdt_probe(struct platform_device *pdev)
 		return PTR_ERR(clk);
 
 	pm_runtime_enable(&pdev->dev);
-
 	pm_runtime_get_sync(&pdev->dev);
 	priv->clk_rate = clk_get_rate(clk);
+	priv->wdev.bootstatus = (readb_relaxed(priv->base + RWTCSRA) &
+				RWTCSRA_WOVF) ? WDIOF_CARDRESET : 0;
 	pm_runtime_put(&pdev->dev);
 
 	if (!priv->clk_rate) {

From 49d4d277ca54e04170d39484c8758a0ea9bca37d Mon Sep 17 00:00:00 2001
From: Eddie James <eajames@linux.vnet.ibm.com>
Date: Tue, 27 Mar 2018 15:09:27 -0500
Subject: [PATCH 126/288] aspeed: watchdog: Set bootstatus during probe

Check the aspeed timeout status register to see if the system has booted
from the secondary boot source. If so, set the watchdog device
bootstatus flag for "Card previously reset the CPU."

Signed-off-by: Eddie James <eajames@linux.vnet.ibm.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/aspeed_wdt.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c
index a5b8eb21201f..1abe4d021fd2 100644
--- a/drivers/watchdog/aspeed_wdt.c
+++ b/drivers/watchdog/aspeed_wdt.c
@@ -55,6 +55,8 @@ MODULE_DEVICE_TABLE(of, aspeed_wdt_of_table);
 #define   WDT_CTRL_WDT_INTR		BIT(2)
 #define   WDT_CTRL_RESET_SYSTEM		BIT(1)
 #define   WDT_CTRL_ENABLE		BIT(0)
+#define WDT_TIMEOUT_STATUS	0x10
+#define   WDT_TIMEOUT_STATUS_BOOT_SECONDARY	BIT(1)
 
 /*
  * WDT_RESET_WIDTH controls the characteristics of the external pulse (if
@@ -192,6 +194,7 @@ static int aspeed_wdt_probe(struct platform_device *pdev)
 	struct device_node *np;
 	const char *reset_type;
 	u32 duration;
+	u32 status;
 	int ret;
 
 	wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL);
@@ -307,6 +310,10 @@ static int aspeed_wdt_probe(struct platform_device *pdev)
 		writel(duration - 1, wdt->base + WDT_RESET_WIDTH);
 	}
 
+	status = readl(wdt->base + WDT_TIMEOUT_STATUS);
+	if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY)
+		wdt->wdd.bootstatus = WDIOF_CARDRESET;
+
 	ret = devm_watchdog_register_device(&pdev->dev, &wdt->wdd);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to register\n");

From de2011197d15746307e709687401397fe52bea83 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 20 Nov 2017 08:48:02 +0100
Subject: [PATCH 127/288] s390: update defconfig

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/configs/default_defconfig     | 31 ++++++++++++++++---------
 arch/s390/configs/performance_defconfig | 20 ++++++++++++----
 arch/s390/defconfig                     | 13 +++++++++--
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index 2310315b0744..6176fe9795ca 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -24,13 +24,13 @@ CONFIG_CPUSETS=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
-CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EXPERT=y
 # CONFIG_SYSFS_SYSCALL is not set
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_USERFAULTFD=y
 # CONFIG_COMPAT_BRK is not set
@@ -59,10 +59,11 @@ CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_DEFAULT_DEADLINE=y
 CONFIG_LIVEPATCH=y
 CONFIG_TUNE_ZEC12=y
-CONFIG_NR_CPUS=256
+CONFIG_NR_CPUS=512
 CONFIG_NUMA=y
 CONFIG_PREEMPT=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC_FILE=y
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
@@ -305,7 +306,6 @@ CONFIG_IP6_NF_SECURITY=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_NF_TABLES_BRIDGE=m
-CONFIG_NET_SCTPPROBE=m
 CONFIG_RDS=m
 CONFIG_RDS_RDMA=m
 CONFIG_RDS_TCP=m
@@ -364,11 +364,11 @@ CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_ACT_CSUM=m
 CONFIG_DNS_RESOLVER=y
+CONFIG_OPENVSWITCH=m
 CONFIG_NETLINK_DIAG=m
 CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
-CONFIG_NET_TCPPROBE=m
 CONFIG_DEVTMPFS=y
 CONFIG_DMA_CMA=y
 CONFIG_CMA_SIZE_MBYTES=0
@@ -380,9 +380,9 @@ CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=32768
-CONFIG_BLK_DEV_RAM_DAX=y
 CONFIG_VIRTIO_BLK=y
 CONFIG_BLK_DEV_RBD=m
+CONFIG_BLK_DEV_NVME=m
 CONFIG_ENCLOSURE_SERVICES=m
 CONFIG_GENWQE=m
 CONFIG_RAID_ATTRS=m
@@ -461,6 +461,7 @@ CONFIG_PPTP=m
 CONFIG_PPPOL2TP=m
 CONFIG_PPP_ASYNC=m
 CONFIG_PPP_SYNC_TTY=m
+CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
@@ -474,6 +475,9 @@ CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 CONFIG_SOFT_WATCHDOG=m
 CONFIG_DIAG288_WATCHDOG=m
+CONFIG_DRM=y
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_INFINIBAND=m
@@ -482,7 +486,9 @@ CONFIG_MLX4_INFINIBAND=m
 CONFIG_MLX5_INFINIBAND=m
 CONFIG_VFIO=m
 CONFIG_VFIO_PCI=m
+CONFIG_VIRTIO_PCI=m
 CONFIG_VIRTIO_BALLOON=m
+CONFIG_VIRTIO_INPUT=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
@@ -641,6 +647,8 @@ CONFIG_ATOMIC64_SELFTEST=y
 CONFIG_TEST_BPF=m
 CONFIG_BUG_ON_DATA_CORRUPTION=y
 CONFIG_S390_PTDUMP=y
+CONFIG_PERSISTENT_KEYRINGS=y
+CONFIG_BIG_KEYS=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
@@ -649,17 +657,20 @@ CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0
 CONFIG_SECURITY_SELINUX_DISABLE=y
+CONFIG_INTEGRITY_SIGNATURE=y
+CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
 CONFIG_IMA=y
+CONFIG_IMA_DEFAULT_HASH_SHA256=y
+CONFIG_IMA_WRITE_POLICY=y
 CONFIG_IMA_APPRAISE=y
-CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
 CONFIG_CRYPTO_USER=m
+# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_MCRYPTD=m
 CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
@@ -707,9 +718,8 @@ CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
 CONFIG_CRYPTO_CRC32_S390=y
-CONFIG_ASYMMETRIC_KEY_TYPE=y
-CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
-CONFIG_X509_CERTIFICATE_PARSER=m
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_SYSTEM_TRUSTED_KEYRING=y
 CONFIG_CRC7=m
 CONFIG_CRC8=m
 CONFIG_RANDOM32_SELFTEST=y
@@ -719,4 +729,3 @@ CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_KVM_S390_UCONTROL=y
 CONFIG_VHOST_NET=m
-CONFIG_KEXEC_FILE=y
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 20ed149e1137..c105bcc6d7a6 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -25,13 +25,13 @@ CONFIG_CPUSETS=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
-CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EXPERT=y
 # CONFIG_SYSFS_SYSCALL is not set
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_USERFAULTFD=y
 # CONFIG_COMPAT_BRK is not set
@@ -45,6 +45,8 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_MODULE_SIG=y
+CONFIG_MODULE_SIG_SHA256=y
 CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_BLK_WBT=y
@@ -62,6 +64,7 @@ CONFIG_TUNE_ZEC12=y
 CONFIG_NR_CPUS=512
 CONFIG_NUMA=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC_FILE=y
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
@@ -301,7 +304,6 @@ CONFIG_IP6_NF_SECURITY=m
 CONFIG_IP6_NF_NAT=m
 CONFIG_IP6_NF_TARGET_MASQUERADE=m
 CONFIG_NF_TABLES_BRIDGE=m
-CONFIG_NET_SCTPPROBE=m
 CONFIG_RDS=m
 CONFIG_RDS_RDMA=m
 CONFIG_RDS_TCP=m
@@ -359,11 +361,11 @@ CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_ACT_CSUM=m
 CONFIG_DNS_RESOLVER=y
+CONFIG_OPENVSWITCH=m
 CONFIG_NETLINK_DIAG=m
 CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
-CONFIG_NET_TCPPROBE=m
 CONFIG_DEVTMPFS=y
 CONFIG_DMA_CMA=y
 CONFIG_CMA_SIZE_MBYTES=0
@@ -375,8 +377,9 @@ CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=32768
-CONFIG_BLK_DEV_RAM_DAX=y
 CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_RBD=m
+CONFIG_BLK_DEV_NVME=m
 CONFIG_ENCLOSURE_SERVICES=m
 CONFIG_GENWQE=m
 CONFIG_RAID_ATTRS=m
@@ -455,6 +458,7 @@ CONFIG_PPTP=m
 CONFIG_PPPOL2TP=m
 CONFIG_PPP_ASYNC=m
 CONFIG_PPP_SYNC_TTY=m
+CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
@@ -468,6 +472,9 @@ CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 CONFIG_SOFT_WATCHDOG=m
 CONFIG_DIAG288_WATCHDOG=m
+CONFIG_DRM=y
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_INFINIBAND=m
@@ -476,7 +483,9 @@ CONFIG_MLX4_INFINIBAND=m
 CONFIG_MLX5_INFINIBAND=m
 CONFIG_VFIO=m
 CONFIG_VFIO_PCI=m
+CONFIG_VIRTIO_PCI=m
 CONFIG_VIRTIO_BALLOON=m
+CONFIG_VIRTIO_INPUT=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
@@ -507,7 +516,6 @@ CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=y
 CONFIG_CUSE=m
 CONFIG_OVERLAY_FS=m
-CONFIG_OVERLAY_FS_REDIRECT_DIR=y
 CONFIG_FSCACHE=m
 CONFIG_CACHEFILES=m
 CONFIG_ISO9660_FS=y
@@ -592,8 +600,10 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
 CONFIG_INTEGRITY_SIGNATURE=y
 CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
 CONFIG_IMA=y
+CONFIG_IMA_DEFAULT_HASH_SHA256=y
 CONFIG_IMA_WRITE_POLICY=y
 CONFIG_IMA_APPRAISE=y
+CONFIG_CRYPTO_FIPS=y
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
 CONFIG_CRYPTO_USER=m
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 46a3178d8bc6..f40600eb1762 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -8,6 +8,7 @@ CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
+# CONFIG_CPU_ISOLATION is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_CGROUPS=y
@@ -23,12 +24,12 @@ CONFIG_CPUSETS=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
-CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EXPERT=y
 # CONFIG_SYSFS_SYSCALL is not set
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_USERFAULTFD=y
 # CONFIG_COMPAT_BRK is not set
@@ -47,6 +48,7 @@ CONFIG_LIVEPATCH=y
 CONFIG_NR_CPUS=256
 CONFIG_NUMA=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC_FILE=y
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
@@ -129,10 +131,13 @@ CONFIG_EQUALIZER=m
 CONFIG_TUN=m
 CONFIG_VIRTIO_NET=y
 # CONFIG_NET_VENDOR_ALACRITECH is not set
+# CONFIG_NET_VENDOR_CORTINA is not set
 # CONFIG_NET_VENDOR_SOLARFLARE is not set
+# CONFIG_NET_VENDOR_SOCIONEXT is not set
 # CONFIG_NET_VENDOR_SYNOPSYS is not set
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
+# CONFIG_VT is not set
 CONFIG_DEVKMEM=y
 CONFIG_RAW_DRIVER=m
 CONFIG_VIRTIO_BALLOON=y
@@ -177,13 +182,15 @@ CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
 CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_FUNCTION_PROFILER=y
-CONFIG_KPROBES_SANITY_TEST=y
+# CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_S390_PTDUMP=y
 CONFIG_CRYPTO_CRYPTD=m
+CONFIG_CRYPTO_AUTHENC=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_CBC=y
+CONFIG_CRYPTO_CFB=m
 CONFIG_CRYPTO_CTS=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
@@ -213,6 +220,8 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SPECK=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=m

From cd7cf57f18be4196306997d4325b8ebf895ab318 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Apr 2018 11:00:31 +0200
Subject: [PATCH 128/288] s390: remove gcov defconfig

This config is not needed anymore.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/configs/gcov_defconfig | 661 -------------------------------
 1 file changed, 661 deletions(-)
 delete mode 100644 arch/s390/configs/gcov_defconfig

diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig
deleted file mode 100644
index d52eafe57ae8..000000000000
--- a/arch/s390/configs/gcov_defconfig
+++ /dev/null
@@ -1,661 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_AUDIT=y
-CONFIG_NO_HZ_IDLE=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_XACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_NUMA_BALANCING=y
-# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
-CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
-CONFIG_BLK_CGROUP=y
-CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_HUGETLB=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_PERF=y
-CONFIG_CHECKPOINT_RESTORE=y
-CONFIG_NAMESPACES=y
-CONFIG_USER_NS=y
-CONFIG_SCHED_AUTOGROUP=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_BPF_SYSCALL=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
-CONFIG_JUMP_LABEL=y
-CONFIG_GCOV_KERNEL=y
-CONFIG_GCOV_PROFILE_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_FORCE_LOAD=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_BLK_DEV_INTEGRITY=y
-CONFIG_BLK_DEV_THROTTLING=y
-CONFIG_BLK_WBT=y
-CONFIG_BLK_WBT_SQ=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_IBM_PARTITION=y
-CONFIG_BSD_DISKLABEL=y
-CONFIG_MINIX_SUBPARTITION=y
-CONFIG_SOLARIS_X86_PARTITION=y
-CONFIG_UNIXWARE_DISKLABEL=y
-CONFIG_CFQ_GROUP_IOSCHED=y
-CONFIG_DEFAULT_DEADLINE=y
-CONFIG_LIVEPATCH=y
-CONFIG_TUNE_ZEC12=y
-CONFIG_NR_CPUS=512
-CONFIG_NUMA=y
-CONFIG_HZ_100=y
-CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTREMOVE=y
-CONFIG_KSM=y
-CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
-CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZBUD=m
-CONFIG_ZSMALLOC=m
-CONFIG_ZSMALLOC_STAT=y
-CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
-CONFIG_IDLE_PAGE_TRACKING=y
-CONFIG_PCI=y
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_S390=y
-CONFIG_CHSC_SCH=y
-CONFIG_CRASH_DUMP=y
-CONFIG_BINFMT_MISC=m
-CONFIG_HIBERNATION=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_PACKET_DIAG=m
-CONFIG_UNIX=y
-CONFIG_UNIX_DIAG=m
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_SMC=m
-CONFIG_SMC_DIAG=m
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_ROUTE_MULTIPATH=y
-CONFIG_IP_ROUTE_VERBOSE=y
-CONFIG_NET_IPIP=m
-CONFIG_NET_IPGRE_DEMUX=m
-CONFIG_NET_IPGRE=m
-CONFIG_NET_IPGRE_BROADCAST=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_IP_PIMSM_V2=y
-CONFIG_SYN_COOKIES=y
-CONFIG_NET_IPVTI=m
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
-CONFIG_INET_DIAG=m
-CONFIG_INET_UDP_DIAG=m
-CONFIG_TCP_CONG_ADVANCED=y
-CONFIG_TCP_CONG_HSTCP=m
-CONFIG_TCP_CONG_HYBLA=m
-CONFIG_TCP_CONG_SCALABLE=m
-CONFIG_TCP_CONG_LP=m
-CONFIG_TCP_CONG_VENO=m
-CONFIG_TCP_CONG_YEAH=m
-CONFIG_TCP_CONG_ILLINOIS=m
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_INET6_AH=m
-CONFIG_INET6_ESP=m
-CONFIG_INET6_IPCOMP=m
-CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_TRANSPORT=m
-CONFIG_INET6_XFRM_MODE_TUNNEL=m
-CONFIG_INET6_XFRM_MODE_BEET=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
-CONFIG_IPV6_VTI=m
-CONFIG_IPV6_SIT=m
-CONFIG_IPV6_GRE=m
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_SUBTREES=y
-CONFIG_NETFILTER=y
-CONFIG_NF_CONNTRACK=m
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_TIMEOUT=y
-CONFIG_NF_CONNTRACK_TIMESTAMP=y
-CONFIG_NF_CONNTRACK_AMANDA=m
-CONFIG_NF_CONNTRACK_FTP=m
-CONFIG_NF_CONNTRACK_H323=m
-CONFIG_NF_CONNTRACK_IRC=m
-CONFIG_NF_CONNTRACK_NETBIOS_NS=m
-CONFIG_NF_CONNTRACK_SNMP=m
-CONFIG_NF_CONNTRACK_PPTP=m
-CONFIG_NF_CONNTRACK_SANE=m
-CONFIG_NF_CONNTRACK_SIP=m
-CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NF_CT_NETLINK=m
-CONFIG_NF_CT_NETLINK_TIMEOUT=m
-CONFIG_NF_TABLES=m
-CONFIG_NFT_EXTHDR=m
-CONFIG_NFT_META=m
-CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
-CONFIG_NFT_LOG=m
-CONFIG_NFT_LIMIT=m
-CONFIG_NFT_NAT=m
-CONFIG_NFT_COMPAT=m
-CONFIG_NFT_HASH=m
-CONFIG_NETFILTER_XT_SET=m
-CONFIG_NETFILTER_XT_TARGET_AUDIT=m
-CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
-CONFIG_NETFILTER_XT_TARGET_DSCP=m
-CONFIG_NETFILTER_XT_TARGET_HMARK=m
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
-CONFIG_NETFILTER_XT_TARGET_LOG=m
-CONFIG_NETFILTER_XT_TARGET_MARK=m
-CONFIG_NETFILTER_XT_TARGET_NFLOG=m
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
-CONFIG_NETFILTER_XT_TARGET_TEE=m
-CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
-CONFIG_NETFILTER_XT_TARGET_SECMARK=m
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
-CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
-CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
-CONFIG_NETFILTER_XT_MATCH_BPF=m
-CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
-CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
-CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
-CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
-CONFIG_NETFILTER_XT_MATCH_DSCP=m
-CONFIG_NETFILTER_XT_MATCH_ESP=m
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_HELPER=m
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
-CONFIG_NETFILTER_XT_MATCH_IPVS=m
-CONFIG_NETFILTER_XT_MATCH_LENGTH=m
-CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-CONFIG_NETFILTER_XT_MATCH_MAC=m
-CONFIG_NETFILTER_XT_MATCH_MARK=m
-CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
-CONFIG_NETFILTER_XT_MATCH_NFACCT=m
-CONFIG_NETFILTER_XT_MATCH_OSF=m
-CONFIG_NETFILTER_XT_MATCH_OWNER=m
-CONFIG_NETFILTER_XT_MATCH_POLICY=m
-CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
-CONFIG_NETFILTER_XT_MATCH_QUOTA=m
-CONFIG_NETFILTER_XT_MATCH_RATEEST=m
-CONFIG_NETFILTER_XT_MATCH_REALM=m
-CONFIG_NETFILTER_XT_MATCH_RECENT=m
-CONFIG_NETFILTER_XT_MATCH_STATE=m
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
-CONFIG_NETFILTER_XT_MATCH_STRING=m
-CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_NETFILTER_XT_MATCH_TIME=m
-CONFIG_NETFILTER_XT_MATCH_U32=m
-CONFIG_IP_SET=m
-CONFIG_IP_SET_BITMAP_IP=m
-CONFIG_IP_SET_BITMAP_IPMAC=m
-CONFIG_IP_SET_BITMAP_PORT=m
-CONFIG_IP_SET_HASH_IP=m
-CONFIG_IP_SET_HASH_IPPORT=m
-CONFIG_IP_SET_HASH_IPPORTIP=m
-CONFIG_IP_SET_HASH_IPPORTNET=m
-CONFIG_IP_SET_HASH_NETPORTNET=m
-CONFIG_IP_SET_HASH_NET=m
-CONFIG_IP_SET_HASH_NETNET=m
-CONFIG_IP_SET_HASH_NETPORT=m
-CONFIG_IP_SET_HASH_NETIFACE=m
-CONFIG_IP_SET_LIST_SET=m
-CONFIG_IP_VS=m
-CONFIG_IP_VS_PROTO_TCP=y
-CONFIG_IP_VS_PROTO_UDP=y
-CONFIG_IP_VS_PROTO_ESP=y
-CONFIG_IP_VS_PROTO_AH=y
-CONFIG_IP_VS_RR=m
-CONFIG_IP_VS_WRR=m
-CONFIG_IP_VS_LC=m
-CONFIG_IP_VS_WLC=m
-CONFIG_IP_VS_LBLC=m
-CONFIG_IP_VS_LBLCR=m
-CONFIG_IP_VS_DH=m
-CONFIG_IP_VS_SH=m
-CONFIG_IP_VS_SED=m
-CONFIG_IP_VS_NQ=m
-CONFIG_IP_VS_FTP=m
-CONFIG_IP_VS_PE_SIP=m
-CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_TABLES_IPV4=m
-CONFIG_NFT_CHAIN_ROUTE_IPV4=m
-CONFIG_NF_TABLES_ARP=m
-CONFIG_NFT_CHAIN_NAT_IPV4=m
-CONFIG_IP_NF_IPTABLES=m
-CONFIG_IP_NF_MATCH_AH=m
-CONFIG_IP_NF_MATCH_ECN=m
-CONFIG_IP_NF_MATCH_RPFILTER=m
-CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
-CONFIG_IP_NF_TARGET_REJECT=m
-CONFIG_IP_NF_NAT=m
-CONFIG_IP_NF_TARGET_MASQUERADE=m
-CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
-CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPTABLES=m
-CONFIG_IP_NF_ARPFILTER=m
-CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_NF_TABLES_IPV6=m
-CONFIG_NFT_CHAIN_ROUTE_IPV6=m
-CONFIG_NFT_CHAIN_NAT_IPV6=m
-CONFIG_IP6_NF_IPTABLES=m
-CONFIG_IP6_NF_MATCH_AH=m
-CONFIG_IP6_NF_MATCH_EUI64=m
-CONFIG_IP6_NF_MATCH_FRAG=m
-CONFIG_IP6_NF_MATCH_OPTS=m
-CONFIG_IP6_NF_MATCH_HL=m
-CONFIG_IP6_NF_MATCH_IPV6HEADER=m
-CONFIG_IP6_NF_MATCH_MH=m
-CONFIG_IP6_NF_MATCH_RPFILTER=m
-CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
-CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_IP6_NF_NAT=m
-CONFIG_IP6_NF_TARGET_MASQUERADE=m
-CONFIG_NF_TABLES_BRIDGE=m
-CONFIG_NET_SCTPPROBE=m
-CONFIG_RDS=m
-CONFIG_RDS_RDMA=m
-CONFIG_RDS_TCP=m
-CONFIG_L2TP=m
-CONFIG_L2TP_DEBUGFS=m
-CONFIG_L2TP_V3=y
-CONFIG_L2TP_IP=m
-CONFIG_L2TP_ETH=m
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_VLAN_8021Q_GVRP=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CBQ=m
-CONFIG_NET_SCH_HTB=m
-CONFIG_NET_SCH_HFSC=m
-CONFIG_NET_SCH_PRIO=m
-CONFIG_NET_SCH_MULTIQ=m
-CONFIG_NET_SCH_RED=m
-CONFIG_NET_SCH_SFB=m
-CONFIG_NET_SCH_SFQ=m
-CONFIG_NET_SCH_TEQL=m
-CONFIG_NET_SCH_TBF=m
-CONFIG_NET_SCH_GRED=m
-CONFIG_NET_SCH_DSMARK=m
-CONFIG_NET_SCH_NETEM=m
-CONFIG_NET_SCH_DRR=m
-CONFIG_NET_SCH_MQPRIO=m
-CONFIG_NET_SCH_CHOKE=m
-CONFIG_NET_SCH_QFQ=m
-CONFIG_NET_SCH_CODEL=m
-CONFIG_NET_SCH_FQ_CODEL=m
-CONFIG_NET_SCH_INGRESS=m
-CONFIG_NET_SCH_PLUG=m
-CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
-CONFIG_NET_CLS_ROUTE4=m
-CONFIG_NET_CLS_FW=m
-CONFIG_NET_CLS_U32=m
-CONFIG_CLS_U32_PERF=y
-CONFIG_CLS_U32_MARK=y
-CONFIG_NET_CLS_RSVP=m
-CONFIG_NET_CLS_RSVP6=m
-CONFIG_NET_CLS_FLOW=m
-CONFIG_NET_CLS_CGROUP=y
-CONFIG_NET_CLS_BPF=m
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_ACT_POLICE=m
-CONFIG_NET_ACT_GACT=m
-CONFIG_GACT_PROB=y
-CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
-CONFIG_NET_ACT_NAT=m
-CONFIG_NET_ACT_PEDIT=m
-CONFIG_NET_ACT_SIMP=m
-CONFIG_NET_ACT_SKBEDIT=m
-CONFIG_NET_ACT_CSUM=m
-CONFIG_DNS_RESOLVER=y
-CONFIG_NETLINK_DIAG=m
-CONFIG_CGROUP_NET_PRIO=y
-CONFIG_BPF_JIT=y
-CONFIG_NET_PKTGEN=m
-CONFIG_NET_TCPPROBE=m
-CONFIG_DEVTMPFS=y
-CONFIG_DMA_CMA=y
-CONFIG_CMA_SIZE_MBYTES=0
-CONFIG_CONNECTOR=y
-CONFIG_ZRAM=m
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
-CONFIG_BLK_DEV_DRBD=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=32768
-CONFIG_BLK_DEV_RAM_DAX=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_ENCLOSURE_SERVICES=m
-CONFIG_GENWQE=m
-CONFIG_RAID_ATTRS=m
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_CHR_DEV_OSST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=y
-CONFIG_CHR_DEV_SCH=m
-CONFIG_SCSI_ENCLOSURE=m
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SAS_LIBSAS=m
-CONFIG_SCSI_SRP_ATTRS=m
-CONFIG_ISCSI_TCP=m
-CONFIG_SCSI_DEBUG=m
-CONFIG_ZFCP=y
-CONFIG_SCSI_VIRTIO=m
-CONFIG_SCSI_DH=y
-CONFIG_SCSI_DH_RDAC=m
-CONFIG_SCSI_DH_HP_SW=m
-CONFIG_SCSI_DH_EMC=m
-CONFIG_SCSI_DH_ALUA=m
-CONFIG_SCSI_OSD_INITIATOR=m
-CONFIG_SCSI_OSD_ULD=m
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=y
-CONFIG_MD_LINEAR=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_MD_FAULTY=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_THIN_PROVISIONING=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_LOG_USERSPACE=m
-CONFIG_DM_RAID=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_DM_MULTIPATH_QL=m
-CONFIG_DM_MULTIPATH_ST=m
-CONFIG_DM_DELAY=m
-CONFIG_DM_UEVENT=y
-CONFIG_DM_FLAKEY=m
-CONFIG_DM_VERITY=m
-CONFIG_DM_SWITCH=m
-CONFIG_NETDEVICES=y
-CONFIG_BONDING=m
-CONFIG_DUMMY=m
-CONFIG_EQUALIZER=m
-CONFIG_IFB=m
-CONFIG_MACVLAN=m
-CONFIG_MACVTAP=m
-CONFIG_VXLAN=m
-CONFIG_TUN=m
-CONFIG_VETH=m
-CONFIG_VIRTIO_NET=m
-CONFIG_NLMON=m
-# CONFIG_NET_VENDOR_ARC is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-CONFIG_MLX4_EN=m
-CONFIG_MLX5_CORE=m
-CONFIG_MLX5_CORE_EN=y
-# CONFIG_NET_VENDOR_NATSEMI is not set
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPTP=m
-CONFIG_PPPOL2TP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_SERIO is not set
-CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_HW_RANDOM_VIRTIO=m
-CONFIG_RAW_DRIVER=m
-CONFIG_HANGCHECK_TIMER=m
-CONFIG_TN3270_FS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_WATCHDOG_NOWAYOUT=y
-CONFIG_SOFT_WATCHDOG=m
-CONFIG_DIAG288_WATCHDOG=m
-# CONFIG_HID is not set
-# CONFIG_USB_SUPPORT is not set
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_USER_ACCESS=m
-CONFIG_MLX4_INFINIBAND=m
-CONFIG_MLX5_INFINIBAND=m
-CONFIG_VFIO=m
-CONFIG_VFIO_PCI=m
-CONFIG_VIRTIO_BALLOON=m
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_POSIX_ACL=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_EXT4_ENCRYPTION=y
-CONFIG_JBD2_DEBUG=y
-CONFIG_JFS_FS=m
-CONFIG_JFS_POSIX_ACL=y
-CONFIG_JFS_SECURITY=y
-CONFIG_JFS_STATISTICS=y
-CONFIG_XFS_FS=y
-CONFIG_XFS_QUOTA=y
-CONFIG_XFS_POSIX_ACL=y
-CONFIG_XFS_RT=y
-CONFIG_GFS2_FS=m
-CONFIG_GFS2_FS_LOCKING_DLM=y
-CONFIG_OCFS2_FS=m
-CONFIG_BTRFS_FS=y
-CONFIG_BTRFS_FS_POSIX_ACL=y
-CONFIG_NILFS2_FS=m
-CONFIG_FS_DAX=y
-CONFIG_EXPORTFS_BLOCK_OPS=y
-CONFIG_FANOTIFY=y
-CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
-CONFIG_QUOTA_NETLINK_INTERFACE=y
-CONFIG_QFMT_V1=m
-CONFIG_QFMT_V2=m
-CONFIG_AUTOFS4_FS=m
-CONFIG_FUSE_FS=y
-CONFIG_CUSE=m
-CONFIG_OVERLAY_FS=m
-CONFIG_OVERLAY_FS_REDIRECT_DIR=y
-CONFIG_FSCACHE=m
-CONFIG_CACHEFILES=m
-CONFIG_ISO9660_FS=y
-CONFIG_JOLIET=y
-CONFIG_ZISOFS=y
-CONFIG_UDF_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_NTFS_FS=m
-CONFIG_NTFS_RW=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_HUGETLBFS=y
-CONFIG_CONFIGFS_FS=m
-CONFIG_ECRYPT_FS=m
-CONFIG_CRAMFS=m
-CONFIG_SQUASHFS=m
-CONFIG_SQUASHFS_XATTR=y
-CONFIG_SQUASHFS_LZO=y
-CONFIG_SQUASHFS_XZ=y
-CONFIG_ROMFS_FS=m
-CONFIG_NFS_FS=m
-CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=m
-CONFIG_NFS_SWAP=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3_ACL=y
-CONFIG_NFSD_V4=y
-CONFIG_NFSD_V4_SECURITY_LABEL=y
-CONFIG_CIFS=m
-CONFIG_CIFS_STATS=y
-CONFIG_CIFS_STATS2=y
-CONFIG_CIFS_WEAK_PW_HASH=y
-CONFIG_CIFS_UPCALL=y
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-# CONFIG_CIFS_DEBUG is not set
-CONFIG_CIFS_DFS_UPCALL=y
-CONFIG_NLS_DEFAULT="utf8"
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ASCII=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_UTF8=m
-CONFIG_DLM=m
-CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_DWARF4=y
-CONFIG_GDB_SCRIPTS=y
-# CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_FRAME_WARN=1024
-CONFIG_UNUSED_SYMBOLS=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_MEMORY_INIT=y
-CONFIG_PANIC_ON_OOPS=y
-CONFIG_RCU_TORTURE_TEST=m
-CONFIG_RCU_CPU_STALL_TIMEOUT=60
-CONFIG_LATENCYTOP=y
-CONFIG_SCHED_TRACER=y
-CONFIG_FTRACE_SYSCALLS=y
-CONFIG_STACK_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_FUNCTION_PROFILER=y
-CONFIG_HIST_TRIGGERS=y
-CONFIG_LKDTM=m
-CONFIG_PERCPU_TEST=m
-CONFIG_ATOMIC64_SELFTEST=y
-CONFIG_TEST_BPF=m
-CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_S390_PTDUMP=y
-CONFIG_PERSISTENT_KEYRINGS=y
-CONFIG_BIG_KEYS=y
-CONFIG_ENCRYPTED_KEYS=m
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0
-CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_INTEGRITY_SIGNATURE=y
-CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
-CONFIG_IMA=y
-CONFIG_IMA_WRITE_POLICY=y
-CONFIG_IMA_APPRAISE=y
-CONFIG_CRYPTO_DH=m
-CONFIG_CRYPTO_ECDH=m
-CONFIG_CRYPTO_USER=m
-# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
-CONFIG_CRYPTO_PCRYPT=m
-CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_MCRYPTD=m
-CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_KEYWRAP=m
-CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
-CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
-CONFIG_CRYPTO_AES_TI=m
-CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAMELLIA=m
-CONFIG_CRYPTO_CAST5=m
-CONFIG_CRYPTO_CAST6=m
-CONFIG_CRYPTO_FCRYPT=m
-CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
-CONFIG_CRYPTO_SEED=m
-CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_TEA=m
-CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_842=m
-CONFIG_CRYPTO_LZ4=m
-CONFIG_CRYPTO_LZ4HC=m
-CONFIG_CRYPTO_ANSI_CPRNG=m
-CONFIG_CRYPTO_USER_API_HASH=m
-CONFIG_CRYPTO_USER_API_SKCIPHER=m
-CONFIG_CRYPTO_USER_API_RNG=m
-CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_ZCRYPT=m
-CONFIG_PKEY=m
-CONFIG_CRYPTO_PAES_S390=m
-CONFIG_CRYPTO_SHA1_S390=m
-CONFIG_CRYPTO_SHA256_S390=m
-CONFIG_CRYPTO_SHA512_S390=m
-CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_AES_S390=m
-CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_CRYPTO_CRC32_S390=y
-CONFIG_CRC7=m
-CONFIG_CRC8=m
-CONFIG_CORDIC=m
-CONFIG_CMM=m
-CONFIG_APPLDATA_BASE=y
-CONFIG_KVM=m
-CONFIG_KVM_S390_UCONTROL=y
-CONFIG_VHOST_NET=m

From 49d23a851d62c03daebae2d245dcc9b07dbfa89f Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Apr 2018 11:01:07 +0200
Subject: [PATCH 129/288] s390: rename default_defconfig to debug_defconfig

The name debug_defconfig reflects what the config is actually good
for and should be less confusing.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/configs/{default_defconfig => debug_defconfig} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename arch/s390/configs/{default_defconfig => debug_defconfig} (100%)

diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/debug_defconfig
similarity index 100%
rename from arch/s390/configs/default_defconfig
rename to arch/s390/configs/debug_defconfig

From 2c2bf522ed8cbfaac666f7dc65cfd38de2b89f0f Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Fri, 13 Apr 2018 09:50:44 +0100
Subject: [PATCH 130/288] MIPS: dts: Boston: Fix PCI bus dtc warnings:

dtc recently (v1.4.4-8-g756ffc4f52f6) added PCI bus checks. Fix the
warnings now emitted:

arch/mips/boot/dts/img/boston.dtb: Warning (pci_bridge): /pci@10000000: missing bus-range for PCI bridge
arch/mips/boot/dts/img/boston.dtb: Warning (pci_bridge): /pci@12000000: missing bus-range for PCI bridge
arch/mips/boot/dts/img/boston.dtb: Warning (pci_bridge): /pci@14000000: missing bus-range for PCI bridge

Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-mips@linux-mips.org
Cc: devicetree@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/19070/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/boot/dts/img/boston.dts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/mips/boot/dts/img/boston.dts b/arch/mips/boot/dts/img/boston.dts
index 1bd105428f61..65af3f6ba81c 100644
--- a/arch/mips/boot/dts/img/boston.dts
+++ b/arch/mips/boot/dts/img/boston.dts
@@ -51,6 +51,8 @@
 		ranges = <0x02000000 0 0x40000000
 			  0x40000000 0 0x40000000>;
 
+		bus-range = <0x00 0xff>;
+
 		interrupt-map-mask = <0 0 0 7>;
 		interrupt-map = <0 0 0 1 &pci0_intc 1>,
 				<0 0 0 2 &pci0_intc 2>,
@@ -79,6 +81,8 @@
 		ranges = <0x02000000 0 0x20000000
 			  0x20000000 0 0x20000000>;
 
+		bus-range = <0x00 0xff>;
+
 		interrupt-map-mask = <0 0 0 7>;
 		interrupt-map = <0 0 0 1 &pci1_intc 1>,
 				<0 0 0 2 &pci1_intc 2>,
@@ -107,6 +111,8 @@
 		ranges = <0x02000000 0 0x16000000
 			  0x16000000 0 0x100000>;
 
+		bus-range = <0x00 0xff>;
+
 		interrupt-map-mask = <0 0 0 7>;
 		interrupt-map = <0 0 0 1 &pci2_intc 1>,
 				<0 0 0 2 &pci2_intc 2>,

From af52f9982e410edac21ca4b49563053ffc9da1eb Mon Sep 17 00:00:00 2001
From: David Wang <davidwang@zhaoxin.com>
Date: Mon, 16 Apr 2018 17:48:09 +0800
Subject: [PATCH 131/288] ALSA: hda - New VIA controller suppor no-snoop path

This patch is used to tell kernel that new VIA HDAC controller also
support no-snoop path.

[ minor coding style fix by tiwai ]

Signed-off-by: David Wang <davidwang@zhaoxin.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/hda_intel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index ccffce068634..fece779f6260 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -1645,7 +1645,8 @@ static void azx_check_snoop_available(struct azx *chip)
 		 */
 		u8 val;
 		pci_read_config_byte(chip->pci, 0x42, &val);
-		if (!(val & 0x80) && chip->pci->revision == 0x30)
+		if (!(val & 0x80) && (chip->pci->revision == 0x30 ||
+				      chip->pci->revision == 0x20))
 			snoop = false;
 	}
 

From bd28899dd34f9283c567f7eeb31bb546f10820b5 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 16 Apr 2018 13:17:50 +0300
Subject: [PATCH 132/288] Revert "macsec: missing dev_put() on error in
 macsec_newlink()"

This patch is just wrong, sorry.  I was trying to fix a static checker
warning and misread the code.  The reference taken in macsec_newlink()
is released in macsec_free_netdev() when the netdevice is destroyed.

This reverts commit 5dcd8400884cc4a043a6d4617e042489e5d566a9.

Reported-by: Laura Abbott <labbott@redhat.com>
Fixes: 5dcd8400884c ("macsec: missing dev_put() on error in macsec_newlink()")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macsec.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 9cbb0c8a896a..7de88b33d5b9 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3277,7 +3277,7 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 
 	err = netdev_upper_dev_link(real_dev, dev, extack);
 	if (err < 0)
-		goto put_dev;
+		goto unregister;
 
 	/* need to be already registered so that ->init has run and
 	 * the MAC addr is set
@@ -3316,8 +3316,7 @@ del_dev:
 	macsec_del_dev(macsec);
 unlink:
 	netdev_upper_dev_unlink(real_dev, dev);
-put_dev:
-	dev_put(real_dev);
+unregister:
 	unregister_netdevice(dev);
 	return err;
 }

From 982e05001c472066ab288e4269ad6cab48889f0d Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Mon, 16 Apr 2018 10:07:23 +0200
Subject: [PATCH 133/288] net: mvpp2: Fix TCAM filter reserved range

Marvell's PPv2 controller has a Packet Header parser, which uses a
fixed-size TCAM array of filter entries.

The mvpp2 driver reserves some ranges among the 256 TCAM entries to
perform MAC and VID filtering. The rest of the TCAM ids are freely usable
for other features, such as IPv4 proto matching.

This commit fixes the MVPP2_PE_LAST_FREE_TID define that sets the end of
the "free range", which included the MAC range. This could therefore allow
some other features to use entries dedicated to MAC filtering,
lowering the number of unicast/multicast addresses that could be allowed
before switching to promiscuous mode.

Fixes: 10fea26ce2aa ("net: mvpp2: Add support for unicast filtering")
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 54a038943c06..9deb79b6dcc8 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -663,7 +663,7 @@ enum mvpp2_tag_type {
 #define MVPP2_PE_VID_FILT_RANGE_END     (MVPP2_PRS_TCAM_SRAM_SIZE - 31)
 #define MVPP2_PE_VID_FILT_RANGE_START   (MVPP2_PE_VID_FILT_RANGE_END - \
 					 MVPP2_PRS_VLAN_FILT_RANGE_SIZE + 1)
-#define MVPP2_PE_LAST_FREE_TID          (MVPP2_PE_VID_FILT_RANGE_START - 1)
+#define MVPP2_PE_LAST_FREE_TID          (MVPP2_PE_MAC_RANGE_START - 1)
 #define MVPP2_PE_IP6_EXT_PROTO_UN	(MVPP2_PRS_TCAM_SRAM_SIZE - 30)
 #define MVPP2_PE_IP6_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 29)
 #define MVPP2_PE_IP4_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 28)

From cc5cd5079699c7831fdc58e74352736706c3df3c Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Mon, 9 Apr 2018 23:03:36 +0800
Subject: [PATCH 134/288] xen: xen-pciback: Replace GFP_ATOMIC with GFP_KERNEL
 in pcistub_probe

pcistub_probe() is never called in atomic context.
This function is only set as ".probe" in struct pci_driver.

Despite never getting called from atomic context,
pcistub_probe() calls kmalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xen-pciback/pci_stub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index 9e480fdebe1f..95e6ddd7c402 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -577,7 +577,7 @@ static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)
 		}
 
 		if (!match) {
-			pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_ATOMIC);
+			pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
 			if (!pci_dev_id) {
 				err = -ENOMEM;
 				goto out;

From bb52e3169cb7dd5a9deea39b94342fce36235a5b Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Mon, 9 Apr 2018 23:03:53 +0800
Subject: [PATCH 135/288] xen: xen-pciback: Replace GFP_ATOMIC with GFP_KERNEL
 in pcistub_init_device

pcistub_init_device() is never called in atomic context.

The call chain ending up at pcistub_init_device() is:
[1] pcistub_init_device() <- pcistub_seize() <- pcistub_probe()
[2] pcistub_init_device() <- pcistub_init_devices_late() <-
	xen_pcibk_init()
pcistub_probe() is only set as ".probe" in struct pci_driver.
xen_pcibk_init() is is only set as a parameter of module_init().
These functions are not called in atomic context.

Despite never getting called from atomic context,
pcistub_init_device() calls kzalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xen-pciback/pci_stub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index 95e6ddd7c402..b36fa76a69c7 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -364,7 +364,7 @@ static int pcistub_init_device(struct pci_dev *dev)
 	 * here and then to call kfree(pci_get_drvdata(psdev->dev)).
 	 */
 	dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]")
-				+ strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+				+ strlen(pci_name(dev)) + 1, GFP_KERNEL);
 	if (!dev_data) {
 		err = -ENOMEM;
 		goto out;

From 9eb5f15b47b69847bfceb94350bd68fbdbf829e3 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Mon, 9 Apr 2018 23:04:12 +0800
Subject: [PATCH 136/288] xen: xen-pciback: Replace GFP_ATOMIC with GFP_KERNEL
 in pcistub_device_alloc

pcistub_device_alloc() is never called in atomic context.

The call chain ending up at pcistub_device_alloc() is:
[1] pcistub_device_alloc() <- pcistub_seize() <- pcistub_probe()
pcistub_probe() is only set as ".probe" in struct pci_driver.
This function is not called in atomic context.

Despite never getting called from atomic context,
pcistub_device_alloc() calls kzalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xen-pciback/pci_stub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index b36fa76a69c7..210b93f41385 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -71,7 +71,7 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
 
 	dev_dbg(&dev->dev, "pcistub_device_alloc\n");
 
-	psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+	psdev = kzalloc(sizeof(*psdev), GFP_KERNEL);
 	if (!psdev)
 		return NULL;
 

From 230d211472d2779253e5a8383353fc44783dd038 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Mon, 9 Apr 2018 23:04:25 +0800
Subject: [PATCH 137/288] xen: xen-pciback: Replace GFP_ATOMIC with GFP_KERNEL
 in xen_pcibk_config_quirks_init

xen_pcibk_config_quirks_init() is never called in atomic context.

The call chains ending up at xen_pcibk_config_quirks_init() are:
[1] xen_pcibk_config_quirks_init() <- xen_pcibk_config_init_dev() <-
	pcistub_init_device() <- pcistub_seize() <- pcistub_probe()
[2] xen_pcibk_config_quirks_init() <- xen_pcibk_config_init_dev() <-
	pcistub_init_device() <- pcistub_init_devices_late() <-
	xen_pcibk_init()
pcistub_probe() is only set as ".probe" in struct pci_driver.
xen_pcibk_init() is is only set as a parameter of module_init().
These functions are not called in atomic context.

Despite never getting called from atomic context,
xen_pcibk_config_quirks_init() calls kzalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xen-pciback/conf_space_quirks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c
index 89d9744ece61..ed593d1042a6 100644
--- a/drivers/xen/xen-pciback/conf_space_quirks.c
+++ b/drivers/xen/xen-pciback/conf_space_quirks.c
@@ -95,7 +95,7 @@ int xen_pcibk_config_quirks_init(struct pci_dev *dev)
 	struct xen_pcibk_config_quirk *quirk;
 	int ret = 0;
 
-	quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+	quirk = kzalloc(sizeof(*quirk), GFP_KERNEL);
 	if (!quirk) {
 		ret = -ENOMEM;
 		goto out;

From de3d01fd8549ec0444fc917aab711b3f884930c5 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Wed, 11 Apr 2018 09:15:31 +0800
Subject: [PATCH 138/288] xen: xen-pciback: Replace GFP_ATOMIC with GFP_KERNEL
 in pcistub_reg_add

pcistub_reg_add() is never called in atomic context.

pcistub_reg_add() is only called by pcistub_quirk_add, which is
only set in DRIVER_ATTR().

Despite never getting called from atomic context,
pcistub_reg_add() calls kzalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xen-pciback/pci_stub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index 210b93f41385..59661db144e5 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -1149,7 +1149,7 @@ static int pcistub_reg_add(int domain, int bus, int slot, int func,
 	}
 	dev = psdev->dev;
 
-	field = kzalloc(sizeof(*field), GFP_ATOMIC);
+	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field) {
 		err = -ENOMEM;
 		goto out;

From b8858581febb050688e276b956796bc4a78299ed Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 16 Apr 2018 23:25:19 +1000
Subject: [PATCH 139/288] powerpc/lib: Fix off-by-one in alternate feature
 patching

When we patch an alternate feature section, we have to adjust any
relative branches that branch out of the alternate section.

But currently we have a bug if we have a branch that points to past
the last instruction of the alternate section, eg:

  FTR_SECTION_ELSE
  1:     b       2f
         or      6,6,6
  2:
  ALT_FTR_SECTION_END(...)
         nop

This will result in a relative branch at 1 with a target that equals
the end of the alternate section.

That branch does not need adjusting when it's moved to the non-else
location. Currently we do adjust it, resulting in a branch that goes
off into the link-time location of the else section, which is junk.

The fix is to not patch branches that have a target == end of the
alternate section.

Fixes: d20fe50a7b3c ("KVM: PPC: Book3S HV: Branch inside feature section")
Fixes: 9b1a735de64c ("powerpc: Add logic to patch alternative feature sections")
Cc: stable@vger.kernel.org # v2.6.27+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/feature-fixups.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 35f80ab7cbd8..288fe4f0db4e 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -55,7 +55,7 @@ static int patch_alt_instruction(unsigned int *src, unsigned int *dest,
 		unsigned int *target = (unsigned int *)branch_target(src);
 
 		/* Branch within the section doesn't need translating */
-		if (target < alt_start || target >= alt_end) {
+		if (target < alt_start || target > alt_end) {
 			instr = translate_branch(dest, src);
 			if (!instr)
 				return 1;

From 4fb0534fb7bbc2346ba7d3a072b538007f4135a5 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 13 Apr 2018 13:59:25 +0200
Subject: [PATCH 140/288] team: avoid adding twice the same option to the event
 list

When parsing the options provided by the user space,
team_nl_cmd_options_set() insert them in a temporary list to send
multiple events with a single message.
While each option's attribute is correctly validated, the code does
not check for duplicate entries before inserting into the event
list.

Exploiting the above, the syzbot was able to trigger the following
splat:

kernel BUG at lib/list_debug.c:31!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
    (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 4466 Comm: syzkaller556835 Not tainted 4.16.0+ #17
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__list_add_valid+0xaa/0xb0 lib/list_debug.c:29
RSP: 0018:ffff8801b04bf248 EFLAGS: 00010286
RAX: 0000000000000058 RBX: ffff8801c8fc7a90 RCX: 0000000000000000
RDX: 0000000000000058 RSI: ffffffff815fbf41 RDI: ffffed0036097e3f
RBP: ffff8801b04bf260 R08: ffff8801b0b2a700 R09: ffffed003b604f90
R10: ffffed003b604f90 R11: ffff8801db027c87 R12: ffff8801c8fc7a90
R13: ffff8801c8fc7a90 R14: dffffc0000000000 R15: 0000000000000000
FS:  0000000000b98880(0000) GS:ffff8801db000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000000043fc30 CR3: 00000001afe8e000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  __list_add include/linux/list.h:60 [inline]
  list_add include/linux/list.h:79 [inline]
  team_nl_cmd_options_set+0x9ff/0x12b0 drivers/net/team/team.c:2571
  genl_family_rcv_msg+0x889/0x1120 net/netlink/genetlink.c:599
  genl_rcv_msg+0xc6/0x170 net/netlink/genetlink.c:624
  netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
  genl_rcv+0x28/0x40 net/netlink/genetlink.c:635
  netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
  netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
  netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
  sock_sendmsg_nosec net/socket.c:629 [inline]
  sock_sendmsg+0xd5/0x120 net/socket.c:639
  ___sys_sendmsg+0x805/0x940 net/socket.c:2117
  __sys_sendmsg+0x115/0x270 net/socket.c:2155
  SYSC_sendmsg net/socket.c:2164 [inline]
  SyS_sendmsg+0x29/0x30 net/socket.c:2162
  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x4458b9
RSP: 002b:00007ffd1d4a7278 EFLAGS: 00000213 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 000000000000001b RCX: 00000000004458b9
RDX: 0000000000000010 RSI: 0000000020000d00 RDI: 0000000000000004
RBP: 00000000004a74ed R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000213 R12: 00007ffd1d4a7348
R13: 0000000000402a60 R14: 0000000000000000 R15: 0000000000000000
Code: 75 e8 eb a9 48 89 f7 48 89 75 e8 e8 d1 85 7b fe 48 8b 75 e8 eb bb 48
89 f2 48 89 d9 4c 89 e6 48 c7 c7 a0 84 d8 87 e8 ea 67 28 fe <0f> 0b 0f 1f
40 00 48 b8 00 00 00 00 00 fc ff df 55 48 89 e5 41
RIP: __list_add_valid+0xaa/0xb0 lib/list_debug.c:29 RSP: ffff8801b04bf248

This changeset addresses the avoiding list_add() if the current
option is already present in the event list.

Reported-and-tested-by: syzbot+4d4af685432dc0e56c91@syzkaller.appspotmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Fixes: 2fcdb2c9e659 ("team: allow to send multiple set events in one message")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a6c6ce19eeee..acbe84967834 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -261,6 +261,17 @@ static void __team_option_inst_mark_removed_port(struct team *team,
 	}
 }
 
+static bool __team_option_inst_tmp_find(const struct list_head *opts,
+					const struct team_option_inst *needle)
+{
+	struct team_option_inst *opt_inst;
+
+	list_for_each_entry(opt_inst, opts, tmp_list)
+		if (opt_inst == needle)
+			return true;
+	return false;
+}
+
 static int __team_options_register(struct team *team,
 				   const struct team_option *option,
 				   size_t option_count)
@@ -2568,6 +2579,14 @@ static int team_nl_cmd_options_set(struct sk_buff *skb, struct genl_info *info)
 			if (err)
 				goto team_put;
 			opt_inst->changed = true;
+
+			/* dumb/evil user-space can send us duplicate opt,
+			 * keep only the last one
+			 */
+			if (__team_option_inst_tmp_find(&opt_inst_list,
+							opt_inst))
+				continue;
+
 			list_add(&opt_inst->tmp_list, &opt_inst_list);
 		}
 		if (!opt_found) {

From e7c5a571a8d6a266aee9ca3f3f26e5afe3717eca Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Apr 2018 12:34:24 -0700
Subject: [PATCH 141/288] libnvdimm, dimm: handle EACCES failures from label
 reads

The new support for the standard _LSR and _LSW methods neglected to also
update the nvdimm_init_config_data() and nvdimm_set_config_data() to
return the translated error code from failed commands. This precision is
necessary because the locked status that was previously returned on
ND_CMD_GET_CONFIG_SIZE commands is now returned on
ND_CMD_{GET,SET}_CONFIG_DATA commands.

If the kernel misses this indication it can inadvertently fall back to
label-less mode when it should otherwise avoid all access to locked
regions.

Cc: <stable@vger.kernel.org>
Fixes: 4b27db7e26cd ("acpi, nfit: add support for the _LSI, _LSR, and...")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/dimm_devs.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index e00d45522b80..8d348b22ba45 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -88,9 +88,9 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
 {
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+	int rc = validate_dimm(ndd), cmd_rc = 0;
 	struct nd_cmd_get_config_data_hdr *cmd;
 	struct nvdimm_bus_descriptor *nd_desc;
-	int rc = validate_dimm(ndd);
 	u32 max_cmd_size, config_size;
 	size_t offset;
 
@@ -124,9 +124,11 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
 		cmd->in_offset = offset;
 		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
 				ND_CMD_GET_CONFIG_DATA, cmd,
-				cmd->in_length + sizeof(*cmd), NULL);
-		if (rc || cmd->status) {
-			rc = -ENXIO;
+				cmd->in_length + sizeof(*cmd), &cmd_rc);
+		if (rc < 0)
+			break;
+		if (cmd_rc < 0) {
+			rc = cmd_rc;
 			break;
 		}
 		memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length);
@@ -140,9 +142,9 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
 int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 		void *buf, size_t len)
 {
-	int rc = validate_dimm(ndd);
 	size_t max_cmd_size, buf_offset;
 	struct nd_cmd_set_config_hdr *cmd;
+	int rc = validate_dimm(ndd), cmd_rc = 0;
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
 
@@ -164,7 +166,6 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 	for (buf_offset = 0; len; len -= cmd->in_length,
 			buf_offset += cmd->in_length) {
 		size_t cmd_size;
-		u32 *status;
 
 		cmd->in_offset = offset + buf_offset;
 		cmd->in_length = min(max_cmd_size, len);
@@ -172,12 +173,13 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 
 		/* status is output in the last 4-bytes of the command buffer */
 		cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32);
-		status = ((void *) cmd) + cmd_size - sizeof(u32);
 
 		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
-				ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, NULL);
-		if (rc || *status) {
-			rc = rc ? rc : -ENXIO;
+				ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, &cmd_rc);
+		if (rc < 0)
+			break;
+		if (cmd_rc < 0) {
+			rc = cmd_rc;
 			break;
 		}
 	}

From 55c72ab62e47fc584131901baddb2752e949ebcd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Apr 2018 13:56:43 -0700
Subject: [PATCH 142/288] tools/testing/nvdimm: allow custom error code
 injection

Given that libnvdimm driver stack takes specific actions on DIMM command
error codes like -EACCES, provide a facility to inject custom failures.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 38 +++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index cb166be4918d..dc6cf5630280 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -138,6 +138,7 @@ static u32 handle[] = {
 };
 
 static unsigned long dimm_fail_cmd_flags[NUM_DCR];
+static int dimm_fail_cmd_code[NUM_DCR];
 
 struct nfit_test_fw {
 	enum intel_fw_update_state state;
@@ -892,8 +893,11 @@ static int get_dimm(struct nfit_mem *nfit_mem, unsigned int func)
 	if (i >= ARRAY_SIZE(handle))
 		return -ENXIO;
 
-	if ((1 << func) & dimm_fail_cmd_flags[i])
+	if ((1 << func) & dimm_fail_cmd_flags[i]) {
+		if (dimm_fail_cmd_code[i])
+			return dimm_fail_cmd_code[i];
 		return -EIO;
+	}
 
 	return i;
 }
@@ -1225,8 +1229,40 @@ static ssize_t fail_cmd_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(fail_cmd);
 
+static ssize_t fail_cmd_code_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	int dimm = dimm_name_to_id(dev);
+
+	if (dimm < 0)
+		return dimm;
+
+	return sprintf(buf, "%d\n", dimm_fail_cmd_code[dimm]);
+}
+
+static ssize_t fail_cmd_code_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t size)
+{
+	int dimm = dimm_name_to_id(dev);
+	unsigned long val;
+	ssize_t rc;
+
+	if (dimm < 0)
+		return dimm;
+
+	rc = kstrtol(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	dimm_fail_cmd_code[dimm] = val;
+	return size;
+}
+static DEVICE_ATTR_RW(fail_cmd_code);
+
+
 static struct attribute *nfit_test_dimm_attributes[] = {
 	&dev_attr_fail_cmd.attr,
+	&dev_attr_fail_cmd_code.attr,
 	&dev_attr_handle.attr,
 	NULL,
 };

From 718fda67d2f69cc6074b4b6a740a6e4aacd44eff Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Apr 2018 16:38:01 -0700
Subject: [PATCH 143/288] tools/testing/nvdimm: support nfit_test_dimm
 attributes under nfit_test.1

The nfit_test.1 bus provides a pmem topology without blk-aperture
enabling, so it presents different failure modes for label space
handling. Allow custom DSM command error injection.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 43 +++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index dc6cf5630280..c8c88363311b 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1166,12 +1166,12 @@ static int ars_state_init(struct device *dev, struct ars_state *ars_state)
 
 static void put_dimms(void *data)
 {
-	struct device **dimm_dev = data;
+	struct nfit_test *t = data;
 	int i;
 
-	for (i = 0; i < NUM_DCR; i++)
-		if (dimm_dev[i])
-			device_unregister(dimm_dev[i]);
+	for (i = 0; i < t->num_dcr; i++)
+		if (t->dimm_dev[i])
+			device_unregister(t->dimm_dev[i]);
 }
 
 static struct class *nfit_test_dimm;
@@ -1180,13 +1180,11 @@ static int dimm_name_to_id(struct device *dev)
 {
 	int dimm;
 
-	if (sscanf(dev_name(dev), "test_dimm%d", &dimm) != 1
-			|| dimm >= NUM_DCR || dimm < 0)
+	if (sscanf(dev_name(dev), "test_dimm%d", &dimm) != 1)
 		return -ENXIO;
 	return dimm;
 }
 
-
 static ssize_t handle_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
@@ -1259,7 +1257,6 @@ static ssize_t fail_cmd_code_store(struct device *dev, struct device_attribute *
 }
 static DEVICE_ATTR_RW(fail_cmd_code);
 
-
 static struct attribute *nfit_test_dimm_attributes[] = {
 	&dev_attr_fail_cmd.attr,
 	&dev_attr_fail_cmd_code.attr,
@@ -1276,6 +1273,23 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = {
 	NULL,
 };
 
+static int nfit_test_dimm_init(struct nfit_test *t)
+{
+	int i;
+
+	if (devm_add_action_or_reset(&t->pdev.dev, put_dimms, t))
+		return -ENOMEM;
+	for (i = 0; i < t->num_dcr; i++) {
+		t->dimm_dev[i] = device_create_with_groups(nfit_test_dimm,
+				&t->pdev.dev, 0, NULL,
+				nfit_test_dimm_attribute_groups,
+				"test_dimm%d", i + t->dcr_idx);
+		if (!t->dimm_dev[i])
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 static void smart_init(struct nfit_test *t)
 {
 	int i;
@@ -1371,17 +1385,8 @@ static int nfit_test0_alloc(struct nfit_test *t)
 	if (!t->_fit)
 		return -ENOMEM;
 
-	if (devm_add_action_or_reset(&t->pdev.dev, put_dimms, t->dimm_dev))
+	if (nfit_test_dimm_init(t))
 		return -ENOMEM;
-	for (i = 0; i < NUM_DCR; i++) {
-		t->dimm_dev[i] = device_create_with_groups(nfit_test_dimm,
-				&t->pdev.dev, 0, NULL,
-				nfit_test_dimm_attribute_groups,
-				"test_dimm%d", i);
-		if (!t->dimm_dev[i])
-			return -ENOMEM;
-	}
-
 	smart_init(t);
 	return ars_state_init(&t->pdev.dev, &t->ars_state);
 }
@@ -1413,6 +1418,8 @@ static int nfit_test1_alloc(struct nfit_test *t)
 	if (!t->spa_set[1])
 		return -ENOMEM;
 
+	if (nfit_test_dimm_init(t))
+		return -ENOMEM;
 	smart_init(t);
 	return ars_state_init(&t->pdev.dev, &t->ars_state);
 }

From 19357a685e870eeb825cbb7fd3104082ab041987 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 10 Apr 2018 14:23:47 -0700
Subject: [PATCH 144/288] tools/testing/nvdimm: fix missing newline in
 nfit_test_dimm 'handle' attribute

Sysfs userspace tooling generally expects the kernel to emit a newlines
when reading sysfs attributes.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index c8c88363311b..51fc41c24a77 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1193,7 +1193,7 @@ static ssize_t handle_show(struct device *dev, struct device_attribute *attr,
 	if (dimm < 0)
 		return dimm;
 
-	return sprintf(buf, "%#x", handle[dimm]);
+	return sprintf(buf, "%#x\n", handle[dimm]);
 }
 DEVICE_ATTR_RO(handle);
 

From 9484e12d7999a3c82d8732b60c0149f038bdd985 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Apr 2018 14:29:33 -0700
Subject: [PATCH 145/288] tools/testing/nvdimm: enable labels for nfit_test.1
 dimms

Enable test cases for the kernel's fallback to label-less mode.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 51fc41c24a77..4ea385be528f 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -2265,6 +2265,9 @@ static void nfit_test1_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_INTEL_ENABLE_LSS_STATUS, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,

From bffd168c3fc5cc7d2bad4c668fa90e7a9010db4b Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Sat, 14 Apr 2018 20:44:46 -0400
Subject: [PATCH 146/288] tcp: clear tp->packets_out when purging write queue

Clear tp->packets_out when purging the write queue, otherwise
tcp_rearm_rto() mistakenly assumes TCP write queue is not empty.
This results in NULL pointer dereference.

Also, remove the redundant `tp->packets_out = 0` from
tcp_disconnect(), since tcp_disconnect() calls
tcp_write_queue_purge().

Fixes: a27fd7a8ed38 (tcp: purge write queue upon RST)
Reported-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Reported-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Tested-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4fa3f812b9ff..9ce1c726185e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2368,6 +2368,7 @@ void tcp_write_queue_purge(struct sock *sk)
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
+	tcp_sk(sk)->packets_out = 0;
 }
 
 int tcp_disconnect(struct sock *sk, int flags)
@@ -2417,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 	icsk->icsk_backoff = 0;
 	tp->snd_cwnd = 2;
 	icsk->icsk_probes_out = 0;
-	tp->packets_out = 0;
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;

From f23e0643cd0b53e68e283b6f26194d56c28a2eb1 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Sun, 15 Apr 2018 18:53:36 -0500
Subject: [PATCH 147/288] ibmvnic: Clear pending interrupt after device reset

Due to a firmware bug, the hypervisor can send an interrupt to a
transmit or receive queue just prior to a partition migration, not
allowing the device enough time to handle it and send an EOI. When
the partition migrates, the interrupt is lost but an "EOI-pending"
flag for the interrupt line is still set in firmware. No further
interrupts will be sent until that flag is cleared, effectively
freezing that queue. To workaround this, the driver will disable the
hardware interrupt and send an H_EOI signal prior to re-enabling it.
This will flush the pending EOI and allow the driver to continue
operation.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f4a56a3c07ad..2df01ad98df7 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1049,16 +1049,14 @@ static int __ibmvnic_open(struct net_device *netdev)
 		netdev_dbg(netdev, "Enabling rx_scrq[%d] irq\n", i);
 		if (prev_state == VNIC_CLOSED)
 			enable_irq(adapter->rx_scrq[i]->irq);
-		else
-			enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+		enable_scrq_irq(adapter, adapter->rx_scrq[i]);
 	}
 
 	for (i = 0; i < adapter->req_tx_queues; i++) {
 		netdev_dbg(netdev, "Enabling tx_scrq[%d] irq\n", i);
 		if (prev_state == VNIC_CLOSED)
 			enable_irq(adapter->tx_scrq[i]->irq);
-		else
-			enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
 	}
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
@@ -1199,6 +1197,7 @@ static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter)
 			if (adapter->tx_scrq[i]->irq) {
 				netdev_dbg(netdev,
 					   "Disabling tx_scrq[%d] irq\n", i);
+				disable_scrq_irq(adapter, adapter->tx_scrq[i]);
 				disable_irq(adapter->tx_scrq[i]->irq);
 			}
 	}
@@ -1208,6 +1207,7 @@ static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter)
 			if (adapter->rx_scrq[i]->irq) {
 				netdev_dbg(netdev,
 					   "Disabling rx_scrq[%d] irq\n", i);
+				disable_scrq_irq(adapter, adapter->rx_scrq[i]);
 				disable_irq(adapter->rx_scrq[i]->irq);
 			}
 		}
@@ -2617,12 +2617,19 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter,
 {
 	struct device *dev = &adapter->vdev->dev;
 	unsigned long rc;
+	u64 val;
 
 	if (scrq->hw_irq > 0x100000000ULL) {
 		dev_err(dev, "bad hw_irq = %lx\n", scrq->hw_irq);
 		return 1;
 	}
 
+	val = (0xff000000) | scrq->hw_irq;
+	rc = plpar_hcall_norets(H_EOI, val);
+	if (rc)
+		dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n",
+			val, rc);
+
 	rc = plpar_hcall_norets(H_VIOCTL, adapter->vdev->unit_address,
 				H_ENABLE_VIO_INTERRUPT, scrq->hw_irq, 0, 0);
 	if (rc)

From 5171b37d959641bbc619781caf62e61f7b940871 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 15 Apr 2018 17:52:04 -0700
Subject: [PATCH 148/288] net: af_packet: fix race in PACKET_{R|T}X_RING

In order to remove the race caught by syzbot [1], we need
to lock the socket before using po->tp_version as this could
change under us otherwise.

This means lock_sock() and release_sock() must be done by
packet_set_ring() callers.

[1] :
BUG: KMSAN: uninit-value in packet_set_ring+0x1254/0x3870 net/packet/af_packet.c:4249
CPU: 0 PID: 20195 Comm: syzkaller707632 Not tainted 4.16.0+ #83
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:53
 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676
 packet_set_ring+0x1254/0x3870 net/packet/af_packet.c:4249
 packet_setsockopt+0x12c6/0x5a90 net/packet/af_packet.c:3662
 SYSC_setsockopt+0x4b8/0x570 net/socket.c:1849
 SyS_setsockopt+0x76/0xa0 net/socket.c:1828
 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x449099
RSP: 002b:00007f42b5307ce8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 000000000070003c RCX: 0000000000449099
RDX: 0000000000000005 RSI: 0000000000000107 RDI: 0000000000000003
RBP: 0000000000700038 R08: 000000000000001c R09: 0000000000000000
R10: 00000000200000c0 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000080eecf R14: 00007f42b53089c0 R15: 0000000000000001

Local variable description: ----req_u@packet_setsockopt
Variable was created at:
 packet_setsockopt+0x13f/0x5a90 net/packet/af_packet.c:3612
 SYSC_setsockopt+0x4b8/0x570 net/socket.c:1849

Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 616cb9c18f88..c31b0687396a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3008,6 +3008,7 @@ static int packet_release(struct socket *sock)
 
 	packet_flush_mclist(sk);
 
+	lock_sock(sk);
 	if (po->rx_ring.pg_vec) {
 		memset(&req_u, 0, sizeof(req_u));
 		packet_set_ring(sk, &req_u, 1, 0);
@@ -3017,6 +3018,7 @@ static int packet_release(struct socket *sock)
 		memset(&req_u, 0, sizeof(req_u));
 		packet_set_ring(sk, &req_u, 1, 1);
 	}
+	release_sock(sk);
 
 	f = fanout_release(sk);
 
@@ -3643,6 +3645,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		union tpacket_req_u req_u;
 		int len;
 
+		lock_sock(sk);
 		switch (po->tp_version) {
 		case TPACKET_V1:
 		case TPACKET_V2:
@@ -3653,12 +3656,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 			len = sizeof(req_u.req3);
 			break;
 		}
-		if (optlen < len)
-			return -EINVAL;
-		if (copy_from_user(&req_u.req, optval, len))
-			return -EFAULT;
-		return packet_set_ring(sk, &req_u, 0,
-			optname == PACKET_TX_RING);
+		if (optlen < len) {
+			ret = -EINVAL;
+		} else {
+			if (copy_from_user(&req_u.req, optval, len))
+				ret = -EFAULT;
+			else
+				ret = packet_set_ring(sk, &req_u, 0,
+						    optname == PACKET_TX_RING);
+		}
+		release_sock(sk);
+		return ret;
 	}
 	case PACKET_COPY_THRESH:
 	{
@@ -4208,8 +4216,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	/* Added to avoid minimal code churn */
 	struct tpacket_req *req = &req_u->req;
 
-	lock_sock(sk);
-
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
 
@@ -4347,7 +4353,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
-	release_sock(sk);
 	return err;
 }
 

From e79f245ddec17bbd89d73cd0169dba4be46c9b55 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Sat, 14 Apr 2018 05:10:52 +0200
Subject: [PATCH 149/288] X86/KVM: Properly update 'tsc_offset' to represent
 the running guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update 'tsc_offset' on vmentry/vmexit of L2 guests to ensure that it always
captures the TSC_OFFSET of the running guest whether it is the L1 or L2
guest.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Jim Mattson <jmattson@google.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
[AMD changes, fix update_ia32_tsc_adjust_msr. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c              | 17 ++++++++++-
 arch/x86/kvm/vmx.c              | 54 +++++++++++++++++++++------------
 arch/x86/kvm/x86.c              |  6 ++--
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 949c977bc4c9..c25775fad4ed 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
+	u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b3ebc8ad6891..e77a536d0b7c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1424,12 +1424,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (is_guest_mode(vcpu))
+		return svm->nested.hsave->control.tsc_offset;
+
+	return vcpu->arch.tsc_offset;
+}
+
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 g_tsc_offset = 0;
 
 	if (is_guest_mode(vcpu)) {
+		/* Write L1's TSC offset.  */
 		g_tsc_offset = svm->vmcb->control.tsc_offset -
 			       svm->nested.hsave->control.tsc_offset;
 		svm->nested.hsave->control.tsc_offset = offset;
@@ -3323,6 +3334,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Restore the original control entries */
 	copy_vmcb_control_area(vmcb, hsave);
 
+	svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -3483,10 +3495,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	/* We don't want to see VMMCALLs from a nested guest */
 	clr_intercept(svm, INTERCEPT_VMMCALL);
 
+	svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
 	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -7102,6 +7116,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
+	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
 
 	.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a13c603bdefb..7207e6cc07c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2874,6 +2874,17 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (is_guest_mode(vcpu) &&
+	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+	return vcpu->arch.tsc_offset;
+}
+
 /*
  * reads and returns guest's timestamp counter "register"
  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
@@ -11175,11 +11186,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 	}
 
-	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-		vmcs_write64(TSC_OFFSET,
-			vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-	else
-		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
 
@@ -11427,6 +11435,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 msr_entry_idx;
 	u32 exit_qual;
+	int r;
 
 	enter_guest_mode(vcpu);
 
@@ -11436,26 +11445,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_segment_cache_clear(vmx);
 
-	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-		leave_guest_mode(vcpu);
-		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-		nested_vmx_entry_failure(vcpu, vmcs12,
-					 EXIT_REASON_INVALID_STATE, exit_qual);
-		return 1;
-	}
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+	r = EXIT_REASON_INVALID_STATE;
+	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+		goto fail;
 
 	nested_get_vmcs12_pages(vcpu, vmcs12);
 
+	r = EXIT_REASON_MSR_LOAD_FAIL;
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
 					    vmcs12->vm_entry_msr_load_count);
-	if (msr_entry_idx) {
-		leave_guest_mode(vcpu);
-		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-		nested_vmx_entry_failure(vcpu, vmcs12,
-				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-		return 1;
-	}
+	if (msr_entry_idx)
+		goto fail;
 
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11464,6 +11468,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
 	 */
 	return 0;
+
+fail:
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+	leave_guest_mode(vcpu);
+	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+	nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+	return 1;
 }
 
 /*
@@ -12035,6 +12047,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	leave_guest_mode(vcpu);
 
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
 	if (likely(!vmx->fail)) {
 		if (exit_reason == -1)
 			sync_vmcs12(vcpu, vmcs12);
@@ -12725,6 +12740,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 
 	.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0334b250e102..3f3fba58c960 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1490,7 +1490,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = vcpu->arch.tsc_offset;
+	u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1532,7 +1532,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+	return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 

From dd259935e4eec844dc3e5b8a7cd951cd658b4fb6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 13 Apr 2018 11:38:35 +0200
Subject: [PATCH 150/288] kvm: x86: move MSR_IA32_TSC handling to x86.c

This is not specific to Intel/AMD anymore.  The TSC offset is available
in vcpu->arch.tsc_offset.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c |  9 ---------
 arch/x86/kvm/vmx.c | 20 --------------------
 arch/x86/kvm/x86.c |  6 ++++++
 3 files changed, 6 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e77a536d0b7c..80ea26274a54 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4050,12 +4050,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	switch (msr_info->index) {
-	case MSR_IA32_TSC: {
-		msr_info->data = svm->vmcb->control.tsc_offset +
-			kvm_scale_tsc(vcpu, rdtsc());
-
-		break;
-	}
 	case MSR_STAR:
 		msr_info->data = svm->vmcb->save.star;
 		break;
@@ -4208,9 +4202,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		svm->vmcb->save.g_pat = data;
 		mark_dirty(svm->vmcb, VMCB_NPT);
 		break;
-	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, msr);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7207e6cc07c1..5b49ad448ad6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2885,20 +2885,6 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 	return vcpu->arch.tsc_offset;
 }
 
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
-{
-	u64 host_tsc, tsc_offset;
-
-	host_tsc = rdtsc();
-	tsc_offset = vmcs_read64(TSC_OFFSET);
-	return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
-}
-
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -3529,9 +3515,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 #endif
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_info);
-	case MSR_IA32_TSC:
-		msr_info->data = guest_read_tsc(vcpu);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
@@ -3651,9 +3634,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
-	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, msr_info);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f3fba58c960..51ecd381793b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2364,6 +2364,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.smbase = data;
 		break;
+	case MSR_IA32_TSC:
+		kvm_write_tsc(vcpu, msr_info);
+		break;
 	case MSR_SMI_COUNT:
 		if (!msr_info->host_initiated)
 			return 1;
@@ -2607,6 +2610,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_UCODE_REV:
 		msr_info->data = vcpu->arch.microcode_version;
 		break;
+	case MSR_IA32_TSC:
+		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+		break;
 	case MSR_MTRRcap:
 	case 0x200 ... 0x2ff:
 		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);

From d5edb7f8e7ab9fd5fd54a77d957b1733f117a813 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 27 Mar 2018 22:46:11 +0200
Subject: [PATCH 151/288] kvm: selftests: add vmx_tsc_adjust_test

The test checks the behavior of setting MSR_IA32_TSC in a nested guest,
and the TSC_OFFSET VMCS field in general.  It also introduces the testing
infrastructure for Intel nested virtualization.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile          |   3 +-
 .../testing/selftests/kvm/include/kvm_util.h  |  15 +-
 tools/testing/selftests/kvm/include/vmx.h     | 494 ++++++++++++++++++
 tools/testing/selftests/kvm/lib/kvm_util.c    |  18 +-
 tools/testing/selftests/kvm/lib/vmx.c         | 243 +++++++++
 .../selftests/kvm/vmx_tsc_adjust_test.c       | 231 ++++++++
 6 files changed, 991 insertions(+), 13 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/vmx.h
 create mode 100644 tools/testing/selftests/kvm/lib/vmx.c
 create mode 100644 tools/testing/selftests/kvm/vmx_tsc_adjust_test.c

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1da541e1ab75..2ddcc96ae456 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -4,10 +4,11 @@ top_srcdir = ../../../../
 UNAME_M := $(shell uname -m)
 
 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
-LIBKVM_x86_64 = lib/x86.c
+LIBKVM_x86_64 = lib/x86.c lib/vmx.c
 
 TEST_GEN_PROGS_x86_64 = set_sregs_test
 TEST_GEN_PROGS_x86_64 += sync_regs_test
+TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 57974ad46373..637b7017b6ee 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -112,24 +112,27 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
 	vm_paddr_t paddr_min, uint32_t memslot);
 
-void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid);
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
 void vcpu_set_cpuid(
 	struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
 
-struct kvm_cpuid2 *allocate_kvm_cpuid2(void);
 struct kvm_cpuid_entry2 *
-find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
-		       uint32_t index);
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
 
 static inline struct kvm_cpuid_entry2 *
-find_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function)
+kvm_get_supported_cpuid_entry(uint32_t function)
 {
-	return find_cpuid_index_entry(cpuid, function, 0);
+	return kvm_get_supported_cpuid_index(function, 0);
 }
 
 struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code);
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
 
+typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr,
+				 vm_paddr_t vmxon_paddr,
+				 vm_vaddr_t vmcs_vaddr,
+				 vm_paddr_t vmcs_paddr);
+
 struct kvm_userspace_memory_region *
 kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
 				 uint64_t end);
diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/vmx.h
new file mode 100644
index 000000000000..6ed8499807fd
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/vmx.h
@@ -0,0 +1,494 @@
+/*
+ * tools/testing/selftests/kvm/include/vmx.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#ifndef SELFTEST_KVM_VMX_H
+#define SELFTEST_KVM_VMX_H
+
+#include <stdint.h>
+#include "x86.h"
+
+#define CPUID_VMX_BIT				5
+
+#define CPUID_VMX				(1 << 5)
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING		0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING		0x00000008
+#define CPU_BASED_HLT_EXITING			0x00000080
+#define CPU_BASED_INVLPG_EXITING		0x00000200
+#define CPU_BASED_MWAIT_EXITING			0x00000400
+#define CPU_BASED_RDPMC_EXITING			0x00000800
+#define CPU_BASED_RDTSC_EXITING			0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING		0x00008000
+#define CPU_BASED_CR3_STORE_EXITING		0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING		0x00080000
+#define CPU_BASED_CR8_STORE_EXITING		0x00100000
+#define CPU_BASED_TPR_SHADOW			0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING		0x00400000
+#define CPU_BASED_MOV_DR_EXITING		0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING		0x01000000
+#define CPU_BASED_USE_IO_BITMAPS		0x02000000
+#define CPU_BASED_MONITOR_TRAP			0x08000000
+#define CPU_BASED_USE_MSR_BITMAPS		0x10000000
+#define CPU_BASED_MONITOR_EXITING		0x20000000
+#define CPU_BASED_PAUSE_EXITING			0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS	0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR	0x0401e172
+
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT		0x00000002
+#define SECONDARY_EXEC_DESC			0x00000004
+#define SECONDARY_EXEC_RDTSCP			0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE	0x00000010
+#define SECONDARY_EXEC_ENABLE_VPID		0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT	0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY	0x00000200
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
+#define SECONDARY_EXEC_RDRAND_EXITING		0x00000800
+#define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC		0x00002000
+#define SECONDARY_EXEC_SHADOW_VMCS		0x00004000
+#define SECONDARY_EXEC_RDSEED_EXITING		0x00010000
+#define SECONDARY_EXEC_ENABLE_PML		0x00020000
+#define SECONDARY_EPT_VE			0x00040000
+#define SECONDARY_ENABLE_XSAV_RESTORE		0x00100000
+#define SECONDARY_EXEC_TSC_SCALING		0x02000000
+
+#define PIN_BASED_EXT_INTR_MASK			0x00000001
+#define PIN_BASED_NMI_EXITING			0x00000008
+#define PIN_BASED_VIRTUAL_NMIS			0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER		0x00000040
+#define PIN_BASED_POSTED_INTR			0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR	0x00000016
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS		0x00000004
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE		0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL	0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT		0x00008000
+#define VM_EXIT_SAVE_IA32_PAT			0x00040000
+#define VM_EXIT_LOAD_IA32_PAT			0x00080000
+#define VM_EXIT_SAVE_IA32_EFER			0x00100000
+#define VM_EXIT_LOAD_IA32_EFER			0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER	0x00400000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS		0x00000004
+#define VM_ENTRY_IA32E_MODE			0x00000200
+#define VM_ENTRY_SMM				0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR		0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL	0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT			0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER			0x00008000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR	0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK	0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA			0x00000020
+
+#define EXIT_REASON_FAILED_VMENTRY	0x80000000
+#define EXIT_REASON_EXCEPTION_NMI	0
+#define EXIT_REASON_EXTERNAL_INTERRUPT	1
+#define EXIT_REASON_TRIPLE_FAULT	2
+#define EXIT_REASON_PENDING_INTERRUPT	7
+#define EXIT_REASON_NMI_WINDOW		8
+#define EXIT_REASON_TASK_SWITCH		9
+#define EXIT_REASON_CPUID		10
+#define EXIT_REASON_HLT			12
+#define EXIT_REASON_INVD		13
+#define EXIT_REASON_INVLPG		14
+#define EXIT_REASON_RDPMC		15
+#define EXIT_REASON_RDTSC		16
+#define EXIT_REASON_VMCALL		18
+#define EXIT_REASON_VMCLEAR		19
+#define EXIT_REASON_VMLAUNCH		20
+#define EXIT_REASON_VMPTRLD		21
+#define EXIT_REASON_VMPTRST		22
+#define EXIT_REASON_VMREAD		23
+#define EXIT_REASON_VMRESUME		24
+#define EXIT_REASON_VMWRITE		25
+#define EXIT_REASON_VMOFF		26
+#define EXIT_REASON_VMON		27
+#define EXIT_REASON_CR_ACCESS		28
+#define EXIT_REASON_DR_ACCESS		29
+#define EXIT_REASON_IO_INSTRUCTION	30
+#define EXIT_REASON_MSR_READ		31
+#define EXIT_REASON_MSR_WRITE		32
+#define EXIT_REASON_INVALID_STATE	33
+#define EXIT_REASON_MWAIT_INSTRUCTION	36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION	40
+#define EXIT_REASON_MCE_DURING_VMENTRY	41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS		44
+#define EXIT_REASON_EOI_INDUCED		45
+#define EXIT_REASON_EPT_VIOLATION	48
+#define EXIT_REASON_EPT_MISCONFIG	49
+#define EXIT_REASON_INVEPT		50
+#define EXIT_REASON_RDTSCP		51
+#define EXIT_REASON_PREEMPTION_TIMER	52
+#define EXIT_REASON_INVVPID		53
+#define EXIT_REASON_WBINVD		54
+#define EXIT_REASON_XSETBV		55
+#define EXIT_REASON_APIC_WRITE		56
+#define EXIT_REASON_INVPCID		58
+#define EXIT_REASON_PML_FULL		62
+#define EXIT_REASON_XSAVES		63
+#define EXIT_REASON_XRSTORS		64
+#define LAST_EXIT_REASON		64
+
+enum vmcs_field {
+	VIRTUAL_PROCESSOR_ID		= 0x00000000,
+	POSTED_INTR_NV			= 0x00000002,
+	GUEST_ES_SELECTOR		= 0x00000800,
+	GUEST_CS_SELECTOR		= 0x00000802,
+	GUEST_SS_SELECTOR		= 0x00000804,
+	GUEST_DS_SELECTOR		= 0x00000806,
+	GUEST_FS_SELECTOR		= 0x00000808,
+	GUEST_GS_SELECTOR		= 0x0000080a,
+	GUEST_LDTR_SELECTOR		= 0x0000080c,
+	GUEST_TR_SELECTOR		= 0x0000080e,
+	GUEST_INTR_STATUS		= 0x00000810,
+	GUEST_PML_INDEX			= 0x00000812,
+	HOST_ES_SELECTOR		= 0x00000c00,
+	HOST_CS_SELECTOR		= 0x00000c02,
+	HOST_SS_SELECTOR		= 0x00000c04,
+	HOST_DS_SELECTOR		= 0x00000c06,
+	HOST_FS_SELECTOR		= 0x00000c08,
+	HOST_GS_SELECTOR		= 0x00000c0a,
+	HOST_TR_SELECTOR		= 0x00000c0c,
+	IO_BITMAP_A			= 0x00002000,
+	IO_BITMAP_A_HIGH		= 0x00002001,
+	IO_BITMAP_B			= 0x00002002,
+	IO_BITMAP_B_HIGH		= 0x00002003,
+	MSR_BITMAP			= 0x00002004,
+	MSR_BITMAP_HIGH			= 0x00002005,
+	VM_EXIT_MSR_STORE_ADDR		= 0x00002006,
+	VM_EXIT_MSR_STORE_ADDR_HIGH	= 0x00002007,
+	VM_EXIT_MSR_LOAD_ADDR		= 0x00002008,
+	VM_EXIT_MSR_LOAD_ADDR_HIGH	= 0x00002009,
+	VM_ENTRY_MSR_LOAD_ADDR		= 0x0000200a,
+	VM_ENTRY_MSR_LOAD_ADDR_HIGH	= 0x0000200b,
+	PML_ADDRESS			= 0x0000200e,
+	PML_ADDRESS_HIGH		= 0x0000200f,
+	TSC_OFFSET			= 0x00002010,
+	TSC_OFFSET_HIGH			= 0x00002011,
+	VIRTUAL_APIC_PAGE_ADDR		= 0x00002012,
+	VIRTUAL_APIC_PAGE_ADDR_HIGH	= 0x00002013,
+	APIC_ACCESS_ADDR		= 0x00002014,
+	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR		= 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH	= 0x00002017,
+	EPT_POINTER			= 0x0000201a,
+	EPT_POINTER_HIGH		= 0x0000201b,
+	EOI_EXIT_BITMAP0		= 0x0000201c,
+	EOI_EXIT_BITMAP0_HIGH		= 0x0000201d,
+	EOI_EXIT_BITMAP1		= 0x0000201e,
+	EOI_EXIT_BITMAP1_HIGH		= 0x0000201f,
+	EOI_EXIT_BITMAP2		= 0x00002020,
+	EOI_EXIT_BITMAP2_HIGH		= 0x00002021,
+	EOI_EXIT_BITMAP3		= 0x00002022,
+	EOI_EXIT_BITMAP3_HIGH		= 0x00002023,
+	VMREAD_BITMAP			= 0x00002026,
+	VMREAD_BITMAP_HIGH		= 0x00002027,
+	VMWRITE_BITMAP			= 0x00002028,
+	VMWRITE_BITMAP_HIGH		= 0x00002029,
+	XSS_EXIT_BITMAP			= 0x0000202C,
+	XSS_EXIT_BITMAP_HIGH		= 0x0000202D,
+	TSC_MULTIPLIER			= 0x00002032,
+	TSC_MULTIPLIER_HIGH		= 0x00002033,
+	GUEST_PHYSICAL_ADDRESS		= 0x00002400,
+	GUEST_PHYSICAL_ADDRESS_HIGH	= 0x00002401,
+	VMCS_LINK_POINTER		= 0x00002800,
+	VMCS_LINK_POINTER_HIGH		= 0x00002801,
+	GUEST_IA32_DEBUGCTL		= 0x00002802,
+	GUEST_IA32_DEBUGCTL_HIGH	= 0x00002803,
+	GUEST_IA32_PAT			= 0x00002804,
+	GUEST_IA32_PAT_HIGH		= 0x00002805,
+	GUEST_IA32_EFER			= 0x00002806,
+	GUEST_IA32_EFER_HIGH		= 0x00002807,
+	GUEST_IA32_PERF_GLOBAL_CTRL	= 0x00002808,
+	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+	GUEST_PDPTR0			= 0x0000280a,
+	GUEST_PDPTR0_HIGH		= 0x0000280b,
+	GUEST_PDPTR1			= 0x0000280c,
+	GUEST_PDPTR1_HIGH		= 0x0000280d,
+	GUEST_PDPTR2			= 0x0000280e,
+	GUEST_PDPTR2_HIGH		= 0x0000280f,
+	GUEST_PDPTR3			= 0x00002810,
+	GUEST_PDPTR3_HIGH		= 0x00002811,
+	GUEST_BNDCFGS			= 0x00002812,
+	GUEST_BNDCFGS_HIGH		= 0x00002813,
+	HOST_IA32_PAT			= 0x00002c00,
+	HOST_IA32_PAT_HIGH		= 0x00002c01,
+	HOST_IA32_EFER			= 0x00002c02,
+	HOST_IA32_EFER_HIGH		= 0x00002c03,
+	HOST_IA32_PERF_GLOBAL_CTRL	= 0x00002c04,
+	HOST_IA32_PERF_GLOBAL_CTRL_HIGH	= 0x00002c05,
+	PIN_BASED_VM_EXEC_CONTROL	= 0x00004000,
+	CPU_BASED_VM_EXEC_CONTROL	= 0x00004002,
+	EXCEPTION_BITMAP		= 0x00004004,
+	PAGE_FAULT_ERROR_CODE_MASK	= 0x00004006,
+	PAGE_FAULT_ERROR_CODE_MATCH	= 0x00004008,
+	CR3_TARGET_COUNT		= 0x0000400a,
+	VM_EXIT_CONTROLS		= 0x0000400c,
+	VM_EXIT_MSR_STORE_COUNT		= 0x0000400e,
+	VM_EXIT_MSR_LOAD_COUNT		= 0x00004010,
+	VM_ENTRY_CONTROLS		= 0x00004012,
+	VM_ENTRY_MSR_LOAD_COUNT		= 0x00004014,
+	VM_ENTRY_INTR_INFO_FIELD	= 0x00004016,
+	VM_ENTRY_EXCEPTION_ERROR_CODE	= 0x00004018,
+	VM_ENTRY_INSTRUCTION_LEN	= 0x0000401a,
+	TPR_THRESHOLD			= 0x0000401c,
+	SECONDARY_VM_EXEC_CONTROL	= 0x0000401e,
+	PLE_GAP				= 0x00004020,
+	PLE_WINDOW			= 0x00004022,
+	VM_INSTRUCTION_ERROR		= 0x00004400,
+	VM_EXIT_REASON			= 0x00004402,
+	VM_EXIT_INTR_INFO		= 0x00004404,
+	VM_EXIT_INTR_ERROR_CODE		= 0x00004406,
+	IDT_VECTORING_INFO_FIELD	= 0x00004408,
+	IDT_VECTORING_ERROR_CODE	= 0x0000440a,
+	VM_EXIT_INSTRUCTION_LEN		= 0x0000440c,
+	VMX_INSTRUCTION_INFO		= 0x0000440e,
+	GUEST_ES_LIMIT			= 0x00004800,
+	GUEST_CS_LIMIT			= 0x00004802,
+	GUEST_SS_LIMIT			= 0x00004804,
+	GUEST_DS_LIMIT			= 0x00004806,
+	GUEST_FS_LIMIT			= 0x00004808,
+	GUEST_GS_LIMIT			= 0x0000480a,
+	GUEST_LDTR_LIMIT		= 0x0000480c,
+	GUEST_TR_LIMIT			= 0x0000480e,
+	GUEST_GDTR_LIMIT		= 0x00004810,
+	GUEST_IDTR_LIMIT		= 0x00004812,
+	GUEST_ES_AR_BYTES		= 0x00004814,
+	GUEST_CS_AR_BYTES		= 0x00004816,
+	GUEST_SS_AR_BYTES		= 0x00004818,
+	GUEST_DS_AR_BYTES		= 0x0000481a,
+	GUEST_FS_AR_BYTES		= 0x0000481c,
+	GUEST_GS_AR_BYTES		= 0x0000481e,
+	GUEST_LDTR_AR_BYTES		= 0x00004820,
+	GUEST_TR_AR_BYTES		= 0x00004822,
+	GUEST_INTERRUPTIBILITY_INFO	= 0x00004824,
+	GUEST_ACTIVITY_STATE		= 0X00004826,
+	GUEST_SYSENTER_CS		= 0x0000482A,
+	VMX_PREEMPTION_TIMER_VALUE	= 0x0000482E,
+	HOST_IA32_SYSENTER_CS		= 0x00004c00,
+	CR0_GUEST_HOST_MASK		= 0x00006000,
+	CR4_GUEST_HOST_MASK		= 0x00006002,
+	CR0_READ_SHADOW			= 0x00006004,
+	CR4_READ_SHADOW			= 0x00006006,
+	CR3_TARGET_VALUE0		= 0x00006008,
+	CR3_TARGET_VALUE1		= 0x0000600a,
+	CR3_TARGET_VALUE2		= 0x0000600c,
+	CR3_TARGET_VALUE3		= 0x0000600e,
+	EXIT_QUALIFICATION		= 0x00006400,
+	GUEST_LINEAR_ADDRESS		= 0x0000640a,
+	GUEST_CR0			= 0x00006800,
+	GUEST_CR3			= 0x00006802,
+	GUEST_CR4			= 0x00006804,
+	GUEST_ES_BASE			= 0x00006806,
+	GUEST_CS_BASE			= 0x00006808,
+	GUEST_SS_BASE			= 0x0000680a,
+	GUEST_DS_BASE			= 0x0000680c,
+	GUEST_FS_BASE			= 0x0000680e,
+	GUEST_GS_BASE			= 0x00006810,
+	GUEST_LDTR_BASE			= 0x00006812,
+	GUEST_TR_BASE			= 0x00006814,
+	GUEST_GDTR_BASE			= 0x00006816,
+	GUEST_IDTR_BASE			= 0x00006818,
+	GUEST_DR7			= 0x0000681a,
+	GUEST_RSP			= 0x0000681c,
+	GUEST_RIP			= 0x0000681e,
+	GUEST_RFLAGS			= 0x00006820,
+	GUEST_PENDING_DBG_EXCEPTIONS	= 0x00006822,
+	GUEST_SYSENTER_ESP		= 0x00006824,
+	GUEST_SYSENTER_EIP		= 0x00006826,
+	HOST_CR0			= 0x00006c00,
+	HOST_CR3			= 0x00006c02,
+	HOST_CR4			= 0x00006c04,
+	HOST_FS_BASE			= 0x00006c06,
+	HOST_GS_BASE			= 0x00006c08,
+	HOST_TR_BASE			= 0x00006c0a,
+	HOST_GDTR_BASE			= 0x00006c0c,
+	HOST_IDTR_BASE			= 0x00006c0e,
+	HOST_IA32_SYSENTER_ESP		= 0x00006c10,
+	HOST_IA32_SYSENTER_EIP		= 0x00006c12,
+	HOST_RSP			= 0x00006c14,
+	HOST_RIP			= 0x00006c16,
+};
+
+struct vmx_msr_entry {
+	uint32_t index;
+	uint32_t reserved;
+	uint64_t value;
+} __attribute__ ((aligned(16)));
+
+static inline int vmxon(uint64_t phys)
+{
+	uint8_t ret;
+
+	__asm__ __volatile__ ("vmxon %[pa]; setna %[ret]"
+		: [ret]"=rm"(ret)
+		: [pa]"m"(phys)
+		: "cc", "memory");
+
+	return ret;
+}
+
+static inline void vmxoff(void)
+{
+	__asm__ __volatile__("vmxoff");
+}
+
+static inline int vmclear(uint64_t vmcs_pa)
+{
+	uint8_t ret;
+
+	__asm__ __volatile__ ("vmclear %[pa]; setna %[ret]"
+		: [ret]"=rm"(ret)
+		: [pa]"m"(vmcs_pa)
+		: "cc", "memory");
+
+	return ret;
+}
+
+static inline int vmptrld(uint64_t vmcs_pa)
+{
+	uint8_t ret;
+
+	__asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]"
+		: [ret]"=rm"(ret)
+		: [pa]"m"(vmcs_pa)
+		: "cc", "memory");
+
+	return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmlaunch.
+ */
+static inline int vmlaunch(void)
+{
+	int ret;
+
+	__asm__ __volatile__("push %%rbp;"
+			     "push %%rcx;"
+			     "push %%rdx;"
+			     "push %%rsi;"
+			     "push %%rdi;"
+			     "push $0;"
+			     "vmwrite %%rsp, %[host_rsp];"
+			     "lea 1f(%%rip), %%rax;"
+			     "vmwrite %%rax, %[host_rip];"
+			     "vmlaunch;"
+			     "incq (%%rsp);"
+			     "1: pop %%rax;"
+			     "pop %%rdi;"
+			     "pop %%rsi;"
+			     "pop %%rdx;"
+			     "pop %%rcx;"
+			     "pop %%rbp;"
+			     : [ret]"=&a"(ret)
+			     : [host_rsp]"r"((uint64_t)HOST_RSP),
+			       [host_rip]"r"((uint64_t)HOST_RIP)
+			     : "memory", "cc", "rbx", "r8", "r9", "r10",
+			       "r11", "r12", "r13", "r14", "r15");
+	return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ */
+static inline int vmresume(void)
+{
+	int ret;
+
+	__asm__ __volatile__("push %%rbp;"
+			     "push %%rcx;"
+			     "push %%rdx;"
+			     "push %%rsi;"
+			     "push %%rdi;"
+			     "push $0;"
+			     "vmwrite %%rsp, %[host_rsp];"
+			     "lea 1f(%%rip), %%rax;"
+			     "vmwrite %%rax, %[host_rip];"
+			     "vmresume;"
+			     "incq (%%rsp);"
+			     "1: pop %%rax;"
+			     "pop %%rdi;"
+			     "pop %%rsi;"
+			     "pop %%rdx;"
+			     "pop %%rcx;"
+			     "pop %%rbp;"
+			     : [ret]"=&a"(ret)
+			     : [host_rsp]"r"((uint64_t)HOST_RSP),
+			       [host_rip]"r"((uint64_t)HOST_RIP)
+			     : "memory", "cc", "rbx", "r8", "r9", "r10",
+			       "r11", "r12", "r13", "r14", "r15");
+	return ret;
+}
+
+static inline int vmread(uint64_t encoding, uint64_t *value)
+{
+	uint64_t tmp;
+	uint8_t ret;
+
+	__asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]"
+		: [value]"=rm"(tmp), [ret]"=rm"(ret)
+		: [encoding]"r"(encoding)
+		: "cc", "memory");
+
+	*value = tmp;
+	return ret;
+}
+
+/*
+ * A wrapper around vmread that ignores errors and returns zero if the
+ * vmread instruction fails.
+ */
+static inline uint64_t vmreadz(uint64_t encoding)
+{
+	uint64_t value = 0;
+	vmread(encoding, &value);
+	return value;
+}
+
+static inline int vmwrite(uint64_t encoding, uint64_t value)
+{
+	uint8_t ret;
+
+	__asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]"
+		: [ret]"=rm"(ret)
+		: [value]"rm"(value), [encoding]"r"(encoding)
+		: "cc", "memory");
+
+	return ret;
+}
+
+static inline uint32_t vmcs_revision(void)
+{
+	return rdmsr(MSR_IA32_VMX_BASIC);
+}
+
+void prepare_for_vmx_operation(void);
+void prepare_vmcs(void *guest_rip, void *guest_rsp);
+struct kvm_vm *vm_create_default_vmx(uint32_t vcpuid,
+				     vmx_guest_code_t guest_code);
+
+#endif /* !SELFTEST_KVM_VMX_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index e213d513dc61..2cedfda181d4 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -378,7 +378,7 @@ int kvm_memcmp_hva_gva(void *hva,
  * complicated. This function uses a reasonable default length for
  * the array and performs the appropriate allocation.
  */
-struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
 {
 	struct kvm_cpuid2 *cpuid;
 	int nent = 100;
@@ -402,17 +402,21 @@ struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
  * Input Args: None
  *
  * Output Args:
- *   cpuid - The supported KVM CPUID
  *
- * Return: void
+ * Return: The supported KVM CPUID
  *
  * Get the guest CPUID supported by KVM.
  */
-void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
 {
+	static struct kvm_cpuid2 *cpuid;
 	int ret;
 	int kvm_fd;
 
+	if (cpuid)
+		return cpuid;
+
+	cpuid = allocate_kvm_cpuid2();
 	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
 	TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
 		KVM_DEV_PATH, kvm_fd, errno);
@@ -422,6 +426,7 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
 		    ret, errno);
 
 	close(kvm_fd);
+	return cpuid;
 }
 
 /* Locate a cpuid entry.
@@ -435,12 +440,13 @@ void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
  * Return: A pointer to the cpuid entry. Never returns NULL.
  */
 struct kvm_cpuid_entry2 *
-find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
-		       uint32_t index)
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 {
+	struct kvm_cpuid2 *cpuid;
 	struct kvm_cpuid_entry2 *entry = NULL;
 	int i;
 
+	cpuid = kvm_get_supported_cpuid();
 	for (i = 0; i < cpuid->nent; i++) {
 		if (cpuid->entries[i].function == function &&
 		    cpuid->entries[i].index == index) {
diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/vmx.c
new file mode 100644
index 000000000000..0231bc0aae7b
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/vmx.c
@@ -0,0 +1,243 @@
+/*
+ * tools/testing/selftests/kvm/lib/x86.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+#include "vmx.h"
+
+/* Create a default VM for VMX tests.
+ *
+ * Input Args:
+ *   vcpuid - The id of the single VCPU to add to the VM.
+ *   guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *
+vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code)
+{
+	struct kvm_cpuid2 *cpuid;
+	struct kvm_vm *vm;
+	vm_vaddr_t vmxon_vaddr;
+	vm_paddr_t vmxon_paddr;
+	vm_vaddr_t vmcs_vaddr;
+	vm_paddr_t vmcs_paddr;
+
+	vm = vm_create_default(vcpuid, (void *) guest_code);
+
+	/* Enable nesting in CPUID */
+	vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
+
+	/* Setup of a region of guest memory for the vmxon region. */
+	vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
+	vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr);
+
+	/* Setup of a region of guest memory for a vmcs. */
+	vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
+	vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr);
+
+	vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr,
+		      vmcs_paddr);
+
+	return vm;
+}
+
+void prepare_for_vmx_operation(void)
+{
+	uint64_t feature_control;
+	uint64_t required;
+	unsigned long cr0;
+	unsigned long cr4;
+
+	/*
+	 * Ensure bits in CR0 and CR4 are valid in VMX operation:
+	 * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.
+	 * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
+	 */
+	__asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory");
+	cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1);
+	cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0);
+	__asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory");
+
+	__asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory");
+	cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1);
+	cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0);
+	/* Enable VMX operation */
+	cr4 |= X86_CR4_VMXE;
+	__asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory");
+
+	/*
+	 * Configure IA32_FEATURE_CONTROL MSR to allow VMXON:
+	 *  Bit 0: Lock bit. If clear, VMXON causes a #GP.
+	 *  Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON
+	 *    outside of SMX causes a #GP.
+	 */
+	required = FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	required |= FEATURE_CONTROL_LOCKED;
+	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+	if ((feature_control & required) != required)
+		wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required);
+}
+
+/*
+ * Initialize the control fields to the most basic settings possible.
+ */
+static inline void init_vmcs_control_fields(void)
+{
+	vmwrite(VIRTUAL_PROCESSOR_ID, 0);
+	vmwrite(POSTED_INTR_NV, 0);
+
+	vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS));
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS));
+	vmwrite(EXCEPTION_BITMAP, 0);
+	vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+	vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
+	vmwrite(CR3_TARGET_COUNT, 0);
+	vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) |
+		VM_EXIT_HOST_ADDR_SPACE_SIZE);	  /* 64-bit host */
+	vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) |
+		VM_ENTRY_IA32E_MODE);		  /* 64-bit guest */
+	vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+	vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+	vmwrite(TPR_THRESHOLD, 0);
+	vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
+
+	vmwrite(CR0_GUEST_HOST_MASK, 0);
+	vmwrite(CR4_GUEST_HOST_MASK, 0);
+	vmwrite(CR0_READ_SHADOW, get_cr0());
+	vmwrite(CR4_READ_SHADOW, get_cr4());
+}
+
+/*
+ * Initialize the host state fields based on the current host state, with
+ * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch
+ * or vmresume.
+ */
+static inline void init_vmcs_host_state(void)
+{
+	uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS);
+
+	vmwrite(HOST_ES_SELECTOR, get_es());
+	vmwrite(HOST_CS_SELECTOR, get_cs());
+	vmwrite(HOST_SS_SELECTOR, get_ss());
+	vmwrite(HOST_DS_SELECTOR, get_ds());
+	vmwrite(HOST_FS_SELECTOR, get_fs());
+	vmwrite(HOST_GS_SELECTOR, get_gs());
+	vmwrite(HOST_TR_SELECTOR, get_tr());
+
+	if (exit_controls & VM_EXIT_LOAD_IA32_PAT)
+		vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT));
+	if (exit_controls & VM_EXIT_LOAD_IA32_EFER)
+		vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER));
+	if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		vmwrite(HOST_IA32_PERF_GLOBAL_CTRL,
+			rdmsr(MSR_CORE_PERF_GLOBAL_CTRL));
+
+	vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS));
+
+	vmwrite(HOST_CR0, get_cr0());
+	vmwrite(HOST_CR3, get_cr3());
+	vmwrite(HOST_CR4, get_cr4());
+	vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE));
+	vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE));
+	vmwrite(HOST_TR_BASE,
+		get_desc64_base((struct desc64 *)(get_gdt_base() + get_tr())));
+	vmwrite(HOST_GDTR_BASE, get_gdt_base());
+	vmwrite(HOST_IDTR_BASE, get_idt_base());
+	vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP));
+	vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP));
+}
+
+/*
+ * Initialize the guest state fields essentially as a clone of
+ * the host state fields. Some host state fields have fixed
+ * values, and we set the corresponding guest state fields accordingly.
+ */
+static inline void init_vmcs_guest_state(void *rip, void *rsp)
+{
+	vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
+	vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
+	vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
+	vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
+	vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
+	vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
+	vmwrite(GUEST_LDTR_SELECTOR, 0);
+	vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
+	vmwrite(GUEST_INTR_STATUS, 0);
+	vmwrite(GUEST_PML_INDEX, 0);
+
+	vmwrite(VMCS_LINK_POINTER, -1ll);
+	vmwrite(GUEST_IA32_DEBUGCTL, 0);
+	vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
+	vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
+	vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
+		vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
+
+	vmwrite(GUEST_ES_LIMIT, -1);
+	vmwrite(GUEST_CS_LIMIT, -1);
+	vmwrite(GUEST_SS_LIMIT, -1);
+	vmwrite(GUEST_DS_LIMIT, -1);
+	vmwrite(GUEST_FS_LIMIT, -1);
+	vmwrite(GUEST_GS_LIMIT, -1);
+	vmwrite(GUEST_LDTR_LIMIT, -1);
+	vmwrite(GUEST_TR_LIMIT, 0x67);
+	vmwrite(GUEST_GDTR_LIMIT, 0xffff);
+	vmwrite(GUEST_IDTR_LIMIT, 0xffff);
+	vmwrite(GUEST_ES_AR_BYTES,
+		vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
+	vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
+	vmwrite(GUEST_SS_AR_BYTES, 0xc093);
+	vmwrite(GUEST_DS_AR_BYTES,
+		vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+	vmwrite(GUEST_FS_AR_BYTES,
+		vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+	vmwrite(GUEST_GS_AR_BYTES,
+		vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+	vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
+	vmwrite(GUEST_TR_AR_BYTES, 0x8b);
+	vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+	vmwrite(GUEST_ACTIVITY_STATE, 0);
+	vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
+	vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
+
+	vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
+	vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
+	vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
+	vmwrite(GUEST_ES_BASE, 0);
+	vmwrite(GUEST_CS_BASE, 0);
+	vmwrite(GUEST_SS_BASE, 0);
+	vmwrite(GUEST_DS_BASE, 0);
+	vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
+	vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
+	vmwrite(GUEST_LDTR_BASE, 0);
+	vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
+	vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
+	vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
+	vmwrite(GUEST_DR7, 0x400);
+	vmwrite(GUEST_RSP, (uint64_t)rsp);
+	vmwrite(GUEST_RIP, (uint64_t)rip);
+	vmwrite(GUEST_RFLAGS, 2);
+	vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+	vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
+	vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
+}
+
+void prepare_vmcs(void *guest_rip, void *guest_rsp)
+{
+	init_vmcs_control_fields();
+	init_vmcs_host_state();
+	init_vmcs_guest_state(guest_rip, guest_rsp);
+}
diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
new file mode 100644
index 000000000000..8f7f62093add
--- /dev/null
+++ b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
@@ -0,0 +1,231 @@
+/*
+ * gtests/tests/vmx_tsc_adjust_test.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ *
+ * IA32_TSC_ADJUST test
+ *
+ * According to the SDM, "if an execution of WRMSR to the
+ * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC,
+ * the logical processor also adds (or subtracts) value X from the
+ * IA32_TSC_ADJUST MSR.
+ *
+ * Note that when L1 doesn't intercept writes to IA32_TSC, a
+ * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC
+ * value.
+ *
+ * This test verifies that this unusual case is handled correctly.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#ifndef MSR_IA32_TSC_ADJUST
+#define MSR_IA32_TSC_ADJUST 0x3b
+#endif
+
+#define PAGE_SIZE	4096
+#define VCPU_ID		5
+
+#define TSC_ADJUST_VALUE (1ll << 32)
+#define TSC_OFFSET_VALUE -(1ll << 48)
+
+enum {
+	PORT_ABORT = 0x1000,
+	PORT_REPORT,
+	PORT_DONE,
+};
+
+struct vmx_page {
+	vm_vaddr_t virt;
+	vm_paddr_t phys;
+};
+
+enum {
+	VMXON_PAGE = 0,
+	VMCS_PAGE,
+	MSR_BITMAP_PAGE,
+
+	NUM_VMX_PAGES,
+};
+
+struct kvm_single_msr {
+	struct kvm_msrs header;
+	struct kvm_msr_entry entry;
+} __attribute__((packed));
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+/* Array of vmx_page descriptors that is shared with the guest. */
+struct vmx_page *vmx_pages;
+
+#define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg))
+static void do_exit_to_l0(uint16_t port, unsigned long arg)
+{
+	__asm__ __volatile__("in %[port], %%al"
+		:
+		: [port]"d"(port), "D"(arg)
+		: "rax");
+}
+
+
+#define GUEST_ASSERT(_condition) do {					     \
+	if (!(_condition))						     \
+		exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition); \
+} while (0)
+
+static void check_ia32_tsc_adjust(int64_t max)
+{
+	int64_t adjust;
+
+	adjust = rdmsr(MSR_IA32_TSC_ADJUST);
+	exit_to_l0(PORT_REPORT, adjust);
+	GUEST_ASSERT(adjust <= max);
+}
+
+static void l2_guest_code(void)
+{
+	uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE;
+
+	wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE);
+	check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+	/* Exit to L1 */
+	__asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_page *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	uint32_t control;
+	uintptr_t save_cr3;
+
+	GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE);
+	wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
+	check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+
+	prepare_for_vmx_operation();
+
+	/* Enter VMX root operation. */
+	*(uint32_t *)vmx_pages[VMXON_PAGE].virt = vmcs_revision();
+	GUEST_ASSERT(!vmxon(vmx_pages[VMXON_PAGE].phys));
+
+	/* Load a VMCS. */
+	*(uint32_t *)vmx_pages[VMCS_PAGE].virt = vmcs_revision();
+	GUEST_ASSERT(!vmclear(vmx_pages[VMCS_PAGE].phys));
+	GUEST_ASSERT(!vmptrld(vmx_pages[VMCS_PAGE].phys));
+
+	/* Prepare the VMCS for L2 execution. */
+	prepare_vmcs(l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+	vmwrite(MSR_BITMAP, vmx_pages[MSR_BITMAP_PAGE].phys);
+	vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE);
+
+	/* Jump into L2.  First, test failure to load guest CR3.  */
+	save_cr3 = vmreadz(GUEST_CR3);
+	vmwrite(GUEST_CR3, -1ull);
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) ==
+		     (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE));
+	check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+	vmwrite(GUEST_CR3, save_cr3);
+
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+	exit_to_l0(PORT_DONE, 0);
+}
+
+static void allocate_vmx_page(struct vmx_page *page)
+{
+	vm_vaddr_t virt;
+
+	virt = vm_vaddr_alloc(vm, PAGE_SIZE, 0, 0, 0);
+	memset(addr_gva2hva(vm, virt), 0, PAGE_SIZE);
+
+	page->virt = virt;
+	page->phys = addr_gva2gpa(vm, virt);
+}
+
+static vm_vaddr_t allocate_vmx_pages(void)
+{
+	vm_vaddr_t vmx_pages_vaddr;
+	int i;
+
+	vmx_pages_vaddr = vm_vaddr_alloc(
+		vm, sizeof(struct vmx_page) * NUM_VMX_PAGES, 0, 0, 0);
+
+	vmx_pages = (void *) addr_gva2hva(vm, vmx_pages_vaddr);
+
+	for (i = 0; i < NUM_VMX_PAGES; i++)
+		allocate_vmx_page(&vmx_pages[i]);
+
+	return vmx_pages_vaddr;
+}
+
+void report(int64_t val)
+{
+	printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
+	       val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE);
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t vmx_pages_vaddr;
+	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+
+	if (!(entry->ecx & CPUID_VMX)) {
+		printf("nested VMX not enabled, skipping test");
+		return 0;
+	}
+
+	vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code);
+
+	/* Allocate VMX pages and shared descriptors (vmx_pages). */
+	vmx_pages_vaddr = allocate_vmx_pages();
+	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_vaddr);
+
+	for (;;) {
+		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+		struct kvm_regs regs;
+
+		vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Got exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		vcpu_regs_get(vm, VCPU_ID, &regs);
+
+		switch (run->io.port) {
+		case PORT_ABORT:
+			TEST_ASSERT(false, "%s", (const char *) regs.rdi);
+			/* NOT REACHED */
+		case PORT_REPORT:
+			report(regs.rdi);
+			break;
+		case PORT_DONE:
+			goto done;
+		default:
+			TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port);
+		}
+	}
+
+	kvm_vm_free(vm);
+done:
+	return 0;
+}

From b8e47d87be65aec931846ced9a34a22d2021c311 Mon Sep 17 00:00:00 2001
From: Ramalingam C <ramalingam.c@intel.com>
Date: Thu, 5 Apr 2018 17:33:22 +0530
Subject: [PATCH 152/288] drm: Fix HDCP downstream dev count read

In both HDMI and DP, device count is represented by 6:0 bits of a
register(BInfo/Bstatus)

So macro for bitmasking the device_count is fixed(0x3F->0x7F).

v3:
  Retained the Rb-ed.
v4:
  %s/drm\/i915/drm [rodrigo]
v5:
  Added "Fixes:" and HDCP keyword in subject [Rodrigo, Sean Paul]

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
Fixes: 495eb7f877ab drm: Add some HDCP related #defines
cc: Sean Paul <seanpaul@chromium.org>
Reviewed-by: Sean Paul <seanpaul@chromium.org>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/1522929802-22850-1-git-send-email-ramalingam.c@intel.com
---
 include/drm/drm_hdcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index 562fa7df2637..98e63d870139 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -19,7 +19,7 @@
 #define DRM_HDCP_RI_LEN				2
 #define DRM_HDCP_V_PRIME_PART_LEN		4
 #define DRM_HDCP_V_PRIME_NUM_PARTS		5
-#define DRM_HDCP_NUM_DOWNSTREAM(x)		(x & 0x3f)
+#define DRM_HDCP_NUM_DOWNSTREAM(x)		(x & 0x7f)
 #define DRM_HDCP_MAX_CASCADE_EXCEEDED(x)	(x & BIT(3))
 #define DRM_HDCP_MAX_DEVICE_EXCEEDED(x)		(x & BIT(7))
 

From 9783ccd0f2507cbe3c5ff1cb84bf6ae3a512d17d Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Mon, 16 Apr 2018 10:16:45 +0800
Subject: [PATCH 153/288] net: Fix one possible memleak in ip_setup_cork

It would allocate memory in this function when the cork->opt is NULL. But
the memory isn't freed if failed in the latter rt check, and return error
directly. It causes the memleak if its caller is ip_make_skb which also
doesn't free the cork->opt when meet a error.

Now move the rt check ahead to avoid the memleak.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..83c73bab2c3d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1109,6 +1109,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
+	rt = *rtp;
+	if (unlikely(!rt))
+		return -EFAULT;
+
 	/*
 	 * setup for corking.
 	 */
@@ -1124,9 +1128,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 		cork->flags |= IPCORK_OPT;
 		cork->addr = ipc->addr;
 	}
-	rt = *rtp;
-	if (unlikely(!rt))
-		return -EFAULT;
+
 	/*
 	 * We steal reference to this route, caller should not release it
 	 */

From e6f39e87b6439939a14cb7fdd94086a082b63b87 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Mon, 16 Apr 2018 11:43:57 +0200
Subject: [PATCH 154/288] x86/ldt: Fix support_pte_mask filtering in
 map_ldt_struct()

The |= operator will let us end up with an invalid PTE. Use
the correct &= instead.

[ The bug was also independently reported by Shuah Khan ]

Fixes: fb43d6cb91ef ('x86/mm: Do not auto-massage page protections')
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/ldt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d41d896481b8..c9b14020f4dd 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -166,7 +166,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 		 */
 		pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
 		/* Filter out unsuppored __PAGE_KERNEL* bits: */
-		pgprot_val(pte_prot) |= __supported_pte_mask;
+		pgprot_val(pte_prot) &= __supported_pte_mask;
 		pte = pfn_pte(pfn, pte_prot);
 		set_pte_at(mm, va, ptep, pte);
 		pte_unmap_unlock(ptep, ptl);

From e86281e700cca8a773f9a572fa406adf2784ba5c Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Wed, 28 Mar 2018 23:41:52 +0000
Subject: [PATCH 155/288] eCryptfs: don't pass up plaintext names when using
 filename encryption

Both ecryptfs_filldir() and ecryptfs_readlink_lower() use
ecryptfs_decode_and_decrypt_filename() to translate lower filenames to
upper filenames. The function correctly passes up lower filenames,
unchanged, when filename encryption isn't in use. However, it was also
passing up lower filenames when the filename wasn't encrypted or
when decryption failed. Since 88ae4ab9802e, eCryptfs refuses to lookup
lower plaintext names when filename encryption is enabled so this
resulted in a situation where userspace would see lower plaintext
filenames in calls to getdents(2) but then not be able to lookup those
filenames.

An example of this can be seen when enabling filename encryption on an
eCryptfs mount at the root directory of an Ext4 filesystem:

$ ls -1i /lower
12 ECRYPTFS_FNEK_ENCRYPTED.FWYZD8TcW.5FV-TKTEYOHsheiHX9a-w.NURCCYIMjI8pn5BDB9-h3fXwrE--
11 lost+found
$ ls -1i /upper
ls: cannot access '/upper/lost+found': No such file or directory
 ? lost+found
12 test

With this change, the lower lost+found dentry is ignored:

$ ls -1i /lower
12 ECRYPTFS_FNEK_ENCRYPTED.FWYZD8TcW.5FV-TKTEYOHsheiHX9a-w.NURCCYIMjI8pn5BDB9-h3fXwrE--
11 lost+found
$ ls -1i /upper
12 test

Additionally, some potentially noisy error/info messages in the related
code paths are turned into debug messages so that the logs can't be
easily filled.

Fixes: 88ae4ab9802e ("ecryptfs_lookup(): try either only encrypted or plaintext name")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
---
 fs/ecryptfs/crypto.c | 41 ++++++++++++++++++++++++++++-------------
 fs/ecryptfs/file.c   | 21 ++++++++++++++++-----
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 846ca150d52e..4dd842f72846 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1997,6 +1997,16 @@ out:
 	return rc;
 }
 
+static bool is_dot_dotdot(const char *name, size_t name_size)
+{
+	if (name_size == 1 && name[0] == '.')
+		return true;
+	else if (name_size == 2 && name[0] == '.' && name[1] == '.')
+		return true;
+
+	return false;
+}
+
 /**
  * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
  * @plaintext_name: The plaintext name
@@ -2021,13 +2031,21 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 	size_t packet_size;
 	int rc = 0;
 
-	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
-	    && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
-	    && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
-	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
-			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
-		const char *orig_name = name;
-		size_t orig_name_size = name_size;
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) &&
+	    !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)) {
+		if (is_dot_dotdot(name, name_size)) {
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    name, name_size);
+			goto out;
+		}
+
+		if (name_size <= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE ||
+		    strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			    ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)) {
+			rc = -EINVAL;
+			goto out;
+		}
 
 		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
 		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
@@ -2047,12 +2065,9 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 						  decoded_name,
 						  decoded_name_size);
 		if (rc) {
-			printk(KERN_INFO "%s: Could not parse tag 70 packet "
-			       "from filename; copying through filename "
-			       "as-is\n", __func__);
-			rc = ecryptfs_copy_filename(plaintext_name,
-						    plaintext_name_size,
-						    orig_name, orig_name_size);
+			ecryptfs_printk(KERN_DEBUG,
+					"%s: Could not parse tag 70 packet from filename\n",
+					__func__);
 			goto out_free;
 		}
 	} else {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c74ed3ca3372..b76a9853325e 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -82,17 +82,28 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
 						  buf->sb, lower_name,
 						  lower_namelen);
 	if (rc) {
-		printk(KERN_ERR "%s: Error attempting to decode and decrypt "
-		       "filename [%s]; rc = [%d]\n", __func__, lower_name,
-		       rc);
-		goto out;
+		if (rc != -EINVAL) {
+			ecryptfs_printk(KERN_DEBUG,
+					"%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n",
+					__func__, lower_name, rc);
+			return rc;
+		}
+
+		/* Mask -EINVAL errors as these are most likely due a plaintext
+		 * filename present in the lower filesystem despite filename
+		 * encryption being enabled. One unavoidable example would be
+		 * the "lost+found" dentry in the root directory of an Ext4
+		 * filesystem.
+		 */
+		return 0;
 	}
+
 	buf->caller->pos = buf->ctx.pos;
 	rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
 	kfree(name);
 	if (!rc)
 		buf->entries_written++;
-out:
+
 	return rc;
 }
 

From 8a8158c85e1e774a44fbe81106fa41138580dfd1 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Thu, 29 Mar 2018 10:28:23 +0100
Subject: [PATCH 156/288] MIPS: memset.S: EVA & fault support for small_memset

The MIPS kernel memset / bzero implementation includes a small_memset
branch which is used when the region to be set is smaller than a long (4
bytes on 32bit, 8 bytes on 64bit). The current small_memset
implementation uses a simple store byte loop to write the destination.
There are 2 issues with this implementation:

1. When EVA mode is active, user and kernel address spaces may overlap.
Currently the use of the sb instruction means kernel mode addressing is
always used and an intended write to userspace may actually overwrite
some critical kernel data.

2. If the write triggers a page fault, for example by calling
__clear_user(NULL, 2), instead of gracefully handling the fault, an OOPS
is triggered.

Fix these issues by replacing the sb instruction with the EX() macro,
which will emit EVA compatible instuctions as required. Additionally
implement a fault fixup for small_memset which sets a2 to the number of
bytes that could not be cleared (as defined by __clear_user).

Reported-by: Chuanhua Lei <chuanhua.lei@intel.com>
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: stable@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/18975/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/lib/memset.S | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
index a1456664d6c2..90bcdf1224ee 100644
--- a/arch/mips/lib/memset.S
+++ b/arch/mips/lib/memset.S
@@ -219,7 +219,7 @@
 1:	PTR_ADDIU	a0, 1			/* fill bytewise */
 	R10KCBARRIER(0(ra))
 	bne		t1, a0, 1b
-	sb		a1, -1(a0)
+	 EX(sb, a1, -1(a0), .Lsmall_fixup\@)
 
 2:	jr		ra			/* done */
 	move		a2, zero
@@ -260,6 +260,11 @@
 	jr		ra
 	andi		v1, a2, STORMASK
 
+.Lsmall_fixup\@:
+	PTR_SUBU	a2, t1, a0
+	jr		ra
+	 PTR_ADDIU	a2, 1
+
 	.endm
 
 /*

From ec518f21cb1a1b1f8a516499ea05c60299e04963 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 08:29:42 -0700
Subject: [PATCH 157/288] tipc: add policy for TIPC_NLA_NET_ADDR

Before syzbot/KMSAN bites, add the missing policy for TIPC_NLA_NET_ADDR

Fixes: 27c21416727a ("tipc: add net set to new netlink api")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index b76f13f6fea1..d4e0bbeee727 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -79,7 +79,8 @@ const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
 
 const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
 	[TIPC_NLA_NET_UNSPEC]		= { .type = NLA_UNSPEC },
-	[TIPC_NLA_NET_ID]		= { .type = NLA_U32 }
+	[TIPC_NLA_NET_ID]		= { .type = NLA_U32 },
+	[TIPC_NLA_NET_ADDR]		= { .type = NLA_U32 },
 };
 
 const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {

From c6404122cb18f1fbd2a6dc85ab687f6fa2e454cf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 08:29:43 -0700
Subject: [PATCH 158/288] tipc: fix possible crash in __tipc_nl_net_set()

syzbot reported a crash in __tipc_nl_net_set() caused by NULL dereference.

We need to check that both TIPC_NLA_NET_NODEID and TIPC_NLA_NET_NODEID_W1
are present.

We also need to make sure userland provided u64 attributes.

Fixes: d50ccc2d3909 ("tipc: add 128-bit node identifier")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/net.c     | 2 ++
 net/tipc/netlink.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/tipc/net.c b/net/tipc/net.c
index 856f9e97ea29..4fbaa0464405 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -252,6 +252,8 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 		u64 *w0 = (u64 *)&node_id[0];
 		u64 *w1 = (u64 *)&node_id[8];
 
+		if (!attrs[TIPC_NLA_NET_NODEID_W1])
+			return -EINVAL;
 		*w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
 		*w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
 		tipc_net_init(net, node_id, 0);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index d4e0bbeee727..6ff2254088f6 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -81,6 +81,8 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
 	[TIPC_NLA_NET_UNSPEC]		= { .type = NLA_UNSPEC },
 	[TIPC_NLA_NET_ID]		= { .type = NLA_U32 },
 	[TIPC_NLA_NET_ADDR]		= { .type = NLA_U32 },
+	[TIPC_NLA_NET_NODEID]		= { .type = NLA_U64 },
+	[TIPC_NLA_NET_NODEID_W1]	= { .type = NLA_U64 },
 };
 
 const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {

From 5968a70d7af5f2abbd9d9f9c8e86da51f0a6b16d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 16 Apr 2018 12:32:55 -0700
Subject: [PATCH 159/288] textsearch: fix kernel-doc warnings and add
 kernel-api section

Make lib/textsearch.c usable as kernel-doc.
Add textsearch() function family to kernel-api documentation.
Fix kernel-doc warnings in <linux/textsearch.h>:
  ../include/linux/textsearch.h:65: warning: Incorrect use of kernel-doc format:
	* get_next_block - fetch next block of data
  ../include/linux/textsearch.h:82: warning: Incorrect use of kernel-doc format:
	* finish - finalize/clean a series of get_next_block() calls

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/core-api/kernel-api.rst | 13 +++++++++
 include/linux/textsearch.h            |  4 +--
 lib/textsearch.c                      | 40 +++++++++++++++------------
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index ff335f8aeb39..92f30006adae 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -136,6 +136,19 @@ Sorting
 .. kernel-doc:: lib/list_sort.c
    :export:
 
+Text Searching
+--------------
+
+.. kernel-doc:: lib/textsearch.c
+   :doc: ts_intro
+
+.. kernel-doc:: lib/textsearch.c
+   :export:
+
+.. kernel-doc:: include/linux/textsearch.h
+   :functions: textsearch_find textsearch_next \
+               textsearch_get_pattern textsearch_get_pattern_len
+
 UUID/GUID
 ---------
 
diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h
index 0494db3fd9e8..13770cfe33ad 100644
--- a/include/linux/textsearch.h
+++ b/include/linux/textsearch.h
@@ -62,7 +62,7 @@ struct ts_config
 	int 			flags;
 
 	/**
-	 * get_next_block - fetch next block of data
+	 * @get_next_block: fetch next block of data
 	 * @consumed: number of bytes consumed by the caller
 	 * @dst: destination buffer
 	 * @conf: search configuration
@@ -79,7 +79,7 @@ struct ts_config
 						  struct ts_state *state);
 
 	/**
-	 * finish - finalize/clean a series of get_next_block() calls
+	 * @finish: finalize/clean a series of get_next_block() calls
 	 * @conf: search configuration
 	 * @state: search state
 	 *
diff --git a/lib/textsearch.c b/lib/textsearch.c
index 0b79908dfe89..5939549c0e7b 100644
--- a/lib/textsearch.c
+++ b/lib/textsearch.c
@@ -10,7 +10,10 @@
  * 		Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * ==========================================================================
- *
+ */
+
+/**
+ * DOC: ts_intro
  * INTRODUCTION
  *
  *   The textsearch infrastructure provides text searching facilities for
@@ -19,7 +22,9 @@
  *
  * ARCHITECTURE
  *
- *      User
+ * .. code-block:: none
+ *
+ *     User
  *     +----------------+
  *     |        finish()|<--------------(6)-----------------+
  *     |get_next_block()|<--------------(5)---------------+ |
@@ -33,21 +38,21 @@
  *     |             (3)|----->| find()/next() |-----------+          |
  *     |             (7)|----->| destroy()     |----------------------+
  *     +----------------+      +---------------+
- *  
- *   (1) User configures a search by calling _prepare() specifying the
- *       search parameters such as the pattern and algorithm name.
+ *
+ *   (1) User configures a search by calling textsearch_prepare() specifying
+ *       the search parameters such as the pattern and algorithm name.
  *   (2) Core requests the algorithm to allocate and initialize a search
  *       configuration according to the specified parameters.
- *   (3) User starts the search(es) by calling _find() or _next() to
- *       fetch subsequent occurrences. A state variable is provided
- *       to the algorithm to store persistent variables.
+ *   (3) User starts the search(es) by calling textsearch_find() or
+ *       textsearch_next() to fetch subsequent occurrences. A state variable
+ *       is provided to the algorithm to store persistent variables.
  *   (4) Core eventually resets the search offset and forwards the find()
  *       request to the algorithm.
  *   (5) Algorithm calls get_next_block() provided by the user continuously
  *       to fetch the data to be searched in block by block.
  *   (6) Algorithm invokes finish() after the last call to get_next_block
  *       to clean up any leftovers from get_next_block. (Optional)
- *   (7) User destroys the configuration by calling _destroy().
+ *   (7) User destroys the configuration by calling textsearch_destroy().
  *   (8) Core notifies the algorithm to destroy algorithm specific
  *       allocations. (Optional)
  *
@@ -62,9 +67,10 @@
  *   amount of times and even in parallel as long as a separate struct
  *   ts_state variable is provided to every instance.
  *
- *   The actual search is performed by either calling textsearch_find_-
- *   continuous() for linear data or by providing an own get_next_block()
- *   implementation and calling textsearch_find(). Both functions return
+ *   The actual search is performed by either calling
+ *   textsearch_find_continuous() for linear data or by providing
+ *   an own get_next_block() implementation and
+ *   calling textsearch_find(). Both functions return
  *   the position of the first occurrence of the pattern or UINT_MAX if
  *   no match was found. Subsequent occurrences can be found by calling
  *   textsearch_next() regardless of the linearity of the data.
@@ -72,7 +78,7 @@
  *   Once you're done using a configuration it must be given back via
  *   textsearch_destroy.
  *
- * EXAMPLE
+ * EXAMPLE::
  *
  *   int pos;
  *   struct ts_config *conf;
@@ -87,13 +93,13 @@
  *       goto errout;
  *   }
  *
- *   pos = textsearch_find_continuous(conf, &state, example, strlen(example));
+ *   pos = textsearch_find_continuous(conf, \&state, example, strlen(example));
  *   if (pos != UINT_MAX)
- *       panic("Oh my god, dancing chickens at %d\n", pos);
+ *       panic("Oh my god, dancing chickens at \%d\n", pos);
  *
  *   textsearch_destroy(conf);
- * ==========================================================================
  */
+/* ========================================================================== */
 
 #include <linux/module.h>
 #include <linux/types.h>
@@ -225,7 +231,7 @@ static unsigned int get_linear_data(unsigned int consumed, const u8 **dst,
  *
  * Returns the position of first occurrence of the pattern or
  * %UINT_MAX if no occurrence was found.
- */ 
+ */
 unsigned int textsearch_find_continuous(struct ts_config *conf,
 					struct ts_state *state,
 					const void *data, unsigned int len)

From 5c8dad48e4f53d6fd0a7e4f95d7c1c983374de88 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 13 Apr 2018 11:55:13 -0700
Subject: [PATCH 160/288] trace_kprobe: Remove warning message "Could not
 insert probe at..."

This warning message is not very helpful, as the return value should
already show information about the error. Also, this message will
spam dmesg if the user space does testing in a loop, like:

    for x in {0..5}
    do
        echo p:xx xx+$x >> /sys/kernel/debug/tracing/kprobe_events
    done

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Song Liu <songliubraving@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Link: http://lkml.kernel.org/r/20180413185513.3626052-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace_kprobe.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1cd3fb4d70f8..02aed76e0978 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
 	if (ret == 0)
 		tk->tp.flags |= TP_FLAG_REGISTERED;
 	else {
-		pr_warn("Could not insert probe at %s+%lu: %d\n",
-			trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
 		if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
 			pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
 			ret = 0;

From b11954a6971baa5842e24e6c0dcf56f117249638 Mon Sep 17 00:00:00 2001
From: Daniel Stone <daniels@collabora.com>
Date: Fri, 30 Mar 2018 15:11:30 +0100
Subject: [PATCH 161/288] drm/exynos: Move GEM BOs to drm_framebuffer

Since drm_framebuffer can now store GEM objects directly, place them
there rather than in our own subclass. As this makes the framebuffer
create_handle and destroy functions the same as the GEM framebuffer
helper, we can reuse those.

Signed-off-by: Daniel Stone <daniels@collabora.com>
Signed-off-by: Inki Dae <inki.dae@samsung.com>
Cc: Inki Dae <inki.dae@samsung.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/gpu/drm/exynos/exynos_drm_fb.c | 39 +++-----------------------
 1 file changed, 4 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c
index 0faaf829f5bf..f28ce493e314 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_fb.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c
@@ -18,6 +18,7 @@
 #include <drm/drm_fb_helper.h>
 #include <drm/drm_atomic.h>
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_gem_framebuffer_helper.h>
 #include <uapi/drm/exynos_drm.h>
 
 #include "exynos_drm_drv.h"
@@ -36,7 +37,6 @@
  */
 struct exynos_drm_fb {
 	struct drm_framebuffer	fb;
-	struct exynos_drm_gem	*exynos_gem[MAX_FB_BUFFER];
 	dma_addr_t			dma_addr[MAX_FB_BUFFER];
 };
 
@@ -66,40 +66,9 @@ static int check_fb_gem_memory_type(struct drm_device *drm_dev,
 	return 0;
 }
 
-static void exynos_drm_fb_destroy(struct drm_framebuffer *fb)
-{
-	struct exynos_drm_fb *exynos_fb = to_exynos_fb(fb);
-	unsigned int i;
-
-	drm_framebuffer_cleanup(fb);
-
-	for (i = 0; i < ARRAY_SIZE(exynos_fb->exynos_gem); i++) {
-		struct drm_gem_object *obj;
-
-		if (exynos_fb->exynos_gem[i] == NULL)
-			continue;
-
-		obj = &exynos_fb->exynos_gem[i]->base;
-		drm_gem_object_unreference_unlocked(obj);
-	}
-
-	kfree(exynos_fb);
-	exynos_fb = NULL;
-}
-
-static int exynos_drm_fb_create_handle(struct drm_framebuffer *fb,
-					struct drm_file *file_priv,
-					unsigned int *handle)
-{
-	struct exynos_drm_fb *exynos_fb = to_exynos_fb(fb);
-
-	return drm_gem_handle_create(file_priv,
-				     &exynos_fb->exynos_gem[0]->base, handle);
-}
-
 static const struct drm_framebuffer_funcs exynos_drm_fb_funcs = {
-	.destroy	= exynos_drm_fb_destroy,
-	.create_handle	= exynos_drm_fb_create_handle,
+	.destroy	= drm_gem_fb_destroy,
+	.create_handle	= drm_gem_fb_create_handle,
 };
 
 struct drm_framebuffer *
@@ -121,7 +90,7 @@ exynos_drm_framebuffer_init(struct drm_device *dev,
 		if (ret < 0)
 			goto err;
 
-		exynos_fb->exynos_gem[i] = exynos_gem[i];
+		exynos_fb->fb.obj[i] = &exynos_gem[i]->base;
 		exynos_fb->dma_addr[i] = exynos_gem[i]->dma_addr
 						+ mode_cmd->offsets[i];
 	}

From 7b30508f5116574f94b50c71d3da1089d145e603 Mon Sep 17 00:00:00 2001
From: Daniel Stone <daniels@collabora.com>
Date: Fri, 30 Mar 2018 15:11:31 +0100
Subject: [PATCH 162/288] drm/exynos: Move dma_addr out of exynos_drm_fb

This can be calculated from the GEM BO DMA address as well as the offset
stored in the base framebuffer.

Signed-off-by: Daniel Stone <daniels@collabora.com>
Signed-off-by: Inki Dae <inki.dae@samsung.com>
Cc: Inki Dae <inki.dae@samsung.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/gpu/drm/exynos/exynos_drm_fb.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c
index f28ce493e314..168c71f80b72 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_fb.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c
@@ -37,7 +37,6 @@
  */
 struct exynos_drm_fb {
 	struct drm_framebuffer	fb;
-	dma_addr_t			dma_addr[MAX_FB_BUFFER];
 };
 
 static int check_fb_gem_memory_type(struct drm_device *drm_dev,
@@ -91,8 +90,6 @@ exynos_drm_framebuffer_init(struct drm_device *dev,
 			goto err;
 
 		exynos_fb->fb.obj[i] = &exynos_gem[i]->base;
-		exynos_fb->dma_addr[i] = exynos_gem[i]->dma_addr
-						+ mode_cmd->offsets[i];
 	}
 
 	drm_helper_mode_fill_fb_struct(dev, &exynos_fb->fb, mode_cmd);
@@ -160,12 +157,13 @@ err:
 
 dma_addr_t exynos_drm_fb_dma_addr(struct drm_framebuffer *fb, int index)
 {
-	struct exynos_drm_fb *exynos_fb = to_exynos_fb(fb);
+	struct exynos_drm_gem *exynos_gem;
 
 	if (WARN_ON_ONCE(index >= MAX_FB_BUFFER))
 		return 0;
 
-	return exynos_fb->dma_addr[index];
+	exynos_gem = to_exynos_gem(fb->obj[index]);
+	return exynos_gem->dma_addr + fb->offsets[index];
 }
 
 static struct drm_mode_config_helper_funcs exynos_drm_mode_config_helpers = {

From ff059fcbeed9cbed7421f82d1463dd74c472636e Mon Sep 17 00:00:00 2001
From: Daniel Stone <daniels@collabora.com>
Date: Fri, 30 Mar 2018 15:11:32 +0100
Subject: [PATCH 163/288] drm/exynos: exynos_drm_fb -> drm_framebuffer

Now exynos_drm_fb is just an empty wrapper around drm_framebuffer, we
can drop it.

Signed-off-by: Daniel Stone <daniels@collabora.com>
Signed-off-by: Inki Dae <inki.dae@samsung.com>
Cc: Inki Dae <inki.dae@samsung.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
---
 drivers/gpu/drm/exynos/exynos_drm_fb.c | 28 ++++++++------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_fb.c b/drivers/gpu/drm/exynos/exynos_drm_fb.c
index 168c71f80b72..f0e79178bde6 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_fb.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_fb.c
@@ -27,18 +27,6 @@
 #include "exynos_drm_iommu.h"
 #include "exynos_drm_crtc.h"
 
-#define to_exynos_fb(x)	container_of(x, struct exynos_drm_fb, fb)
-
-/*
- * exynos specific framebuffer structure.
- *
- * @fb: drm framebuffer obejct.
- * @exynos_gem: array of exynos specific gem object containing a gem object.
- */
-struct exynos_drm_fb {
-	struct drm_framebuffer	fb;
-};
-
 static int check_fb_gem_memory_type(struct drm_device *drm_dev,
 				    struct exynos_drm_gem *exynos_gem)
 {
@@ -76,12 +64,12 @@ exynos_drm_framebuffer_init(struct drm_device *dev,
 			    struct exynos_drm_gem **exynos_gem,
 			    int count)
 {
-	struct exynos_drm_fb *exynos_fb;
+	struct drm_framebuffer *fb;
 	int i;
 	int ret;
 
-	exynos_fb = kzalloc(sizeof(*exynos_fb), GFP_KERNEL);
-	if (!exynos_fb)
+	fb = kzalloc(sizeof(*fb), GFP_KERNEL);
+	if (!fb)
 		return ERR_PTR(-ENOMEM);
 
 	for (i = 0; i < count; i++) {
@@ -89,21 +77,21 @@ exynos_drm_framebuffer_init(struct drm_device *dev,
 		if (ret < 0)
 			goto err;
 
-		exynos_fb->fb.obj[i] = &exynos_gem[i]->base;
+		fb->obj[i] = &exynos_gem[i]->base;
 	}
 
-	drm_helper_mode_fill_fb_struct(dev, &exynos_fb->fb, mode_cmd);
+	drm_helper_mode_fill_fb_struct(dev, fb, mode_cmd);
 
-	ret = drm_framebuffer_init(dev, &exynos_fb->fb, &exynos_drm_fb_funcs);
+	ret = drm_framebuffer_init(dev, fb, &exynos_drm_fb_funcs);
 	if (ret < 0) {
 		DRM_ERROR("failed to initialize framebuffer\n");
 		goto err;
 	}
 
-	return &exynos_fb->fb;
+	return fb;
 
 err:
-	kfree(exynos_fb);
+	kfree(fb);
 	return ERR_PTR(ret);
 }
 

From 596a9f6768af58d7034f6ebe559a0d489ae61467 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Thu, 12 Apr 2018 14:21:54 -0500
Subject: [PATCH 164/288] objtool: Support HOSTCFLAGS and HOSTLDFLAGS

It may be useful to compile host programs with different flags (e.g.
hardening). Ensure that objtool picks up the appropriate flags.

Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kbuild@vger.kernel.org
Link: http://lkml.kernel.org/r/05a360681176f1423cb2fde8faae3a0a0261afc5.1523560825.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 8ae824dbfca3..f76d9914686a 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -31,8 +31,8 @@ INCLUDES := -I$(srctree)/tools/include \
 	    -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
 	    -I$(srctree)/tools/objtool/arch/$(ARCH)/include
 WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed
-CFLAGS   += -Wall -Werror $(WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
-LDFLAGS  += -lelf $(LIBSUBCMD)
+CFLAGS   += -Werror $(WARNINGS) $(HOSTCFLAGS) -g $(INCLUDES)
+LDFLAGS  += -lelf $(LIBSUBCMD) $(HOSTLDFLAGS)
 
 # Allow old libelf to be used:
 elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)

From fae764912153065ea55eda47f834e0764a54df94 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 12 Apr 2018 13:48:25 +0200
Subject: [PATCH 165/288] s390/signal: cleanup uapi struct sigaction

The struct sigaction for user space in arch/s390/include/uapi/asm/signal.h
is ill defined. The kernel uses two structures 'struct sigaction' and
'struct old_sigaction', the correlation in the kernel for both 31 and
64 bit is as follows

    sys_sigaction -> struct old_sigaction
    sys_rt_sigaction -> struct sigaction

The correlation of the (single) uapi definition for 'struct sigaction'
under '#ifndef __KERNEL__':

    31-bit: sys_sigaction -> uapi struct sigaction
    31-bit: sys_rt_sigaction -> no structure available

    64-bit: sys_sigaction -> no structure available
    64-bit: sys_rt_sigaction -> uapi struct sigaction

This is quite confusing. To make it a bit less confusing make the
uapi definition of 'struct sigaction' usable for sys_rt_sigaction for
both 31-bit and 64-bit.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/uapi/asm/signal.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h
index c57f9d28d894..9a14a611ed82 100644
--- a/arch/s390/include/uapi/asm/signal.h
+++ b/arch/s390/include/uapi/asm/signal.h
@@ -97,22 +97,31 @@ typedef unsigned long sigset_t;
 #include <asm-generic/signal-defs.h>
 
 #ifndef __KERNEL__
-/* Here we must cater to libcs that poke about in kernel headers.  */
 
+/*
+ * There are two system calls in regard to sigaction, sys_rt_sigaction
+ * and sys_sigaction. Internally the kernel uses the struct old_sigaction
+ * for the older sys_sigaction system call, and the kernel version of the
+ * struct sigaction for the newer sys_rt_sigaction.
+ *
+ * The uapi definition for struct sigaction has made a strange distinction
+ * between 31-bit and 64-bit in the past. For 64-bit the uapi structure
+ * looks like the kernel struct sigaction, but for 31-bit it used to
+ * look like the kernel struct old_sigaction. That practically made the
+ * structure unusable for either system call. To get around this problem
+ * the glibc always had its own definitions for the sigaction structures.
+ *
+ * The current struct sigaction uapi definition below is suitable for the
+ * sys_rt_sigaction system call only.
+ */
 struct sigaction {
         union {
           __sighandler_t _sa_handler;
           void (*_sa_sigaction)(int, struct siginfo *, void *);
         } _u;
-#ifndef __s390x__ /* lovely */
-        sigset_t sa_mask;
-        unsigned long sa_flags;
-        void (*sa_restorer)(void);
-#else  /* __s390x__ */
         unsigned long sa_flags;
         void (*sa_restorer)(void);
 	sigset_t sa_mask;
-#endif /* __s390x__ */
 };
 
 #define sa_handler      _u._sa_handler

From 9dfbf78e4114fcaf4ef61c49885c3ab5bad40d0b Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Thu, 18 Jan 2018 00:33:36 +0530
Subject: [PATCH 166/288] powerpc/64s: Default l1d_size to 64K in RFI fallback
 flush

If there is no d-cache-size property in the device tree, l1d_size could
be zero. We don't actually expect that to happen, it's only been seen
on mambo (simulator) in some configurations.

A zero-size l1d_size leads to the loop in the asm wrapping around to
2^64-1, and then walking off the end of the fallback area and
eventually causing a page fault which is fatal.

Just default to 64K which is correct on some CPUs, and sane enough to
not cause a crash on others.

Fixes: aa8a5e0062ac9 ('powerpc/64s: Add support for RFI flush of L1-D cache')
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
[mpe: Rewrite comment and change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_64.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 44c30dd38067..b78f142a4148 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -890,6 +890,17 @@ static void __ref init_fallback_flush(void)
 		return;
 
 	l1d_size = ppc64_caches.l1d.size;
+
+	/*
+	 * If there is no d-cache-size property in the device tree, l1d_size
+	 * could be zero. That leads to the loop in the asm wrapping around to
+	 * 2^64-1, and then walking off the end of the fallback area and
+	 * eventually causing a page fault which is fatal. Just default to
+	 * something vaguely sane.
+	 */
+	if (!l1d_size)
+		l1d_size = (64 * 1024);
+
 	limit = min(ppc64_bolted_size(), ppc64_rma_size);
 
 	/*

From ef97837db916075af54554552628bcb0840df62f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Apr 2018 14:44:42 +0200
Subject: [PATCH 167/288] x86: Remove pci-nommu.c

The commit that switched x86 to dma_direct_ops stopped using and building
this file, but accidentally left it in the tree.  Remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: iommu@lists.infradead.org
Link: https://lkml.kernel.org/r/20180416124442.13831-1-hch@lst.de
---
 arch/x86/kernel/pci-nommu.c | 90 -------------------------------------
 1 file changed, 90 deletions(-)
 delete mode 100644 arch/x86/kernel/pci-nommu.c

diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
deleted file mode 100644
index ac7ea3a8242f..000000000000
--- a/arch/x86/kernel/pci-nommu.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Fallback functions when the main IOMMU code is not compiled in. This
-   code is roughly equivalent to i386. */
-#include <linux/dma-direct.h>
-#include <linux/scatterlist.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/pci.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-#define NOMMU_MAPPING_ERROR		0
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
-	if (hwdev && !dma_capable(hwdev, bus, size)) {
-		if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
-			printk(KERN_ERR
-			    "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
-				name, (long long)bus, size,
-				(long long)*hwdev->dma_mask);
-		return 0;
-	}
-	return 1;
-}
-
-static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction dir,
-				 unsigned long attrs)
-{
-	dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
-	WARN_ON(size == 0);
-	if (!check_addr("map_single", dev, bus, size))
-		return NOMMU_MAPPING_ERROR;
-	return bus;
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scatter-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
-			int nents, enum dma_data_direction dir,
-			unsigned long attrs)
-{
-	struct scatterlist *s;
-	int i;
-
-	WARN_ON(nents == 0 || sg[0].length == 0);
-
-	for_each_sg(sg, s, nents, i) {
-		BUG_ON(!sg_page(s));
-		s->dma_address = sg_phys(s);
-		if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
-			return 0;
-		s->dma_length = s->length;
-	}
-	return nents;
-}
-
-static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return dma_addr == NOMMU_MAPPING_ERROR;
-}
-
-const struct dma_map_ops nommu_dma_ops = {
-	.alloc			= dma_generic_alloc_coherent,
-	.free			= dma_generic_free_coherent,
-	.map_sg			= nommu_map_sg,
-	.map_page		= nommu_map_page,
-	.is_phys		= 1,
-	.mapping_error		= nommu_mapping_error,
-	.dma_supported		= x86_dma_supported,
-};

From d3878e164dcd3925a237a20e879432400e369172 Mon Sep 17 00:00:00 2001
From: Xiaoming Gao <gxm.linux.kernel@gmail.com>
Date: Fri, 13 Apr 2018 17:48:08 +0800
Subject: [PATCH 168/288] x86/tsc: Prevent 32bit truncation in calc_hpet_ref()

The TSC calibration code uses HPET as reference. The conversion normalizes
the delta of two HPET timestamps:

    hpetref = ((tshpet1 - tshpet2) * HPET_PERIOD) / 1e6

and then divides the normalized delta of the corresponding TSC timestamps
by the result to calulate the TSC frequency.

    tscfreq = ((tstsc1 - tstsc2 ) * 1e6) / hpetref

This uses do_div() which takes an u32 as the divisor, which worked so far
because the HPET frequency was low enough that 'hpetref' never exceeded
32bit.

On Skylake machines the HPET frequency increased so 'hpetref' can exceed
32bit. do_div() truncates the divisor, which causes the calibration to
fail.

Use div64_u64() to avoid the problem.

[ tglx: Fixes whitespace mangled patch and rewrote changelog ]

Signed-off-by: Xiaoming Gao <newtongao@tencent.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: peterz@infradead.org
Cc: hpa@zytor.com
Link: https://lkml.kernel.org/r/38894564-4fc9-b8ec-353f-de702839e44e@gmail.com
---
 arch/x86/kernel/tsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ef32297ff17e..91e6da48cbb6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -317,7 +317,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
 	hpet2 -= hpet1;
 	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
 	do_div(tmp, 1000000);
-	do_div(deltatsc, tmp);
+	deltatsc = div64_u64(deltatsc, tmp);
 
 	return (unsigned long) deltatsc;
 }

From 10daf10ab154e31237a8c07242be3063fb6a9bf4 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Thu, 12 Apr 2018 09:40:52 +0800
Subject: [PATCH 169/288] x86/acpi: Prevent X2APIC id 0xffffffff from being
 accounted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RongQing reported that there are some X2APIC id 0xffffffff in his machine's
ACPI MADT table, which makes the number of possible CPU inaccurate.

The reason is that the ACPI X2APIC parser has no sanity check for APIC ID
0xffffffff, which is an invalid id in all APIC types. See "Intel® 64
Architecture x2APIC Specification", Chapter 2.4.1.

Add a sanity check to acpi_parse_x2apic() which ignores the invalid id.

Reported-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: len.brown@intel.com
Cc: rjw@rjwysocki.net
Cc: hpa@zytor.com
Link: https://lkml.kernel.org/r/20180412014052.25186-1-douly.fnst@cn.fujitsu.com
---
 arch/x86/kernel/acpi/boot.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dde444f932c1..3b20607d581b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -215,6 +215,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 	apic_id = processor->local_apic_id;
 	enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
 
+	/* Ignore invalid ID */
+	if (apic_id == 0xffffffff)
+		return 0;
+
 	/*
 	 * We need to register disabled CPU as well to permit
 	 * counting disabled CPUs. This allows us to size

From 451cf3ca7d4615631443014ee769c25e267c25ff Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Wed, 4 Apr 2018 14:45:27 +0800
Subject: [PATCH 170/288] x86/processor: Remove two unused function
 declarations

early_trap_init() and cpu_set_gdt() have been removed, so remove the stale
declarations as well.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: keescook@chromium.org
Cc: luto@kernel.org
Cc: hpa@zytor.com
Cc: bp@suse.de
Cc: kirill.shutemov@linux.intel.com
Link: https://lkml.kernel.org/r/20180404064527.10562-1-douly.fnst@cn.fujitsu.com
---
 arch/x86/include/asm/processor.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa4206029e3..21a114914ba4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -749,13 +749,11 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-extern void early_trap_init(void);
 void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr		early_gdt_descr;
 
-extern void cpu_set_gdt(int);
 extern void switch_to_new_gdt(int);
 extern void load_direct_gdt(int);
 extern void load_fixmap_gdt(int);

From e91c2518a5d22a07642f35d85f39001ad379dae4 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 16 Apr 2018 13:36:46 +0200
Subject: [PATCH 171/288] livepatch: Initialize shadow variables safely by a
 custom callback

The existing API allows to pass a sample data to initialize the shadow
data. It works well when the data are position independent. But it fails
miserably when we need to set a pointer to the shadow structure itself.

Unfortunately, we might need to initialize the pointer surprisingly
often because of struct list_head. It is even worse because the list
might be hidden in other common structures, for example, struct mutex,
struct wait_queue_head.

For example, this was needed to fix races in ALSA sequencer. It required
to add mutex into struct snd_seq_client. See commit b3defb791b26ea06
("ALSA: seq: Make ioctls race-free") and commit d15d662e89fc667b9
("ALSA: seq: Fix racy pool initializations")

This patch makes the API more safe. A custom constructor function and data
are passed to klp_shadow_*alloc() functions instead of the sample data.

Note that ctor_data are no longer a template for shadow->data. It might
point to any data that might be necessary when the constructor is called.

Also note that the constructor is called under klp_shadow_lock. It is
an internal spin_lock that synchronizes alloc() vs. get() operations,
see klp_shadow_get_or_alloc(). On one hand, this adds a risk of ABBA
deadlocks. On the other hand, it allows to do some operations safely.
For example, we could add the new structure into an existing list.
This must be done only once when the structure is allocated.

Reported-by: Nicolai Stange <nstange@suse.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/livepatch/shadow-vars.txt   | 31 ++++++---
 include/linux/livepatch.h                 | 14 ++--
 kernel/livepatch/shadow.c                 | 82 +++++++++++++++--------
 samples/livepatch/livepatch-shadow-fix1.c | 18 ++++-
 samples/livepatch/livepatch-shadow-fix2.c |  6 +-
 5 files changed, 104 insertions(+), 47 deletions(-)

diff --git a/Documentation/livepatch/shadow-vars.txt b/Documentation/livepatch/shadow-vars.txt
index 89c66634d600..9c7ae191641c 100644
--- a/Documentation/livepatch/shadow-vars.txt
+++ b/Documentation/livepatch/shadow-vars.txt
@@ -34,9 +34,13 @@ meta-data and shadow-data:
   - data[] - storage for shadow data
 
 It is important to note that the klp_shadow_alloc() and
-klp_shadow_get_or_alloc() calls, described below, store a *copy* of the
-data that the functions are provided.  Callers should provide whatever
-mutual exclusion is required of the shadow data.
+klp_shadow_get_or_alloc() are zeroing the variable by default.
+They also allow to call a custom constructor function when a non-zero
+value is needed. Callers should provide whatever mutual exclusion
+is required.
+
+Note that the constructor is called under klp_shadow_lock spinlock. It allows
+to do actions that can be done only once when a new variable is allocated.
 
 * klp_shadow_get() - retrieve a shadow variable data pointer
   - search hashtable for <obj, id> pair
@@ -47,7 +51,7 @@ mutual exclusion is required of the shadow data.
     - WARN and return NULL
   - if <obj, id> doesn't already exist
     - allocate a new shadow variable
-    - copy data into the new shadow variable
+    - initialize the variable using a custom constructor and data when provided
     - add <obj, id> to the global hashtable
 
 * klp_shadow_get_or_alloc() - get existing or alloc a new shadow variable
@@ -56,7 +60,7 @@ mutual exclusion is required of the shadow data.
     - return existing shadow variable
   - if <obj, id> doesn't already exist
     - allocate a new shadow variable
-    - copy data into the new shadow variable
+    - initialize the variable using a custom constructor and data when provided
     - add <obj, id> pair to the global hashtable
 
 * klp_shadow_free() - detach and free a <obj, id> shadow variable
@@ -107,7 +111,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
 
 	/* Attach a corresponding shadow variable, then initialize it */
-	ps_lock = klp_shadow_alloc(sta, PS_LOCK, NULL, sizeof(*ps_lock), gfp);
+	ps_lock = klp_shadow_alloc(sta, PS_LOCK, sizeof(*ps_lock), gfp,
+				   NULL, NULL);
 	if (!ps_lock)
 		goto shadow_fail;
 	spin_lock_init(ps_lock);
@@ -148,16 +153,24 @@ shadow variables to parents already in-flight.
 For commit 1d147bfa6429, a good spot to allocate a shadow spinlock is
 inside ieee80211_sta_ps_deliver_wakeup():
 
+int ps_lock_shadow_ctor(void *obj, void *shadow_data, void *ctor_data)
+{
+	spinlock_t *lock = shadow_data;
+
+	spin_lock_init(lock);
+	return 0;
+}
+
 #define PS_LOCK 1
 void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 {
-	DEFINE_SPINLOCK(ps_lock_fallback);
 	spinlock_t *ps_lock;
 
 	/* sync with ieee80211_tx_h_unicast_ps_buf */
 	ps_lock = klp_shadow_get_or_alloc(sta, PS_LOCK,
-			&ps_lock_fallback, sizeof(ps_lock_fallback),
-			GFP_ATOMIC);
+			sizeof(*ps_lock), GFP_ATOMIC,
+			ps_lock_shadow_ctor, NULL);
+
 	if (ps_lock)
 		spin_lock(ps_lock);
 	...
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 4754f01c1abb..7e084321b146 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -186,11 +186,17 @@ static inline bool klp_have_reliable_stack(void)
 	       IS_ENABLED(CONFIG_HAVE_RELIABLE_STACKTRACE);
 }
 
+typedef int (*klp_shadow_ctor_t)(void *obj,
+				 void *shadow_data,
+				 void *ctor_data);
+
 void *klp_shadow_get(void *obj, unsigned long id);
-void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags);
-void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-			      size_t size, gfp_t gfp_flags);
+void *klp_shadow_alloc(void *obj, unsigned long id,
+		       size_t size, gfp_t gfp_flags,
+		       klp_shadow_ctor_t ctor, void *ctor_data);
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+			      size_t size, gfp_t gfp_flags,
+			      klp_shadow_ctor_t ctor, void *ctor_data);
 void klp_shadow_free(void *obj, unsigned long id);
 void klp_shadow_free_all(unsigned long id);
 
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index fdac27588d60..b10a0bbb7f84 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id)
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get);
 
-static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id,
+				       size_t size, gfp_t gfp_flags,
+				       klp_shadow_ctor_t ctor, void *ctor_data,
+				       bool warn_on_exist)
 {
 	struct klp_shadow *new_shadow;
 	void *shadow_data;
@@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 	if (shadow_data)
 		goto exists;
 
-	/* Allocate a new shadow variable for use inside the lock below */
+	/*
+	 * Allocate a new shadow variable.  Fill it with zeroes by default.
+	 * More complex setting can be done by @ctor function.  But it is
+	 * called only when the buffer is really used (under klp_shadow_lock).
+	 */
 	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
 	if (!new_shadow)
 		return NULL;
 
-	new_shadow->obj = obj;
-	new_shadow->id = id;
-
-	/* Initialize the shadow variable if data provided */
-	if (data)
-		memcpy(new_shadow->data, data, size);
-
 	/* Look for <obj, id> again under the lock */
 	spin_lock_irqsave(&klp_shadow_lock, flags);
 	shadow_data = klp_shadow_get(obj, id);
@@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 		goto exists;
 	}
 
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	if (ctor) {
+		int err;
+
+		err = ctor(obj, new_shadow->data, ctor_data);
+		if (err) {
+			spin_unlock_irqrestore(&klp_shadow_lock, flags);
+			kfree(new_shadow);
+			pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n",
+			       obj, id, err);
+			return NULL;
+		}
+	}
+
 	/* No <obj, id> found, so attach the newly allocated one */
 	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
 		     (unsigned long)new_shadow->obj);
@@ -170,26 +185,32 @@ exists:
  * klp_shadow_alloc() - allocate and add a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
  *
- * Allocates @size bytes for new shadow variable data using @gfp_flags
- * and copies @size bytes from @data into the new shadow variable's own
- * data space.  If @data is NULL, @size bytes are still allocated, but
- * no copy is performed.  The new shadow variable is then added to the
- * global hashtable.
+ * Allocates @size bytes for new shadow variable data using @gfp_flags.
+ * The data are zeroed by default.  They are further initialized by @ctor
+ * function if it is not NULL.  The new shadow variable is then added
+ * to the global hashtable.
  *
- * If an existing <obj, id> shadow variable can be found, this routine
- * will issue a WARN, exit early and return NULL.
+ * If an existing <obj, id> shadow variable can be found, this routine will
+ * issue a WARN, exit early and return NULL.
+ *
+ * This function guarantees that the constructor function is called only when
+ * the variable did not exist before.  The cost is that @ctor is called
+ * in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on duplicate or
  * failure.
  */
-void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags)
+void *klp_shadow_alloc(void *obj, unsigned long id,
+		       size_t size, gfp_t gfp_flags,
+		       klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, true);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_alloc);
 
@@ -197,25 +218,28 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc);
  * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
  *
  * Returns a pointer to existing shadow data if an <obj, id> shadow
  * variable is already present.  Otherwise, it creates a new shadow
  * variable like klp_shadow_alloc().
  *
- * This function guarantees that only one shadow variable exists with
- * the given @id for the given @obj.  It also guarantees that the shadow
- * variable will be initialized by the given @data only when it did not
- * exist before.
+ * This function guarantees that only one shadow variable exists with the given
+ * @id for the given @obj.  It also guarantees that the constructor function
+ * will be called only when the variable did not exist before.  The cost is
+ * that @ctor is called in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on failure.
  */
-void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-			       size_t size, gfp_t gfp_flags)
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+			      size_t size, gfp_t gfp_flags,
+			      klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, false);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
 
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
index 830c55514f9f..04151c7f2631 100644
--- a/samples/livepatch/livepatch-shadow-fix1.c
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -56,6 +56,21 @@ struct dummy {
 	unsigned long jiffies_expire;
 };
 
+/*
+ * The constructor makes more sense together with klp_shadow_get_or_alloc().
+ * In this example, it would be safe to assign the pointer also to the shadow
+ * variable returned by klp_shadow_alloc().  But we wanted to show the more
+ * complicated use of the API.
+ */
+static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data)
+{
+	void **shadow_leak = shadow_data;
+	void *leak = ctor_data;
+
+	*shadow_leak = leak;
+	return 0;
+}
+
 struct dummy *livepatch_fix1_dummy_alloc(void)
 {
 	struct dummy *d;
@@ -74,7 +89,8 @@ struct dummy *livepatch_fix1_dummy_alloc(void)
 	 * pointer to handle resource release.
 	 */
 	leak = kzalloc(sizeof(int), GFP_KERNEL);
-	klp_shadow_alloc(d, SV_LEAK, &leak, sizeof(leak), GFP_KERNEL);
+	klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
+			 shadow_leak_ctor, leak);
 
 	pr_info("%s: dummy @ %p, expires @ %lx\n",
 		__func__, d, d->jiffies_expire);
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
index ff9948f0ec00..d6c62844dc15 100644
--- a/samples/livepatch/livepatch-shadow-fix2.c
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -53,17 +53,15 @@ struct dummy {
 bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
 {
 	int *shadow_count;
-	int count;
 
 	/*
 	 * Patch: handle in-flight dummy structures, if they do not
 	 * already have a SV_COUNTER shadow variable, then attach a
 	 * new one.
 	 */
-	count = 0;
 	shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER,
-					       &count, sizeof(count),
-					       GFP_NOWAIT);
+				sizeof(*shadow_count), GFP_NOWAIT,
+				NULL, NULL);
 	if (shadow_count)
 		*shadow_count += 1;
 

From 3b2c77d000fe9f7d02e9e726e00dccf9f92b256f Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 16 Apr 2018 13:36:47 +0200
Subject: [PATCH 172/288] livepatch: Allow to call a custom callback when
 freeing shadow variables

We might need to do some actions before the shadow variable is freed.
For example, we might need to remove it from a list or free some data
that it points to.

This is already possible now. The user can get the shadow variable
by klp_shadow_get(), do the necessary actions, and then call
klp_shadow_free().

This patch allows to do it a more elegant way. The user could implement
the needed actions in a callback that is passed to klp_shadow_free()
as a parameter. The callback usually does reverse operations to
the constructor callback that can be called by klp_shadow_*alloc().

It is especially useful for klp_shadow_free_all(). There we need to do
these extra actions for each found shadow variable with the given ID.

Note that the memory used by the shadow variable itself is still released
later by rcu callback. It is needed to protect internal structures that
keep all shadow variables. But the destructor is called immediately.
The shadow variable must not be access anyway after klp_shadow_free()
is called. The user is responsible to protect this any suitable way.

Be aware that the destructor is called under klp_shadow_lock. It is
the same as for the contructor in klp_shadow_alloc().

Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/livepatch/shadow-vars.txt   | 10 ++++++---
 include/linux/livepatch.h                 |  5 +++--
 kernel/livepatch/shadow.c                 | 26 +++++++++++++++-------
 samples/livepatch/livepatch-shadow-fix1.c | 25 ++++++++++++---------
 samples/livepatch/livepatch-shadow-fix2.c | 27 ++++++++++++++---------
 5 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/Documentation/livepatch/shadow-vars.txt b/Documentation/livepatch/shadow-vars.txt
index 9c7ae191641c..ecc09a7be5dd 100644
--- a/Documentation/livepatch/shadow-vars.txt
+++ b/Documentation/livepatch/shadow-vars.txt
@@ -65,11 +65,15 @@ to do actions that can be done only once when a new variable is allocated.
 
 * klp_shadow_free() - detach and free a <obj, id> shadow variable
   - find and remove a <obj, id> reference from global hashtable
-    - if found, free shadow variable
+    - if found
+      - call destructor function if defined
+      - free shadow variable
 
 * klp_shadow_free_all() - detach and free all <*, id> shadow variables
   - find and remove any <*, id> references from global hashtable
-    - if found, free shadow variable
+    - if found
+      - call destructor function if defined
+      - free shadow variable
 
 
 2. Use cases
@@ -136,7 +140,7 @@ variable:
 
 void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
 {
-	klp_shadow_free(sta, PS_LOCK);
+	klp_shadow_free(sta, PS_LOCK, NULL);
 	kfree(sta);
 	...
 
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 7e084321b146..aec44b1d9582 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -189,6 +189,7 @@ static inline bool klp_have_reliable_stack(void)
 typedef int (*klp_shadow_ctor_t)(void *obj,
 				 void *shadow_data,
 				 void *ctor_data);
+typedef void (*klp_shadow_dtor_t)(void *obj, void *shadow_data);
 
 void *klp_shadow_get(void *obj, unsigned long id);
 void *klp_shadow_alloc(void *obj, unsigned long id,
@@ -197,8 +198,8 @@ void *klp_shadow_alloc(void *obj, unsigned long id,
 void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
 			      size_t size, gfp_t gfp_flags,
 			      klp_shadow_ctor_t ctor, void *ctor_data);
-void klp_shadow_free(void *obj, unsigned long id);
-void klp_shadow_free_all(unsigned long id);
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor);
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor);
 
 #else /* !CONFIG_LIVEPATCH */
 
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index b10a0bbb7f84..83958c814439 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -243,15 +243,26 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
 
+static void klp_shadow_free_struct(struct klp_shadow *shadow,
+				   klp_shadow_dtor_t dtor)
+{
+	hash_del_rcu(&shadow->node);
+	if (dtor)
+		dtor(shadow->obj, shadow->data);
+	kfree_rcu(shadow, rcu_head);
+}
+
 /**
  * klp_shadow_free() - detach and free a <obj, id> shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for this <obj, id> shadow variable
  * instance, callers should stop referencing it accordingly.
  */
-void klp_shadow_free(void *obj, unsigned long id)
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -263,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id)
 			       (unsigned long)obj) {
 
 		if (klp_shadow_match(shadow, obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
+			klp_shadow_free_struct(shadow, dtor);
 			break;
 		}
 	}
@@ -276,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free);
 /**
  * klp_shadow_free_all() - detach and free all <*, id> shadow variables
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for all <*, id> shadow variable
  * instances, callers should stop referencing them accordingly.
  */
-void klp_shadow_free_all(unsigned long id)
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -290,10 +302,8 @@ void klp_shadow_free_all(unsigned long id)
 
 	/* Delete all <*, id> from hash */
 	hash_for_each(klp_shadow_hash, i, shadow, node) {
-		if (klp_shadow_match(shadow, shadow->obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
-		}
+		if (klp_shadow_match(shadow, shadow->obj, id))
+			klp_shadow_free_struct(shadow, dtor);
 	}
 
 	spin_unlock_irqrestore(&klp_shadow_lock, flags);
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
index 04151c7f2631..49b13553eaae 100644
--- a/samples/livepatch/livepatch-shadow-fix1.c
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -98,9 +98,19 @@ struct dummy *livepatch_fix1_dummy_alloc(void)
 	return d;
 }
 
+static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+	void *d = obj;
+	void **shadow_leak = shadow_data;
+
+	kfree(*shadow_leak);
+	pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+			 __func__, d, *shadow_leak);
+}
+
 void livepatch_fix1_dummy_free(struct dummy *d)
 {
-	void **shadow_leak, *leak;
+	void **shadow_leak;
 
 	/*
 	 * Patch: fetch the saved SV_LEAK shadow variable, detach and
@@ -109,15 +119,10 @@ void livepatch_fix1_dummy_free(struct dummy *d)
 	 * was loaded.)
 	 */
 	shadow_leak = klp_shadow_get(d, SV_LEAK);
-	if (shadow_leak) {
-		leak = *shadow_leak;
-		klp_shadow_free(d, SV_LEAK);
-		kfree(leak);
-		pr_info("%s: dummy @ %p, prevented leak @ %p\n",
-			 __func__, d, leak);
-	} else {
+	if (shadow_leak)
+		klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor);
+	else
 		pr_info("%s: dummy @ %p leaked!\n", __func__, d);
-	}
 
 	kfree(d);
 }
@@ -163,7 +168,7 @@ static int livepatch_shadow_fix1_init(void)
 static void livepatch_shadow_fix1_exit(void)
 {
 	/* Cleanup any existing SV_LEAK shadow variables */
-	klp_shadow_free_all(SV_LEAK);
+	klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor);
 
 	WARN_ON(klp_unregister_patch(&patch));
 }
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
index d6c62844dc15..b34c7bf83356 100644
--- a/samples/livepatch/livepatch-shadow-fix2.c
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -68,22 +68,27 @@ bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
 	return time_after(jiffies, d->jiffies_expire);
 }
 
+static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+	void *d = obj;
+	void **shadow_leak = shadow_data;
+
+	kfree(*shadow_leak);
+	pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+			 __func__, d, *shadow_leak);
+}
+
 void livepatch_fix2_dummy_free(struct dummy *d)
 {
-	void **shadow_leak, *leak;
+	void **shadow_leak;
 	int *shadow_count;
 
 	/* Patch: copy the memory leak patch from the fix1 module. */
 	shadow_leak = klp_shadow_get(d, SV_LEAK);
-	if (shadow_leak) {
-		leak = *shadow_leak;
-		klp_shadow_free(d, SV_LEAK);
-		kfree(leak);
-		pr_info("%s: dummy @ %p, prevented leak @ %p\n",
-			 __func__, d, leak);
-	} else {
+	if (shadow_leak)
+		klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor);
+	else
 		pr_info("%s: dummy @ %p leaked!\n", __func__, d);
-	}
 
 	/*
 	 * Patch: fetch the SV_COUNTER shadow variable and display
@@ -93,7 +98,7 @@ void livepatch_fix2_dummy_free(struct dummy *d)
 	if (shadow_count) {
 		pr_info("%s: dummy @ %p, check counter = %d\n",
 			__func__, d, *shadow_count);
-		klp_shadow_free(d, SV_COUNTER);
+		klp_shadow_free(d, SV_COUNTER, NULL);
 	}
 
 	kfree(d);
@@ -140,7 +145,7 @@ static int livepatch_shadow_fix2_init(void)
 static void livepatch_shadow_fix2_exit(void)
 {
 	/* Cleanup any existing SV_COUNTER shadow variables */
-	klp_shadow_free_all(SV_COUNTER);
+	klp_shadow_free_all(SV_COUNTER, NULL);
 
 	WARN_ON(klp_unregister_patch(&patch));
 }

From cd6e992b3aab072cc90839508aaf5573c8f7e066 Mon Sep 17 00:00:00 2001
From: Oleksandr Andrushchenko <andr2000@gmail.com>
Date: Thu, 12 Apr 2018 20:26:27 +0300
Subject: [PATCH 173/288] xen/sndif: Sync up with the canonical definition in
 Xen

This is the sync up with the canonical definition of the sound
protocol in Xen:

1. Protocol version was referenced in the protocol description,
   but missed its definition. Fixed by adding a constant
   for current protocol version.

2. Some of the request descriptions have "reserved" fields
   missed: fixed by adding corresponding entries.

3. Extend the size of the requests and responses to 64 octets.
   Bump protocol version to 2.

4. Add explicit back and front synchronization
   In order to provide explicit synchronization between backend and
   frontend the following changes are introduced in the protocol:
    - add new ring buffer for sending asynchronous events from
      backend to frontend to report number of bytes played by the
      frontend (XENSND_EVT_CUR_POS)
    - introduce trigger events for playback control: start/stop/pause/resume
    - add "req-" prefix to event-channel and ring-ref to unify naming
      of the Xen event channels for requests and events

5. Add explicit back and front parameter negotiation
   In order to provide explicit stream parameter negotiation between
   backend and frontend the following changes are introduced in the protocol:
   add XENSND_OP_HW_PARAM_QUERY request to read/update
   configuration space for the parameters given: request passes
   desired parameter's intervals/masks and the response to this request
   returns allowed min/max intervals/masks to be used.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Signed-off-by: Oleksandr Grytsov <oleksandr_grytsov@epam.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/interface/io/sndif.h | 324 +++++++++++++++++++++++++++++--
 1 file changed, 307 insertions(+), 17 deletions(-)

diff --git a/include/xen/interface/io/sndif.h b/include/xen/interface/io/sndif.h
index 5c918276835e..78bb5d9f8d83 100644
--- a/include/xen/interface/io/sndif.h
+++ b/include/xen/interface/io/sndif.h
@@ -36,6 +36,13 @@
 #include "ring.h"
 #include "../grant_table.h"
 
+/*
+ ******************************************************************************
+ *                           Protocol version
+ ******************************************************************************
+ */
+#define XENSND_PROTOCOL_VERSION	2
+
 /*
  ******************************************************************************
  *                  Feature and Parameter Negotiation
@@ -106,6 +113,8 @@
  *
  * /local/domain/1/device/vsnd/0/0/0/ring-ref = "386"
  * /local/domain/1/device/vsnd/0/0/0/event-channel = "15"
+ * /local/domain/1/device/vsnd/0/0/0/evt-ring-ref = "1386"
+ * /local/domain/1/device/vsnd/0/0/0/evt-event-channel = "215"
  *
  *------------------------------ Stream 1, capture ----------------------------
  *
@@ -115,6 +124,8 @@
  *
  * /local/domain/1/device/vsnd/0/0/1/ring-ref = "384"
  * /local/domain/1/device/vsnd/0/0/1/event-channel = "13"
+ * /local/domain/1/device/vsnd/0/0/1/evt-ring-ref = "1384"
+ * /local/domain/1/device/vsnd/0/0/1/evt-event-channel = "213"
  *
  *------------------------------- PCM device 1 --------------------------------
  *
@@ -128,6 +139,8 @@
  *
  * /local/domain/1/device/vsnd/0/1/0/ring-ref = "387"
  * /local/domain/1/device/vsnd/0/1/0/event-channel = "151"
+ * /local/domain/1/device/vsnd/0/1/0/evt-ring-ref = "1387"
+ * /local/domain/1/device/vsnd/0/1/0/evt-event-channel = "351"
  *
  *------------------------------- PCM device 2 --------------------------------
  *
@@ -140,6 +153,8 @@
  *
  * /local/domain/1/device/vsnd/0/2/0/ring-ref = "389"
  * /local/domain/1/device/vsnd/0/2/0/event-channel = "152"
+ * /local/domain/1/device/vsnd/0/2/0/evt-ring-ref = "1389"
+ * /local/domain/1/device/vsnd/0/2/0/evt-event-channel = "452"
  *
  ******************************************************************************
  *                            Backend XenBus Nodes
@@ -285,6 +300,23 @@
  *      The Xen grant reference granting permission for the backend to map
  *      a sole page in a single page sized ring buffer.
  *
+ *--------------------- Stream Event Transport Parameters ---------------------
+ *
+ * This communication path is used to deliver asynchronous events from backend
+ * to frontend, set up per stream.
+ *
+ * evt-event-channel
+ *      Values:         <uint32_t>
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * evt-ring-ref
+ *      Values:         <uint32_t>
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      a sole page in a single page sized ring buffer.
+ *
  ******************************************************************************
  *                               STATE DIAGRAMS
  ******************************************************************************
@@ -432,6 +464,20 @@
 #define XENSND_OP_GET_VOLUME		5
 #define XENSND_OP_MUTE			6
 #define XENSND_OP_UNMUTE		7
+#define XENSND_OP_TRIGGER		8
+#define XENSND_OP_HW_PARAM_QUERY	9
+
+#define XENSND_OP_TRIGGER_START		0
+#define XENSND_OP_TRIGGER_PAUSE		1
+#define XENSND_OP_TRIGGER_STOP		2
+#define XENSND_OP_TRIGGER_RESUME	3
+
+/*
+ ******************************************************************************
+ *                                 EVENT CODES
+ ******************************************************************************
+ */
+#define XENSND_EVT_CUR_POS		0
 
 /*
  ******************************************************************************
@@ -448,6 +494,8 @@
 #define XENSND_FIELD_VCARD_LONG_NAME	"long-name"
 #define XENSND_FIELD_RING_REF		"ring-ref"
 #define XENSND_FIELD_EVT_CHNL		"event-channel"
+#define XENSND_FIELD_EVT_RING_REF	"evt-ring-ref"
+#define XENSND_FIELD_EVT_EVT_CHNL	"evt-event-channel"
 #define XENSND_FIELD_DEVICE_NAME	"name"
 #define XENSND_FIELD_TYPE		"type"
 #define XENSND_FIELD_STREAM_UNIQUE_ID	"unique-id"
@@ -526,7 +574,7 @@
  *
  *---------------------------------- Requests ---------------------------------
  *
- * All request packets have the same length (32 octets)
+ * All request packets have the same length (64 octets)
  * All request packets have common header:
  *         0                1                 2               3        octet
  * +----------------+----------------+----------------+----------------+
@@ -559,11 +607,13 @@
  * +----------------+----------------+----------------+----------------+
  * |                           gref_directory                          | 24
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 28
+ * |                             period_sz                             | 28
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 32
  * +----------------+----------------+----------------+----------------+
  * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
+ * |                             reserved                              | 64
  * +----------------+----------------+----------------+----------------+
  *
  * pcm_rate - uint32_t, stream data rate, Hz
@@ -571,6 +621,14 @@
  * pcm_channels - uint8_t, number of channels of this stream,
  *   [channels-min; channels-max]
  * buffer_sz - uint32_t, buffer size to be allocated, octets
+ * period_sz - uint32_t, event period size, octets
+ *   This is the requested value of the period at which frontend would
+ *   like to receive XENSND_EVT_CUR_POS notifications from the backend when
+ *   stream position advances during playback/capture.
+ *   It shows how many octets are expected to be played/captured before
+ *   sending such an event.
+ *   If set to 0 no XENSND_EVT_CUR_POS events are sent by the backend.
+ *
  * gref_directory - grant_ref_t, a reference to the first shared page
  *   describing shared buffer references. At least one page exists. If shared
  *   buffer size  (buffer_sz) exceeds what can be addressed by this single page,
@@ -585,6 +643,7 @@ struct xensnd_open_req {
 	uint16_t reserved;
 	uint32_t buffer_sz;
 	grant_ref_t gref_directory;
+	uint32_t period_sz;
 };
 
 /*
@@ -632,7 +691,7 @@ struct xensnd_page_directory {
  * +----------------+----------------+----------------+----------------+
  * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
+ * |                             reserved                              | 64
  * +----------------+----------------+----------------+----------------+
  *
  * Request read/write - used for read (for capture) or write (for playback):
@@ -650,7 +709,7 @@ struct xensnd_page_directory {
  * +----------------+----------------+----------------+----------------+
  * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
+ * |                             reserved                              | 64
  * +----------------+----------------+----------------+----------------+
  *
  * operation - XENSND_OP_READ for read or XENSND_OP_WRITE for write
@@ -673,9 +732,11 @@ struct xensnd_rw_req {
  * +----------------+----------------+----------------+----------------+
  * |                              length                               | 16
  * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 20
+ * +----------------+----------------+----------------+----------------+
  * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
+ * |                             reserved                              | 64
  * +----------------+----------------+----------------+----------------+
  *
  * operation - XENSND_OP_SET_VOLUME for volume set
@@ -713,9 +774,11 @@ struct xensnd_rw_req {
  * +----------------+----------------+----------------+----------------+
  * |                              length                               | 16
  * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 20
+ * +----------------+----------------+----------------+----------------+
  * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
+ * |                             reserved                              | 64
  * +----------------+----------------+----------------+----------------+
  *
  * operation - XENSND_OP_MUTE for mute or XENSND_OP_UNMUTE for unmute
@@ -743,32 +806,213 @@ struct xensnd_rw_req {
  *
  * The 'struct xensnd_rw_req' is also used for XENSND_OP_SET_VOLUME,
  * XENSND_OP_GET_VOLUME, XENSND_OP_MUTE, XENSND_OP_UNMUTE.
+ *
+ * Request stream running state change - trigger PCM stream running state
+ * to start, stop, pause or resume:
+ *
+ *         0                1                 2               3        octet
+ * +----------------+----------------+----------------+----------------+
+ * |               id                |   _OP_TRIGGER  |    reserved    | 4
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 8
+ * +----------------+----------------+----------------+----------------+
+ * |      type      |                     reserved                     | 12
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 16
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 64
+ * +----------------+----------------+----------------+----------------+
+ *
+ * type - uint8_t, XENSND_OP_TRIGGER_XXX value
  */
 
+struct xensnd_trigger_req {
+	uint8_t type;
+};
+
+/*
+ * Request stream parameter ranges: request intervals and
+ *   masks of supported ranges for stream configuration values.
+ *
+ *   Sound device configuration for a particular stream is a limited subset
+ *   of the multidimensional configuration available on XenStore, e.g.
+ *   once the frame rate has been selected there is a limited supported range
+ *   for sample rates becomes available (which might be the same set configured
+ *   on XenStore or less). For example, selecting 96kHz sample rate may limit
+ *   number of channels available for such configuration from 4 to 2, etc.
+ *   Thus, each call to XENSND_OP_HW_PARAM_QUERY may reduce configuration
+ *   space making it possible to iteratively get the final stream configuration,
+ *   used in XENSND_OP_OPEN request.
+ *
+ *   See response format for this request.
+ *
+ *         0                1                 2               3        octet
+ * +----------------+----------------+----------------+----------------+
+ * |               id                | _HW_PARAM_QUERY|    reserved    | 4
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 8
+ * +----------------+----------------+----------------+----------------+
+ * |                     formats mask low 32-bit                       | 12
+ * +----------------+----------------+----------------+----------------+
+ * |                     formats mask high 32-bit                      | 16
+ * +----------------+----------------+----------------+----------------+
+ * |                              min rate                             | 20
+ * +----------------+----------------+----------------+----------------+
+ * |                              max rate                             | 24
+ * +----------------+----------------+----------------+----------------+
+ * |                            min channels                           | 28
+ * +----------------+----------------+----------------+----------------+
+ * |                            max channels                           | 32
+ * +----------------+----------------+----------------+----------------+
+ * |                         min buffer frames                         | 36
+ * +----------------+----------------+----------------+----------------+
+ * |                         max buffer frames                         | 40
+ * +----------------+----------------+----------------+----------------+
+ * |                         min period frames                         | 44
+ * +----------------+----------------+----------------+----------------+
+ * |                         max period frames                         | 48
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 52
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 64
+ * +----------------+----------------+----------------+----------------+
+ *
+ * formats - uint64_t, bit mask representing values of the parameter
+ *     made as bitwise OR of (1 << XENSND_PCM_FORMAT_XXX) values
+ *
+ * For interval parameters:
+ *   min - uint32_t, minimum value of the parameter
+ *   max - uint32_t, maximum value of the parameter
+ *
+ * Frame is defined as a product of the number of channels by the
+ * number of octets per one sample.
+ */
+
+struct xensnd_query_hw_param {
+	uint64_t formats;
+	struct {
+		uint32_t min;
+		uint32_t max;
+	} rates;
+	struct {
+		uint32_t min;
+		uint32_t max;
+	} channels;
+	struct {
+		uint32_t min;
+		uint32_t max;
+	} buffer;
+	struct {
+		uint32_t min;
+		uint32_t max;
+	} period;
+};
+
 /*
  *---------------------------------- Responses --------------------------------
  *
- * All response packets have the same length (32 octets)
+ * All response packets have the same length (64 octets)
  *
- * Response for all requests:
+ * All response packets have common header:
  *         0                1                 2               3        octet
  * +----------------+----------------+----------------+----------------+
  * |               id                |    operation   |    reserved    | 4
  * +----------------+----------------+----------------+----------------+
  * |                              status                               | 8
  * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 12
- * +----------------+----------------+----------------+----------------+
- * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
- * +----------------+----------------+----------------+----------------+
- * |                             reserved                              | 32
- * +----------------+----------------+----------------+----------------+
  *
  * id - uint16_t, copied from the request
  * operation - uint8_t, XENSND_OP_* - copied from request
  * status - int32_t, response status, zero on success and -XEN_EXX on failure
+ *
+ *
+ * HW parameter query response - response for XENSND_OP_HW_PARAM_QUERY:
+ *         0                1                 2               3        octet
+ * +----------------+----------------+----------------+----------------+
+ * |               id                |    operation   |    reserved    | 4
+ * +----------------+----------------+----------------+----------------+
+ * |                              status                               | 8
+ * +----------------+----------------+----------------+----------------+
+ * |                     formats mask low 32-bit                       | 12
+ * +----------------+----------------+----------------+----------------+
+ * |                     formats mask high 32-bit                      | 16
+ * +----------------+----------------+----------------+----------------+
+ * |                              min rate                             | 20
+ * +----------------+----------------+----------------+----------------+
+ * |                              max rate                             | 24
+ * +----------------+----------------+----------------+----------------+
+ * |                            min channels                           | 28
+ * +----------------+----------------+----------------+----------------+
+ * |                            max channels                           | 32
+ * +----------------+----------------+----------------+----------------+
+ * |                         min buffer frames                         | 36
+ * +----------------+----------------+----------------+----------------+
+ * |                         max buffer frames                         | 40
+ * +----------------+----------------+----------------+----------------+
+ * |                         min period frames                         | 44
+ * +----------------+----------------+----------------+----------------+
+ * |                         max period frames                         | 48
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 52
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 64
+ * +----------------+----------------+----------------+----------------+
+ *
+ * Meaning of the values in this response is the same as for
+ * XENSND_OP_HW_PARAM_QUERY request.
  */
 
+/*
+ *----------------------------------- Events ----------------------------------
+ *
+ * Events are sent via shared page allocated by the front and propagated by
+ *   evt-event-channel/evt-ring-ref XenStore entries
+ * All event packets have the same length (64 octets)
+ * All event packets have common header:
+ *         0                1                 2               3        octet
+ * +----------------+----------------+----------------+----------------+
+ * |               id                |      type      |   reserved     | 4
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 8
+ * +----------------+----------------+----------------+----------------+
+ *
+ * id - uint16_t, event id, may be used by front
+ * type - uint8_t, type of the event
+ *
+ *
+ * Current stream position - event from back to front when stream's
+ *   playback/capture position has advanced:
+ *         0                1                 2               3        octet
+ * +----------------+----------------+----------------+----------------+
+ * |               id                |   _EVT_CUR_POS |   reserved     | 4
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 8
+ * +----------------+----------------+----------------+----------------+
+ * |                         position low 32-bit                       | 12
+ * +----------------+----------------+----------------+----------------+
+ * |                         position high 32-bit                      | 16
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 20
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * |                             reserved                              | 64
+ * +----------------+----------------+----------------+----------------+
+ *
+ * position - current value of stream's playback/capture position, octets
+ *
+ */
+
+struct xensnd_cur_pos_evt {
+	uint64_t position;
+};
+
 struct xensnd_req {
 	uint16_t id;
 	uint8_t operation;
@@ -776,7 +1020,9 @@ struct xensnd_req {
 	union {
 		struct xensnd_open_req open;
 		struct xensnd_rw_req rw;
-		uint8_t reserved[24];
+		struct xensnd_trigger_req trigger;
+		struct xensnd_query_hw_param hw_param;
+		uint8_t reserved[56];
 	} op;
 };
 
@@ -785,9 +1031,53 @@ struct xensnd_resp {
 	uint8_t operation;
 	uint8_t reserved;
 	int32_t status;
-	uint8_t reserved1[24];
+	union {
+		struct xensnd_query_hw_param hw_param;
+		uint8_t reserved1[56];
+	} resp;
+};
+
+struct xensnd_evt {
+	uint16_t id;
+	uint8_t type;
+	uint8_t reserved[5];
+	union {
+		struct xensnd_cur_pos_evt cur_pos;
+		uint8_t reserved[56];
+	} op;
 };
 
 DEFINE_RING_TYPES(xen_sndif, struct xensnd_req, struct xensnd_resp);
 
+/*
+ ******************************************************************************
+ *                        Back to front events delivery
+ ******************************************************************************
+ * In order to deliver asynchronous events from back to front a shared page is
+ * allocated by front and its granted reference propagated to back via
+ * XenStore entries (evt-ring-ref/evt-event-channel).
+ * This page has a common header used by both front and back to synchronize
+ * access and control event's ring buffer, while back being a producer of the
+ * events and front being a consumer. The rest of the page after the header
+ * is used for event packets.
+ *
+ * Upon reception of an event(s) front may confirm its reception
+ * for either each event, group of events or none.
+ */
+
+struct xensnd_event_page {
+	uint32_t in_cons;
+	uint32_t in_prod;
+	uint8_t reserved[56];
+};
+
+#define XENSND_EVENT_PAGE_SIZE XEN_PAGE_SIZE
+#define XENSND_IN_RING_OFFS (sizeof(struct xensnd_event_page))
+#define XENSND_IN_RING_SIZE (XENSND_EVENT_PAGE_SIZE - XENSND_IN_RING_OFFS)
+#define XENSND_IN_RING_LEN (XENSND_IN_RING_SIZE / sizeof(struct xensnd_evt))
+#define XENSND_IN_RING(page) \
+	((struct xensnd_evt *)((char *)(page) + XENSND_IN_RING_OFFS))
+#define XENSND_IN_RING_REF(page, idx) \
+	(XENSND_IN_RING((page))[(idx) % XENSND_IN_RING_LEN])
+
 #endif /* __XEN_PUBLIC_IO_SNDIF_H__ */

From ebf04f331fa15a966262341a7dc6b1a0efd633e4 Mon Sep 17 00:00:00 2001
From: Simon Gaiser <simon@invisiblethingslab.com>
Date: Thu, 15 Mar 2018 04:08:03 +0100
Subject: [PATCH 174/288] xen: xenbus_dev_frontend: Really return response
 string

xenbus_command_reply() did not actually copy the response string and
leaked stack content instead.

Fixes: 9a6161fe73bd ("xen: return xenstore command failures via response instead of rc")
Signed-off-by: Simon Gaiser <simon@invisiblethingslab.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/xenbus/xenbus_dev_frontend.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index 0d6d9264d6a9..c3e201025ef0 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -403,7 +403,7 @@ static int xenbus_command_reply(struct xenbus_file_priv *u,
 {
 	struct {
 		struct xsd_sockmsg hdr;
-		const char body[16];
+		char body[16];
 	} msg;
 	int rc;
 
@@ -412,6 +412,7 @@ static int xenbus_command_reply(struct xenbus_file_priv *u,
 	msg.hdr.len = strlen(reply) + 1;
 	if (msg.hdr.len > sizeof(msg.body))
 		return -E2BIG;
+	memcpy(&msg.body, reply, msg.hdr.len);
 
 	mutex_lock(&u->reply_mutex);
 	rc = queue_reply(&u->read_buffers, &msg, sizeof(msg.hdr) + msg.hdr.len);

From e2f73a1828e9ffd2765ce1726b9a9c6e022e3cd6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 16 Apr 2018 08:18:22 +0200
Subject: [PATCH 175/288] tools/headers: Synchronize kernel ABI headers,
 v4.17-rc1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sync the following tooling headers with the latest kernel version:

  tools/arch/arm/include/uapi/asm/kvm.h
    - New ABI: KVM_REG_ARM_*

  tools/arch/x86/include/asm/required-features.h
    - Removal of NEED_LA57 dependency

  tools/arch/x86/include/uapi/asm/kvm.h
    - New KVM ABI: KVM_SYNC_X86_*

  tools/include/uapi/asm-generic/mman-common.h
    - New ABI: MAP_FIXED_NOREPLACE flag

  tools/include/uapi/linux/bpf.h
    - New ABI: BPF_F_SEQ_NUMBER functions

  tools/include/uapi/linux/if_link.h
    - New ABI: IFLA tun and rmnet support

  tools/include/uapi/linux/kvm.h
    - New ABI: hyperv eventfd and CONN_ID_MASK support plus header cleanups

  tools/include/uapi/sound/asound.h
    - New ABI: SNDRV_PCM_FORMAT_FIRST PCM format specifier

  tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
    - The x86 system call table description changed due to the ptregs changes and the renames, in:

	d5a00528b58c: syscalls/core, syscalls/x86: Rename struct pt_regs-based sys_*() to __x64_sys_*()
	5ac9efa3c50d: syscalls/core, syscalls/x86: Clean up compat syscall stub naming convention
	ebeb8c82ffaf: syscalls/x86: Use 'struct pt_regs' based syscall calling for IA32_EMULATION and x32

Also fix the x86 syscall table warning:

  -Warning: Kernel ABI header at 'tools/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'
  +Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'

None of these changes impact existing tooling code, so we only have to copy the kernel version.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Robbins <brianrob@microsoft.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com> <dvyukov@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Kim Phillips <kim.phillips@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin Liška <mliska@suse.cz>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: Sandipan Das <sandipan@linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Takuya Yamamoto <tkydevel@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: William Cohen <wcohen@redhat.com>
Cc: Yonghong Song <yhs@fb.com>
Link: http://lkml.kernel.org/r/20180416064024.ofjtrz5yuu3ykhvl@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/arm/include/uapi/asm/kvm.h         |   9 +
 .../arch/x86/include/asm/required-features.h  |   8 +-
 tools/arch/x86/include/uapi/asm/kvm.h         |  19 +-
 tools/include/uapi/asm-generic/mman-common.h  |   3 +
 tools/include/uapi/linux/bpf.h                |   1 +
 tools/include/uapi/linux/if_link.h            |  39 +
 tools/include/uapi/linux/kvm.h                |  21 +-
 tools/include/uapi/sound/asound.h             |   1 +
 tools/perf/arch/x86/Makefile                  |   2 +-
 .../arch/x86/entry/syscalls/syscall_64.tbl    | 712 +++++++++---------
 10 files changed, 451 insertions(+), 364 deletions(-)

diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h
index 6edd177bb1c7..2ba95d6fe852 100644
--- a/tools/arch/arm/include/uapi/asm/kvm.h
+++ b/tools/arch/arm/include/uapi/asm/kvm.h
@@ -135,6 +135,15 @@ struct kvm_arch_memory_slot {
 #define KVM_REG_ARM_CRM_SHIFT		7
 #define KVM_REG_ARM_32_CRN_MASK		0x0000000000007800
 #define KVM_REG_ARM_32_CRN_SHIFT	11
+/*
+ * For KVM currently all guest registers are nonsecure, but we reserve a bit
+ * in the encoding to distinguish secure from nonsecure for AArch32 system
+ * registers that are banked by security. This is 1 for the secure banked
+ * register, and 0 for the nonsecure banked register or if the register is
+ * not banked by security.
+ */
+#define KVM_REG_ARM_SECURE_MASK	0x0000000010000000
+#define KVM_REG_ARM_SECURE_SHIFT	28
 
 #define ARM_CP15_REG_SHIFT_MASK(x,n) \
 	(((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index fb3a6de7440b..6847d85400a8 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -53,12 +53,6 @@
 # define NEED_MOVBE	0
 #endif
 
-#ifdef CONFIG_X86_5LEVEL
-# define NEED_LA57	(1<<(X86_FEATURE_LA57 & 31))
-#else
-# define NEED_LA57	0
-#endif
-
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT
 /* Paravirtualized systems may not have PSE or PGE available */
@@ -104,7 +98,7 @@
 #define REQUIRED_MASK13	0
 #define REQUIRED_MASK14	0
 #define REQUIRED_MASK15	0
-#define REQUIRED_MASK16	(NEED_LA57)
+#define REQUIRED_MASK16	0
 #define REQUIRED_MASK17	0
 #define REQUIRED_MASK18	0
 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index f3a960488eae..c535c2fdea13 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -354,8 +354,25 @@ struct kvm_xcrs {
 	__u64 padding[16];
 };
 
-/* definition of registers in kvm_run */
+#define KVM_SYNC_X86_REGS      (1UL << 0)
+#define KVM_SYNC_X86_SREGS     (1UL << 1)
+#define KVM_SYNC_X86_EVENTS    (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+	(KVM_SYNC_X86_REGS| \
+	 KVM_SYNC_X86_SREGS| \
+	 KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
 struct kvm_sync_regs {
+	/* Members of this structure are potentially malicious.
+	 * Care must be taken by code reading, esp. interpreting,
+	 * data fields from them inside KVM to prevent TOCTOU and
+	 * double-fetch types of vulnerabilities.
+	 */
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	struct kvm_vcpu_events events;
 };
 
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index f8b134f5608f..e7ee32861d51 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -27,6 +27,9 @@
 # define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
 #endif
 
+/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
+#define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+
 /*
  * Flags for mlock
  */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9d07465023a2..c5ec89732a8d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -864,6 +864,7 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
+#define BPF_F_SEQ_NUMBER		(1ULL << 3)
 
 /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
  * BPF_FUNC_perf_event_read_value flags.
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 6d9447700e18..68699f654118 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -941,4 +941,43 @@ enum {
 	IFLA_EVENT_BONDING_OPTIONS,	/* change in bonding options */
 };
 
+/* tun section */
+
+enum {
+	IFLA_TUN_UNSPEC,
+	IFLA_TUN_OWNER,
+	IFLA_TUN_GROUP,
+	IFLA_TUN_TYPE,
+	IFLA_TUN_PI,
+	IFLA_TUN_VNET_HDR,
+	IFLA_TUN_PERSIST,
+	IFLA_TUN_MULTI_QUEUE,
+	IFLA_TUN_NUM_QUEUES,
+	IFLA_TUN_NUM_DISABLED_QUEUES,
+	__IFLA_TUN_MAX,
+};
+
+#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1)
+
+/* rmnet section */
+
+#define RMNET_FLAGS_INGRESS_DEAGGREGATION         (1U << 0)
+#define RMNET_FLAGS_INGRESS_MAP_COMMANDS          (1U << 1)
+#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4           (1U << 2)
+#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
+
+enum {
+	IFLA_RMNET_UNSPEC,
+	IFLA_RMNET_MUX_ID,
+	IFLA_RMNET_FLAGS,
+	__IFLA_RMNET_MAX,
+};
+
+#define IFLA_RMNET_MAX	(__IFLA_RMNET_MAX - 1)
+
+struct ifla_rmnet_flags {
+	__u32	flags;
+	__u32	mask;
+};
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 6b89f87db200..1065006c9bf5 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -396,6 +396,10 @@ struct kvm_run {
 		char padding[256];
 	};
 
+	/* 2048 is the size of the char array used to bound/pad the size
+	 * of the union that holds sync regs.
+	 */
+	#define SYNC_REGS_SIZE_BYTES 2048
 	/*
 	 * shared registers between kvm and userspace.
 	 * kvm_valid_regs specifies the register classes set by the host
@@ -407,7 +411,7 @@ struct kvm_run {
 	__u64 kvm_dirty_regs;
 	union {
 		struct kvm_sync_regs regs;
-		char padding[2048];
+		char padding[SYNC_REGS_SIZE_BYTES];
 	} s;
 };
 
@@ -936,6 +940,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_GET_CPU_CHAR 151
 #define KVM_CAP_S390_BPB 152
 #define KVM_CAP_GET_MSR_FEATURES 153
+#define KVM_CAP_HYPERV_EVENTFD 154
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1375,6 +1380,10 @@ struct kvm_enc_region {
 #define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
 #define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
 
+/* Available with KVM_CAP_HYPERV_EVENTFD */
+#define KVM_HYPERV_EVENTFD        _IOW(KVMIO,  0xbd, struct kvm_hyperv_eventfd)
+
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
@@ -1515,4 +1524,14 @@ struct kvm_assigned_msix_entry {
 #define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
 #define KVM_ARM_DEV_PMU			(1 << 2)
 
+struct kvm_hyperv_eventfd {
+	__u32 conn_id;
+	__s32 fd;
+	__u32 flags;
+	__u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
+
 #endif /* __LINUX_KVM_H */
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h
index 07d61583fd02..ed0a120d4f08 100644
--- a/tools/include/uapi/sound/asound.h
+++ b/tools/include/uapi/sound/asound.h
@@ -242,6 +242,7 @@ typedef int __bitwise snd_pcm_format_t;
 #define	SNDRV_PCM_FORMAT_DSD_U16_BE	((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */
 #define	SNDRV_PCM_FORMAT_DSD_U32_BE	((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */
 #define	SNDRV_PCM_FORMAT_LAST		SNDRV_PCM_FORMAT_DSD_U32_BE
+#define	SNDRV_PCM_FORMAT_FIRST		SNDRV_PCM_FORMAT_S8
 
 #ifdef SNDRV_LITTLE_ENDIAN
 #define	SNDRV_PCM_FORMAT_S16		SNDRV_PCM_FORMAT_S16_LE
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index d74eaa7aa927..1a38e78117ce 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -21,7 +21,7 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 $(header): $(sys)/syscall_64.tbl $(systbl)
 	@(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \
         (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \
-        || echo "Warning: Kernel ABI header at 'tools/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'" >&2 )) || true
+        || echo "Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'" >&2 )) || true
 	$(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@
 
 clean::
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..4dfe42666d0c 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -4,379 +4,383 @@
 # The format is:
 # <number> <abi> <name> <entry point>
 #
+# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
+#
 # The abi is "common", "64" or "x32" for this file.
 #
-0	common	read			sys_read
-1	common	write			sys_write
-2	common	open			sys_open
-3	common	close			sys_close
-4	common	stat			sys_newstat
-5	common	fstat			sys_newfstat
-6	common	lstat			sys_newlstat
-7	common	poll			sys_poll
-8	common	lseek			sys_lseek
-9	common	mmap			sys_mmap
-10	common	mprotect		sys_mprotect
-11	common	munmap			sys_munmap
-12	common	brk			sys_brk
-13	64	rt_sigaction		sys_rt_sigaction
-14	common	rt_sigprocmask		sys_rt_sigprocmask
-15	64	rt_sigreturn		sys_rt_sigreturn/ptregs
-16	64	ioctl			sys_ioctl
-17	common	pread64			sys_pread64
-18	common	pwrite64		sys_pwrite64
-19	64	readv			sys_readv
-20	64	writev			sys_writev
-21	common	access			sys_access
-22	common	pipe			sys_pipe
-23	common	select			sys_select
-24	common	sched_yield		sys_sched_yield
-25	common	mremap			sys_mremap
-26	common	msync			sys_msync
-27	common	mincore			sys_mincore
-28	common	madvise			sys_madvise
-29	common	shmget			sys_shmget
-30	common	shmat			sys_shmat
-31	common	shmctl			sys_shmctl
-32	common	dup			sys_dup
-33	common	dup2			sys_dup2
-34	common	pause			sys_pause
-35	common	nanosleep		sys_nanosleep
-36	common	getitimer		sys_getitimer
-37	common	alarm			sys_alarm
-38	common	setitimer		sys_setitimer
-39	common	getpid			sys_getpid
-40	common	sendfile		sys_sendfile64
-41	common	socket			sys_socket
-42	common	connect			sys_connect
-43	common	accept			sys_accept
-44	common	sendto			sys_sendto
-45	64	recvfrom		sys_recvfrom
-46	64	sendmsg			sys_sendmsg
-47	64	recvmsg			sys_recvmsg
-48	common	shutdown		sys_shutdown
-49	common	bind			sys_bind
-50	common	listen			sys_listen
-51	common	getsockname		sys_getsockname
-52	common	getpeername		sys_getpeername
-53	common	socketpair		sys_socketpair
-54	64	setsockopt		sys_setsockopt
-55	64	getsockopt		sys_getsockopt
-56	common	clone			sys_clone/ptregs
-57	common	fork			sys_fork/ptregs
-58	common	vfork			sys_vfork/ptregs
-59	64	execve			sys_execve/ptregs
-60	common	exit			sys_exit
-61	common	wait4			sys_wait4
-62	common	kill			sys_kill
-63	common	uname			sys_newuname
-64	common	semget			sys_semget
-65	common	semop			sys_semop
-66	common	semctl			sys_semctl
-67	common	shmdt			sys_shmdt
-68	common	msgget			sys_msgget
-69	common	msgsnd			sys_msgsnd
-70	common	msgrcv			sys_msgrcv
-71	common	msgctl			sys_msgctl
-72	common	fcntl			sys_fcntl
-73	common	flock			sys_flock
-74	common	fsync			sys_fsync
-75	common	fdatasync		sys_fdatasync
-76	common	truncate		sys_truncate
-77	common	ftruncate		sys_ftruncate
-78	common	getdents		sys_getdents
-79	common	getcwd			sys_getcwd
-80	common	chdir			sys_chdir
-81	common	fchdir			sys_fchdir
-82	common	rename			sys_rename
-83	common	mkdir			sys_mkdir
-84	common	rmdir			sys_rmdir
-85	common	creat			sys_creat
-86	common	link			sys_link
-87	common	unlink			sys_unlink
-88	common	symlink			sys_symlink
-89	common	readlink		sys_readlink
-90	common	chmod			sys_chmod
-91	common	fchmod			sys_fchmod
-92	common	chown			sys_chown
-93	common	fchown			sys_fchown
-94	common	lchown			sys_lchown
-95	common	umask			sys_umask
-96	common	gettimeofday		sys_gettimeofday
-97	common	getrlimit		sys_getrlimit
-98	common	getrusage		sys_getrusage
-99	common	sysinfo			sys_sysinfo
-100	common	times			sys_times
-101	64	ptrace			sys_ptrace
-102	common	getuid			sys_getuid
-103	common	syslog			sys_syslog
-104	common	getgid			sys_getgid
-105	common	setuid			sys_setuid
-106	common	setgid			sys_setgid
-107	common	geteuid			sys_geteuid
-108	common	getegid			sys_getegid
-109	common	setpgid			sys_setpgid
-110	common	getppid			sys_getppid
-111	common	getpgrp			sys_getpgrp
-112	common	setsid			sys_setsid
-113	common	setreuid		sys_setreuid
-114	common	setregid		sys_setregid
-115	common	getgroups		sys_getgroups
-116	common	setgroups		sys_setgroups
-117	common	setresuid		sys_setresuid
-118	common	getresuid		sys_getresuid
-119	common	setresgid		sys_setresgid
-120	common	getresgid		sys_getresgid
-121	common	getpgid			sys_getpgid
-122	common	setfsuid		sys_setfsuid
-123	common	setfsgid		sys_setfsgid
-124	common	getsid			sys_getsid
-125	common	capget			sys_capget
-126	common	capset			sys_capset
-127	64	rt_sigpending		sys_rt_sigpending
-128	64	rt_sigtimedwait		sys_rt_sigtimedwait
-129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
-130	common	rt_sigsuspend		sys_rt_sigsuspend
-131	64	sigaltstack		sys_sigaltstack
-132	common	utime			sys_utime
-133	common	mknod			sys_mknod
+0	common	read			__x64_sys_read
+1	common	write			__x64_sys_write
+2	common	open			__x64_sys_open
+3	common	close			__x64_sys_close
+4	common	stat			__x64_sys_newstat
+5	common	fstat			__x64_sys_newfstat
+6	common	lstat			__x64_sys_newlstat
+7	common	poll			__x64_sys_poll
+8	common	lseek			__x64_sys_lseek
+9	common	mmap			__x64_sys_mmap
+10	common	mprotect		__x64_sys_mprotect
+11	common	munmap			__x64_sys_munmap
+12	common	brk			__x64_sys_brk
+13	64	rt_sigaction		__x64_sys_rt_sigaction
+14	common	rt_sigprocmask		__x64_sys_rt_sigprocmask
+15	64	rt_sigreturn		__x64_sys_rt_sigreturn/ptregs
+16	64	ioctl			__x64_sys_ioctl
+17	common	pread64			__x64_sys_pread64
+18	common	pwrite64		__x64_sys_pwrite64
+19	64	readv			__x64_sys_readv
+20	64	writev			__x64_sys_writev
+21	common	access			__x64_sys_access
+22	common	pipe			__x64_sys_pipe
+23	common	select			__x64_sys_select
+24	common	sched_yield		__x64_sys_sched_yield
+25	common	mremap			__x64_sys_mremap
+26	common	msync			__x64_sys_msync
+27	common	mincore			__x64_sys_mincore
+28	common	madvise			__x64_sys_madvise
+29	common	shmget			__x64_sys_shmget
+30	common	shmat			__x64_sys_shmat
+31	common	shmctl			__x64_sys_shmctl
+32	common	dup			__x64_sys_dup
+33	common	dup2			__x64_sys_dup2
+34	common	pause			__x64_sys_pause
+35	common	nanosleep		__x64_sys_nanosleep
+36	common	getitimer		__x64_sys_getitimer
+37	common	alarm			__x64_sys_alarm
+38	common	setitimer		__x64_sys_setitimer
+39	common	getpid			__x64_sys_getpid
+40	common	sendfile		__x64_sys_sendfile64
+41	common	socket			__x64_sys_socket
+42	common	connect			__x64_sys_connect
+43	common	accept			__x64_sys_accept
+44	common	sendto			__x64_sys_sendto
+45	64	recvfrom		__x64_sys_recvfrom
+46	64	sendmsg			__x64_sys_sendmsg
+47	64	recvmsg			__x64_sys_recvmsg
+48	common	shutdown		__x64_sys_shutdown
+49	common	bind			__x64_sys_bind
+50	common	listen			__x64_sys_listen
+51	common	getsockname		__x64_sys_getsockname
+52	common	getpeername		__x64_sys_getpeername
+53	common	socketpair		__x64_sys_socketpair
+54	64	setsockopt		__x64_sys_setsockopt
+55	64	getsockopt		__x64_sys_getsockopt
+56	common	clone			__x64_sys_clone/ptregs
+57	common	fork			__x64_sys_fork/ptregs
+58	common	vfork			__x64_sys_vfork/ptregs
+59	64	execve			__x64_sys_execve/ptregs
+60	common	exit			__x64_sys_exit
+61	common	wait4			__x64_sys_wait4
+62	common	kill			__x64_sys_kill
+63	common	uname			__x64_sys_newuname
+64	common	semget			__x64_sys_semget
+65	common	semop			__x64_sys_semop
+66	common	semctl			__x64_sys_semctl
+67	common	shmdt			__x64_sys_shmdt
+68	common	msgget			__x64_sys_msgget
+69	common	msgsnd			__x64_sys_msgsnd
+70	common	msgrcv			__x64_sys_msgrcv
+71	common	msgctl			__x64_sys_msgctl
+72	common	fcntl			__x64_sys_fcntl
+73	common	flock			__x64_sys_flock
+74	common	fsync			__x64_sys_fsync
+75	common	fdatasync		__x64_sys_fdatasync
+76	common	truncate		__x64_sys_truncate
+77	common	ftruncate		__x64_sys_ftruncate
+78	common	getdents		__x64_sys_getdents
+79	common	getcwd			__x64_sys_getcwd
+80	common	chdir			__x64_sys_chdir
+81	common	fchdir			__x64_sys_fchdir
+82	common	rename			__x64_sys_rename
+83	common	mkdir			__x64_sys_mkdir
+84	common	rmdir			__x64_sys_rmdir
+85	common	creat			__x64_sys_creat
+86	common	link			__x64_sys_link
+87	common	unlink			__x64_sys_unlink
+88	common	symlink			__x64_sys_symlink
+89	common	readlink		__x64_sys_readlink
+90	common	chmod			__x64_sys_chmod
+91	common	fchmod			__x64_sys_fchmod
+92	common	chown			__x64_sys_chown
+93	common	fchown			__x64_sys_fchown
+94	common	lchown			__x64_sys_lchown
+95	common	umask			__x64_sys_umask
+96	common	gettimeofday		__x64_sys_gettimeofday
+97	common	getrlimit		__x64_sys_getrlimit
+98	common	getrusage		__x64_sys_getrusage
+99	common	sysinfo			__x64_sys_sysinfo
+100	common	times			__x64_sys_times
+101	64	ptrace			__x64_sys_ptrace
+102	common	getuid			__x64_sys_getuid
+103	common	syslog			__x64_sys_syslog
+104	common	getgid			__x64_sys_getgid
+105	common	setuid			__x64_sys_setuid
+106	common	setgid			__x64_sys_setgid
+107	common	geteuid			__x64_sys_geteuid
+108	common	getegid			__x64_sys_getegid
+109	common	setpgid			__x64_sys_setpgid
+110	common	getppid			__x64_sys_getppid
+111	common	getpgrp			__x64_sys_getpgrp
+112	common	setsid			__x64_sys_setsid
+113	common	setreuid		__x64_sys_setreuid
+114	common	setregid		__x64_sys_setregid
+115	common	getgroups		__x64_sys_getgroups
+116	common	setgroups		__x64_sys_setgroups
+117	common	setresuid		__x64_sys_setresuid
+118	common	getresuid		__x64_sys_getresuid
+119	common	setresgid		__x64_sys_setresgid
+120	common	getresgid		__x64_sys_getresgid
+121	common	getpgid			__x64_sys_getpgid
+122	common	setfsuid		__x64_sys_setfsuid
+123	common	setfsgid		__x64_sys_setfsgid
+124	common	getsid			__x64_sys_getsid
+125	common	capget			__x64_sys_capget
+126	common	capset			__x64_sys_capset
+127	64	rt_sigpending		__x64_sys_rt_sigpending
+128	64	rt_sigtimedwait		__x64_sys_rt_sigtimedwait
+129	64	rt_sigqueueinfo		__x64_sys_rt_sigqueueinfo
+130	common	rt_sigsuspend		__x64_sys_rt_sigsuspend
+131	64	sigaltstack		__x64_sys_sigaltstack
+132	common	utime			__x64_sys_utime
+133	common	mknod			__x64_sys_mknod
 134	64	uselib
-135	common	personality		sys_personality
-136	common	ustat			sys_ustat
-137	common	statfs			sys_statfs
-138	common	fstatfs			sys_fstatfs
-139	common	sysfs			sys_sysfs
-140	common	getpriority		sys_getpriority
-141	common	setpriority		sys_setpriority
-142	common	sched_setparam		sys_sched_setparam
-143	common	sched_getparam		sys_sched_getparam
-144	common	sched_setscheduler	sys_sched_setscheduler
-145	common	sched_getscheduler	sys_sched_getscheduler
-146	common	sched_get_priority_max	sys_sched_get_priority_max
-147	common	sched_get_priority_min	sys_sched_get_priority_min
-148	common	sched_rr_get_interval	sys_sched_rr_get_interval
-149	common	mlock			sys_mlock
-150	common	munlock			sys_munlock
-151	common	mlockall		sys_mlockall
-152	common	munlockall		sys_munlockall
-153	common	vhangup			sys_vhangup
-154	common	modify_ldt		sys_modify_ldt
-155	common	pivot_root		sys_pivot_root
-156	64	_sysctl			sys_sysctl
-157	common	prctl			sys_prctl
-158	common	arch_prctl		sys_arch_prctl
-159	common	adjtimex		sys_adjtimex
-160	common	setrlimit		sys_setrlimit
-161	common	chroot			sys_chroot
-162	common	sync			sys_sync
-163	common	acct			sys_acct
-164	common	settimeofday		sys_settimeofday
-165	common	mount			sys_mount
-166	common	umount2			sys_umount
-167	common	swapon			sys_swapon
-168	common	swapoff			sys_swapoff
-169	common	reboot			sys_reboot
-170	common	sethostname		sys_sethostname
-171	common	setdomainname		sys_setdomainname
-172	common	iopl			sys_iopl/ptregs
-173	common	ioperm			sys_ioperm
+135	common	personality		__x64_sys_personality
+136	common	ustat			__x64_sys_ustat
+137	common	statfs			__x64_sys_statfs
+138	common	fstatfs			__x64_sys_fstatfs
+139	common	sysfs			__x64_sys_sysfs
+140	common	getpriority		__x64_sys_getpriority
+141	common	setpriority		__x64_sys_setpriority
+142	common	sched_setparam		__x64_sys_sched_setparam
+143	common	sched_getparam		__x64_sys_sched_getparam
+144	common	sched_setscheduler	__x64_sys_sched_setscheduler
+145	common	sched_getscheduler	__x64_sys_sched_getscheduler
+146	common	sched_get_priority_max	__x64_sys_sched_get_priority_max
+147	common	sched_get_priority_min	__x64_sys_sched_get_priority_min
+148	common	sched_rr_get_interval	__x64_sys_sched_rr_get_interval
+149	common	mlock			__x64_sys_mlock
+150	common	munlock			__x64_sys_munlock
+151	common	mlockall		__x64_sys_mlockall
+152	common	munlockall		__x64_sys_munlockall
+153	common	vhangup			__x64_sys_vhangup
+154	common	modify_ldt		__x64_sys_modify_ldt
+155	common	pivot_root		__x64_sys_pivot_root
+156	64	_sysctl			__x64_sys_sysctl
+157	common	prctl			__x64_sys_prctl
+158	common	arch_prctl		__x64_sys_arch_prctl
+159	common	adjtimex		__x64_sys_adjtimex
+160	common	setrlimit		__x64_sys_setrlimit
+161	common	chroot			__x64_sys_chroot
+162	common	sync			__x64_sys_sync
+163	common	acct			__x64_sys_acct
+164	common	settimeofday		__x64_sys_settimeofday
+165	common	mount			__x64_sys_mount
+166	common	umount2			__x64_sys_umount
+167	common	swapon			__x64_sys_swapon
+168	common	swapoff			__x64_sys_swapoff
+169	common	reboot			__x64_sys_reboot
+170	common	sethostname		__x64_sys_sethostname
+171	common	setdomainname		__x64_sys_setdomainname
+172	common	iopl			__x64_sys_iopl/ptregs
+173	common	ioperm			__x64_sys_ioperm
 174	64	create_module
-175	common	init_module		sys_init_module
-176	common	delete_module		sys_delete_module
+175	common	init_module		__x64_sys_init_module
+176	common	delete_module		__x64_sys_delete_module
 177	64	get_kernel_syms
 178	64	query_module
-179	common	quotactl		sys_quotactl
+179	common	quotactl		__x64_sys_quotactl
 180	64	nfsservctl
 181	common	getpmsg
 182	common	putpmsg
 183	common	afs_syscall
 184	common	tuxcall
 185	common	security
-186	common	gettid			sys_gettid
-187	common	readahead		sys_readahead
-188	common	setxattr		sys_setxattr
-189	common	lsetxattr		sys_lsetxattr
-190	common	fsetxattr		sys_fsetxattr
-191	common	getxattr		sys_getxattr
-192	common	lgetxattr		sys_lgetxattr
-193	common	fgetxattr		sys_fgetxattr
-194	common	listxattr		sys_listxattr
-195	common	llistxattr		sys_llistxattr
-196	common	flistxattr		sys_flistxattr
-197	common	removexattr		sys_removexattr
-198	common	lremovexattr		sys_lremovexattr
-199	common	fremovexattr		sys_fremovexattr
-200	common	tkill			sys_tkill
-201	common	time			sys_time
-202	common	futex			sys_futex
-203	common	sched_setaffinity	sys_sched_setaffinity
-204	common	sched_getaffinity	sys_sched_getaffinity
+186	common	gettid			__x64_sys_gettid
+187	common	readahead		__x64_sys_readahead
+188	common	setxattr		__x64_sys_setxattr
+189	common	lsetxattr		__x64_sys_lsetxattr
+190	common	fsetxattr		__x64_sys_fsetxattr
+191	common	getxattr		__x64_sys_getxattr
+192	common	lgetxattr		__x64_sys_lgetxattr
+193	common	fgetxattr		__x64_sys_fgetxattr
+194	common	listxattr		__x64_sys_listxattr
+195	common	llistxattr		__x64_sys_llistxattr
+196	common	flistxattr		__x64_sys_flistxattr
+197	common	removexattr		__x64_sys_removexattr
+198	common	lremovexattr		__x64_sys_lremovexattr
+199	common	fremovexattr		__x64_sys_fremovexattr
+200	common	tkill			__x64_sys_tkill
+201	common	time			__x64_sys_time
+202	common	futex			__x64_sys_futex
+203	common	sched_setaffinity	__x64_sys_sched_setaffinity
+204	common	sched_getaffinity	__x64_sys_sched_getaffinity
 205	64	set_thread_area
-206	64	io_setup		sys_io_setup
-207	common	io_destroy		sys_io_destroy
-208	common	io_getevents		sys_io_getevents
-209	64	io_submit		sys_io_submit
-210	common	io_cancel		sys_io_cancel
+206	64	io_setup		__x64_sys_io_setup
+207	common	io_destroy		__x64_sys_io_destroy
+208	common	io_getevents		__x64_sys_io_getevents
+209	64	io_submit		__x64_sys_io_submit
+210	common	io_cancel		__x64_sys_io_cancel
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
-213	common	epoll_create		sys_epoll_create
+212	common	lookup_dcookie		__x64_sys_lookup_dcookie
+213	common	epoll_create		__x64_sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
-216	common	remap_file_pages	sys_remap_file_pages
-217	common	getdents64		sys_getdents64
-218	common	set_tid_address		sys_set_tid_address
-219	common	restart_syscall		sys_restart_syscall
-220	common	semtimedop		sys_semtimedop
-221	common	fadvise64		sys_fadvise64
-222	64	timer_create		sys_timer_create
-223	common	timer_settime		sys_timer_settime
-224	common	timer_gettime		sys_timer_gettime
-225	common	timer_getoverrun	sys_timer_getoverrun
-226	common	timer_delete		sys_timer_delete
-227	common	clock_settime		sys_clock_settime
-228	common	clock_gettime		sys_clock_gettime
-229	common	clock_getres		sys_clock_getres
-230	common	clock_nanosleep		sys_clock_nanosleep
-231	common	exit_group		sys_exit_group
-232	common	epoll_wait		sys_epoll_wait
-233	common	epoll_ctl		sys_epoll_ctl
-234	common	tgkill			sys_tgkill
-235	common	utimes			sys_utimes
+216	common	remap_file_pages	__x64_sys_remap_file_pages
+217	common	getdents64		__x64_sys_getdents64
+218	common	set_tid_address		__x64_sys_set_tid_address
+219	common	restart_syscall		__x64_sys_restart_syscall
+220	common	semtimedop		__x64_sys_semtimedop
+221	common	fadvise64		__x64_sys_fadvise64
+222	64	timer_create		__x64_sys_timer_create
+223	common	timer_settime		__x64_sys_timer_settime
+224	common	timer_gettime		__x64_sys_timer_gettime
+225	common	timer_getoverrun	__x64_sys_timer_getoverrun
+226	common	timer_delete		__x64_sys_timer_delete
+227	common	clock_settime		__x64_sys_clock_settime
+228	common	clock_gettime		__x64_sys_clock_gettime
+229	common	clock_getres		__x64_sys_clock_getres
+230	common	clock_nanosleep		__x64_sys_clock_nanosleep
+231	common	exit_group		__x64_sys_exit_group
+232	common	epoll_wait		__x64_sys_epoll_wait
+233	common	epoll_ctl		__x64_sys_epoll_ctl
+234	common	tgkill			__x64_sys_tgkill
+235	common	utimes			__x64_sys_utimes
 236	64	vserver
-237	common	mbind			sys_mbind
-238	common	set_mempolicy		sys_set_mempolicy
-239	common	get_mempolicy		sys_get_mempolicy
-240	common	mq_open			sys_mq_open
-241	common	mq_unlink		sys_mq_unlink
-242	common	mq_timedsend		sys_mq_timedsend
-243	common	mq_timedreceive		sys_mq_timedreceive
-244	64	mq_notify		sys_mq_notify
-245	common	mq_getsetattr		sys_mq_getsetattr
-246	64	kexec_load		sys_kexec_load
-247	64	waitid			sys_waitid
-248	common	add_key			sys_add_key
-249	common	request_key		sys_request_key
-250	common	keyctl			sys_keyctl
-251	common	ioprio_set		sys_ioprio_set
-252	common	ioprio_get		sys_ioprio_get
-253	common	inotify_init		sys_inotify_init
-254	common	inotify_add_watch	sys_inotify_add_watch
-255	common	inotify_rm_watch	sys_inotify_rm_watch
-256	common	migrate_pages		sys_migrate_pages
-257	common	openat			sys_openat
-258	common	mkdirat			sys_mkdirat
-259	common	mknodat			sys_mknodat
-260	common	fchownat		sys_fchownat
-261	common	futimesat		sys_futimesat
-262	common	newfstatat		sys_newfstatat
-263	common	unlinkat		sys_unlinkat
-264	common	renameat		sys_renameat
-265	common	linkat			sys_linkat
-266	common	symlinkat		sys_symlinkat
-267	common	readlinkat		sys_readlinkat
-268	common	fchmodat		sys_fchmodat
-269	common	faccessat		sys_faccessat
-270	common	pselect6		sys_pselect6
-271	common	ppoll			sys_ppoll
-272	common	unshare			sys_unshare
-273	64	set_robust_list		sys_set_robust_list
-274	64	get_robust_list		sys_get_robust_list
-275	common	splice			sys_splice
-276	common	tee			sys_tee
-277	common	sync_file_range		sys_sync_file_range
-278	64	vmsplice		sys_vmsplice
-279	64	move_pages		sys_move_pages
-280	common	utimensat		sys_utimensat
-281	common	epoll_pwait		sys_epoll_pwait
-282	common	signalfd		sys_signalfd
-283	common	timerfd_create		sys_timerfd_create
-284	common	eventfd			sys_eventfd
-285	common	fallocate		sys_fallocate
-286	common	timerfd_settime		sys_timerfd_settime
-287	common	timerfd_gettime		sys_timerfd_gettime
-288	common	accept4			sys_accept4
-289	common	signalfd4		sys_signalfd4
-290	common	eventfd2		sys_eventfd2
-291	common	epoll_create1		sys_epoll_create1
-292	common	dup3			sys_dup3
-293	common	pipe2			sys_pipe2
-294	common	inotify_init1		sys_inotify_init1
-295	64	preadv			sys_preadv
-296	64	pwritev			sys_pwritev
-297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
-298	common	perf_event_open		sys_perf_event_open
-299	64	recvmmsg		sys_recvmmsg
-300	common	fanotify_init		sys_fanotify_init
-301	common	fanotify_mark		sys_fanotify_mark
-302	common	prlimit64		sys_prlimit64
-303	common	name_to_handle_at	sys_name_to_handle_at
-304	common	open_by_handle_at	sys_open_by_handle_at
-305	common	clock_adjtime		sys_clock_adjtime
-306	common	syncfs			sys_syncfs
-307	64	sendmmsg		sys_sendmmsg
-308	common	setns			sys_setns
-309	common	getcpu			sys_getcpu
-310	64	process_vm_readv	sys_process_vm_readv
-311	64	process_vm_writev	sys_process_vm_writev
-312	common	kcmp			sys_kcmp
-313	common	finit_module		sys_finit_module
-314	common	sched_setattr		sys_sched_setattr
-315	common	sched_getattr		sys_sched_getattr
-316	common	renameat2		sys_renameat2
-317	common	seccomp			sys_seccomp
-318	common	getrandom		sys_getrandom
-319	common	memfd_create		sys_memfd_create
-320	common	kexec_file_load		sys_kexec_file_load
-321	common	bpf			sys_bpf
-322	64	execveat		sys_execveat/ptregs
-323	common	userfaultfd		sys_userfaultfd
-324	common	membarrier		sys_membarrier
-325	common	mlock2			sys_mlock2
-326	common	copy_file_range		sys_copy_file_range
-327	64	preadv2			sys_preadv2
-328	64	pwritev2		sys_pwritev2
-329	common	pkey_mprotect		sys_pkey_mprotect
-330	common	pkey_alloc		sys_pkey_alloc
-331	common	pkey_free		sys_pkey_free
-332	common	statx			sys_statx
+237	common	mbind			__x64_sys_mbind
+238	common	set_mempolicy		__x64_sys_set_mempolicy
+239	common	get_mempolicy		__x64_sys_get_mempolicy
+240	common	mq_open			__x64_sys_mq_open
+241	common	mq_unlink		__x64_sys_mq_unlink
+242	common	mq_timedsend		__x64_sys_mq_timedsend
+243	common	mq_timedreceive		__x64_sys_mq_timedreceive
+244	64	mq_notify		__x64_sys_mq_notify
+245	common	mq_getsetattr		__x64_sys_mq_getsetattr
+246	64	kexec_load		__x64_sys_kexec_load
+247	64	waitid			__x64_sys_waitid
+248	common	add_key			__x64_sys_add_key
+249	common	request_key		__x64_sys_request_key
+250	common	keyctl			__x64_sys_keyctl
+251	common	ioprio_set		__x64_sys_ioprio_set
+252	common	ioprio_get		__x64_sys_ioprio_get
+253	common	inotify_init		__x64_sys_inotify_init
+254	common	inotify_add_watch	__x64_sys_inotify_add_watch
+255	common	inotify_rm_watch	__x64_sys_inotify_rm_watch
+256	common	migrate_pages		__x64_sys_migrate_pages
+257	common	openat			__x64_sys_openat
+258	common	mkdirat			__x64_sys_mkdirat
+259	common	mknodat			__x64_sys_mknodat
+260	common	fchownat		__x64_sys_fchownat
+261	common	futimesat		__x64_sys_futimesat
+262	common	newfstatat		__x64_sys_newfstatat
+263	common	unlinkat		__x64_sys_unlinkat
+264	common	renameat		__x64_sys_renameat
+265	common	linkat			__x64_sys_linkat
+266	common	symlinkat		__x64_sys_symlinkat
+267	common	readlinkat		__x64_sys_readlinkat
+268	common	fchmodat		__x64_sys_fchmodat
+269	common	faccessat		__x64_sys_faccessat
+270	common	pselect6		__x64_sys_pselect6
+271	common	ppoll			__x64_sys_ppoll
+272	common	unshare			__x64_sys_unshare
+273	64	set_robust_list		__x64_sys_set_robust_list
+274	64	get_robust_list		__x64_sys_get_robust_list
+275	common	splice			__x64_sys_splice
+276	common	tee			__x64_sys_tee
+277	common	sync_file_range		__x64_sys_sync_file_range
+278	64	vmsplice		__x64_sys_vmsplice
+279	64	move_pages		__x64_sys_move_pages
+280	common	utimensat		__x64_sys_utimensat
+281	common	epoll_pwait		__x64_sys_epoll_pwait
+282	common	signalfd		__x64_sys_signalfd
+283	common	timerfd_create		__x64_sys_timerfd_create
+284	common	eventfd			__x64_sys_eventfd
+285	common	fallocate		__x64_sys_fallocate
+286	common	timerfd_settime		__x64_sys_timerfd_settime
+287	common	timerfd_gettime		__x64_sys_timerfd_gettime
+288	common	accept4			__x64_sys_accept4
+289	common	signalfd4		__x64_sys_signalfd4
+290	common	eventfd2		__x64_sys_eventfd2
+291	common	epoll_create1		__x64_sys_epoll_create1
+292	common	dup3			__x64_sys_dup3
+293	common	pipe2			__x64_sys_pipe2
+294	common	inotify_init1		__x64_sys_inotify_init1
+295	64	preadv			__x64_sys_preadv
+296	64	pwritev			__x64_sys_pwritev
+297	64	rt_tgsigqueueinfo	__x64_sys_rt_tgsigqueueinfo
+298	common	perf_event_open		__x64_sys_perf_event_open
+299	64	recvmmsg		__x64_sys_recvmmsg
+300	common	fanotify_init		__x64_sys_fanotify_init
+301	common	fanotify_mark		__x64_sys_fanotify_mark
+302	common	prlimit64		__x64_sys_prlimit64
+303	common	name_to_handle_at	__x64_sys_name_to_handle_at
+304	common	open_by_handle_at	__x64_sys_open_by_handle_at
+305	common	clock_adjtime		__x64_sys_clock_adjtime
+306	common	syncfs			__x64_sys_syncfs
+307	64	sendmmsg		__x64_sys_sendmmsg
+308	common	setns			__x64_sys_setns
+309	common	getcpu			__x64_sys_getcpu
+310	64	process_vm_readv	__x64_sys_process_vm_readv
+311	64	process_vm_writev	__x64_sys_process_vm_writev
+312	common	kcmp			__x64_sys_kcmp
+313	common	finit_module		__x64_sys_finit_module
+314	common	sched_setattr		__x64_sys_sched_setattr
+315	common	sched_getattr		__x64_sys_sched_getattr
+316	common	renameat2		__x64_sys_renameat2
+317	common	seccomp			__x64_sys_seccomp
+318	common	getrandom		__x64_sys_getrandom
+319	common	memfd_create		__x64_sys_memfd_create
+320	common	kexec_file_load		__x64_sys_kexec_file_load
+321	common	bpf			__x64_sys_bpf
+322	64	execveat		__x64_sys_execveat/ptregs
+323	common	userfaultfd		__x64_sys_userfaultfd
+324	common	membarrier		__x64_sys_membarrier
+325	common	mlock2			__x64_sys_mlock2
+326	common	copy_file_range		__x64_sys_copy_file_range
+327	64	preadv2			__x64_sys_preadv2
+328	64	pwritev2		__x64_sys_pwritev2
+329	common	pkey_mprotect		__x64_sys_pkey_mprotect
+330	common	pkey_alloc		__x64_sys_pkey_alloc
+331	common	pkey_free		__x64_sys_pkey_free
+332	common	statx			__x64_sys_statx
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-# for native 64-bit operation.
+# for native 64-bit operation. The __x32_compat_sys stubs are created
+# on-the-fly for compat_sys_*() compatibility system calls if X86_X32
+# is defined.
 #
-512	x32	rt_sigaction		compat_sys_rt_sigaction
+512	x32	rt_sigaction		__x32_compat_sys_rt_sigaction
 513	x32	rt_sigreturn		sys32_x32_rt_sigreturn
-514	x32	ioctl			compat_sys_ioctl
-515	x32	readv			compat_sys_readv
-516	x32	writev			compat_sys_writev
-517	x32	recvfrom		compat_sys_recvfrom
-518	x32	sendmsg			compat_sys_sendmsg
-519	x32	recvmsg			compat_sys_recvmsg
-520	x32	execve			compat_sys_execve/ptregs
-521	x32	ptrace			compat_sys_ptrace
-522	x32	rt_sigpending		compat_sys_rt_sigpending
-523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
-524	x32	rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
-525	x32	sigaltstack		compat_sys_sigaltstack
-526	x32	timer_create		compat_sys_timer_create
-527	x32	mq_notify		compat_sys_mq_notify
-528	x32	kexec_load		compat_sys_kexec_load
-529	x32	waitid			compat_sys_waitid
-530	x32	set_robust_list		compat_sys_set_robust_list
-531	x32	get_robust_list		compat_sys_get_robust_list
-532	x32	vmsplice		compat_sys_vmsplice
-533	x32	move_pages		compat_sys_move_pages
-534	x32	preadv			compat_sys_preadv64
-535	x32	pwritev			compat_sys_pwritev64
-536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
-537	x32	recvmmsg		compat_sys_recvmmsg
-538	x32	sendmmsg		compat_sys_sendmmsg
-539	x32	process_vm_readv	compat_sys_process_vm_readv
-540	x32	process_vm_writev	compat_sys_process_vm_writev
-541	x32	setsockopt		compat_sys_setsockopt
-542	x32	getsockopt		compat_sys_getsockopt
-543	x32	io_setup		compat_sys_io_setup
-544	x32	io_submit		compat_sys_io_submit
-545	x32	execveat		compat_sys_execveat/ptregs
-546	x32	preadv2			compat_sys_preadv64v2
-547	x32	pwritev2		compat_sys_pwritev64v2
+514	x32	ioctl			__x32_compat_sys_ioctl
+515	x32	readv			__x32_compat_sys_readv
+516	x32	writev			__x32_compat_sys_writev
+517	x32	recvfrom		__x32_compat_sys_recvfrom
+518	x32	sendmsg			__x32_compat_sys_sendmsg
+519	x32	recvmsg			__x32_compat_sys_recvmsg
+520	x32	execve			__x32_compat_sys_execve/ptregs
+521	x32	ptrace			__x32_compat_sys_ptrace
+522	x32	rt_sigpending		__x32_compat_sys_rt_sigpending
+523	x32	rt_sigtimedwait		__x32_compat_sys_rt_sigtimedwait
+524	x32	rt_sigqueueinfo		__x32_compat_sys_rt_sigqueueinfo
+525	x32	sigaltstack		__x32_compat_sys_sigaltstack
+526	x32	timer_create		__x32_compat_sys_timer_create
+527	x32	mq_notify		__x32_compat_sys_mq_notify
+528	x32	kexec_load		__x32_compat_sys_kexec_load
+529	x32	waitid			__x32_compat_sys_waitid
+530	x32	set_robust_list		__x32_compat_sys_set_robust_list
+531	x32	get_robust_list		__x32_compat_sys_get_robust_list
+532	x32	vmsplice		__x32_compat_sys_vmsplice
+533	x32	move_pages		__x32_compat_sys_move_pages
+534	x32	preadv			__x32_compat_sys_preadv64
+535	x32	pwritev			__x32_compat_sys_pwritev64
+536	x32	rt_tgsigqueueinfo	__x32_compat_sys_rt_tgsigqueueinfo
+537	x32	recvmmsg		__x32_compat_sys_recvmmsg
+538	x32	sendmmsg		__x32_compat_sys_sendmmsg
+539	x32	process_vm_readv	__x32_compat_sys_process_vm_readv
+540	x32	process_vm_writev	__x32_compat_sys_process_vm_writev
+541	x32	setsockopt		__x32_compat_sys_setsockopt
+542	x32	getsockopt		__x32_compat_sys_getsockopt
+543	x32	io_setup		__x32_compat_sys_io_setup
+544	x32	io_submit		__x32_compat_sys_io_submit
+545	x32	execveat		__x32_compat_sys_execveat/ptregs
+546	x32	preadv2			__x32_compat_sys_preadv64v2
+547	x32	pwritev2		__x32_compat_sys_pwritev64v2

From 101592b4904ecf6b8ed2a4784d41d180319d95a1 Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Mon, 9 Apr 2018 10:25:32 +0300
Subject: [PATCH 176/288] perf/core: Store context switch out type in
 PERF_RECORD_SWITCH[_CPU_WIDE]

Store preempting context switch out event into Perf trace as a part of
PERF_RECORD_SWITCH[_CPU_WIDE] record.

Percentage of preempting and non-preempting context switches help
understanding the nature of workloads (CPU or IO bound) that are running
on a machine;

The event is treated as preemption one when task->state value of the
thread being switched out is TASK_RUNNING. Event type encoding is
implemented using PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit;

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/9ff84e83-a0ca-dd82-a6d0-cb951689be74@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/perf_event.h       | 18 +++++++++++++++---
 kernel/events/core.c                  |  4 ++++
 tools/include/uapi/linux/perf_event.h | 18 +++++++++++++++---
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 912b85b52344..b8e288a1f740 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -650,11 +650,23 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
 #define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
 /*
- * Indicates that the content of PERF_SAMPLE_IP points to
- * the actual instruction that triggered the event. See also
- * perf_event_attr::precise_ip.
+ * These PERF_RECORD_MISC_* flags below are safely reused
+ * for the following events:
+ *
+ *   PERF_RECORD_MISC_EXACT_IP           - PERF_RECORD_SAMPLE of precise events
+ *   PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events
+ *
+ *
+ * PERF_RECORD_MISC_EXACT_IP:
+ *   Indicates that the content of PERF_SAMPLE_IP points to
+ *   the actual instruction that triggered the event. See also
+ *   perf_event_attr::precise_ip.
+ *
+ * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT:
+ *   Indicates that thread was preempted in TASK_RUNNING state.
  */
 #define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
+#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT	(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2d5fe26551f8..1bae80aaabfb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7587,6 +7587,10 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
+	if (!sched_in && task->state == TASK_RUNNING)
+		switch_event.event_id.header.misc |=
+				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+
 	perf_iterate_sb(perf_event_switch_output,
 		       &switch_event,
 		       NULL);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 912b85b52344..b8e288a1f740 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -650,11 +650,23 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
 #define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
 /*
- * Indicates that the content of PERF_SAMPLE_IP points to
- * the actual instruction that triggered the event. See also
- * perf_event_attr::precise_ip.
+ * These PERF_RECORD_MISC_* flags below are safely reused
+ * for the following events:
+ *
+ *   PERF_RECORD_MISC_EXACT_IP           - PERF_RECORD_SAMPLE of precise events
+ *   PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events
+ *
+ *
+ * PERF_RECORD_MISC_EXACT_IP:
+ *   Indicates that the content of PERF_SAMPLE_IP points to
+ *   the actual instruction that triggered the event. See also
+ *   perf_event_attr::precise_ip.
+ *
+ * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT:
+ *   Indicates that thread was preempted in TASK_RUNNING state.
  */
 #define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
+#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT	(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */

From b3f35b5d5d36fba9311d1a965fcce2dd35614f2e Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Mon, 9 Apr 2018 10:26:05 +0300
Subject: [PATCH 177/288] perf report: Extend raw dump (-D) out with switch out
 event type

Print additional 'preempt' tag for PERF_RECORD_SWITCH[_CPU_WIDE] OUT records when
event header misc field contains PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit set
designating preemption context switch out event:

tools/perf/perf report -D -i perf.data | grep _SWITCH

0 768361415226 0x27f076 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid:     8/8
4 768362216813 0x28f45e [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT          next pid/tid:     0/0
4 768362217824 0x28f486 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid:  4073/4073
0 768362414027 0x27f0ce [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT preempt  next pid/tid:     8/8
0 768362414367 0x27f0f6 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid:     0/0

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/6f5aebb9-b96c-f304-f08f-8f046d38de4f@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index f0a6cbd033cc..98ff3a6a3d50 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1421,7 +1421,9 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
 size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp)
 {
 	bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
-	const char *in_out = out ? "OUT" : "IN ";
+	const char *in_out = !out ? "IN         " :
+		!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) ?
+				    "OUT        " : "OUT preempt";
 
 	if (event->header.type == PERF_RECORD_SWITCH)
 		return fprintf(fp, " %s\n", in_out);

From bf30cc1882d2c65aaf92842cc9bcf06565eab73c Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Mon, 9 Apr 2018 10:26:46 +0300
Subject: [PATCH 178/288] perf script: Extend misc field decoding with switch
 out event type

Append 'p' sign to 'S' tag designating the type of context switch out event so
'Sp' means preemption context switch. Documentation is extended to cover
new presentation changes.

  $ perf script --show-switch-events -F +misc -I -i perf.data:

          hdparm 4073 [004] U  762.198265:     380194 cycles:ppp:      7faf727f5a23 strchr (/usr/lib64/ld-2.26.so)
          hdparm 4073 [004] K  762.198366:     441572 cycles:ppp:  ffffffffb9218435 alloc_set_pte (/lib/modules/4.16.0-rc6+/build/vmlinux)
          hdparm 4073 [004] S  762.198391: PERF_RECORD_SWITCH_CPU_WIDE OUT          next pid/tid:    0/0
         swapper    0 [004]    762.198392: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid: 4073/4073
         swapper    0 [004] Sp 762.198477: PERF_RECORD_SWITCH_CPU_WIDE OUT preempt  next pid/tid: 4073/4073
          hdparm 4073 [004]    762.198478: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid:    0/0
         swapper    0 [007] K  762.198514:    2303073 cycles:ppp:  ffffffffb98b0c66 intel_idle (/lib/modules/4.16.0-rc6+/build/vmlinux)
         swapper    0 [007] Sp 762.198561: PERF_RECORD_SWITCH_CPU_WIDE OUT preempt  next pid/tid: 1134/1134
  kworker/u16:18 1134 [007]    762.198562: PERF_RECORD_SWITCH_CPU_WIDE IN           prev pid/tid:    0/0
  kworker/u16:18 1134 [007] S  762.198567: PERF_RECORD_SWITCH_CPU_WIDE OUT          next pid/tid:    0/0

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/5fc65ce7-8ca5-53ae-8858-8ddd27290575@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-script.txt | 17 +++++++++--------
 tools/perf/builtin-script.c              |  5 ++++-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 36ec0257f8d3..afdafe2110a1 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -228,14 +228,15 @@ OPTIONS
 	For sample events it's possible to display misc field with -F +misc option,
 	following letters are displayed for each bit:
 
-	  PERF_RECORD_MISC_KERNEL        K
-	  PERF_RECORD_MISC_USER          U
-	  PERF_RECORD_MISC_HYPERVISOR    H
-	  PERF_RECORD_MISC_GUEST_KERNEL  G
-	  PERF_RECORD_MISC_GUEST_USER    g
-	  PERF_RECORD_MISC_MMAP_DATA*    M
-	  PERF_RECORD_MISC_COMM_EXEC     E
-	  PERF_RECORD_MISC_SWITCH_OUT    S
+	  PERF_RECORD_MISC_KERNEL               K
+	  PERF_RECORD_MISC_USER                 U
+	  PERF_RECORD_MISC_HYPERVISOR           H
+	  PERF_RECORD_MISC_GUEST_KERNEL         G
+	  PERF_RECORD_MISC_GUEST_USER           g
+	  PERF_RECORD_MISC_MMAP_DATA*           M
+	  PERF_RECORD_MISC_COMM_EXEC            E
+	  PERF_RECORD_MISC_SWITCH_OUT           S
+	  PERF_RECORD_MISC_SWITCH_OUT_PREEMPT   Sp
 
 	  $ perf script -F +misc ...
 	   sched-messaging  1414 K     28690.636582:       4590 cycles ...
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b17edbcd98cc..e0a9845b6cbc 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -657,8 +657,11 @@ static int perf_sample__fprintf_start(struct perf_sample *sample,
 			break;
 		case PERF_RECORD_SWITCH:
 		case PERF_RECORD_SWITCH_CPU_WIDE:
-			if (has(SWITCH_OUT))
+			if (has(SWITCH_OUT)) {
 				ret += fprintf(fp, "S");
+				if (sample->misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT)
+					ret += fprintf(fp, "p");
+			}
 		default:
 			break;
 		}

From 038586c34301578e538f6c5aa79ca82bce1b9152 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.vnet.ibm.com>
Date: Mon, 16 Apr 2018 15:23:14 +0200
Subject: [PATCH 179/288] perf list: Add s390 support for detailed/verbose PMU
 event description

'perf list' with flags -d and -v print a description (-d) or a very
verbose explanation (-v) of CPU specific counter events.  These
descriptions are provided with the json files in directory
pmu-events/arch/s390/*.json.

Display of these descriptions on s390 requires the corresponding json
files.

On s390 this does not work because function is_pmu_core() does not
detect the s390 directory name where the CPU specific events are listed.
On x86 it is:

  /sys/bus/event_source/devices/cpu

whereas on s390 it is:

  /sys/bus/event_source/devices/cpum_cf
  /sys/bus/event_source/devices/cpum_sf

Fix this by adding s390 directory name testing to function
is_pmu_core(). This is the same approach as taken for the ARM platform.

Output before:

[root@s35lp76 perf]# ./perf list -d pmu
List of pre-defined events (to be used in -e):

  cpum_cf/AES_BLOCKED_CYCLES/      [Kernel PMU event]
  cpum_cf/AES_BLOCKED_FUNCTIONS/   [Kernel PMU event]
  cpum_cf/AES_CYCLES/              [Kernel PMU event]
  cpum_cf/AES_FUNCTIONS/           [Kernel PMU event]
  ....
  cpum_cf/TX_NC_TEND/              [Kernel PMU event]
  cpum_cf/VX_BCD_EXECUTION_SLOTS/  [Kernel PMU event]
  cpum_sf/SF_CYCLES_BASIC/         [Kernel PMU event]

Output after:

[root@s35lp76 perf]# ./perf list -d pmu
List of pre-defined events (to be used in -e):

  cpum_cf/AES_BLOCKED_CYCLES/      [Kernel PMU event]
  cpum_cf/AES_BLOCKED_FUNCTIONS/   [Kernel PMU event]
  cpum_cf/AES_CYCLES/              [Kernel PMU event]
  cpum_cf/AES_FUNCTIONS/           [Kernel PMU event]
  ....
  cpum_cf/TX_NC_TEND/              [Kernel PMU event]
  cpum_cf/VX_BCD_EXECUTION_SLOTS/  [Kernel PMU event]
  cpum_sf/SF_CYCLES_BASIC/         [Kernel PMU event]

3906:
  bcd_dfp_execution_slots
       [BCD DFP Execution Slots]
  decimal_instructions
       [Decimal Instructions]
  dtlb2_gpage_writes
       [DTLB2 GPAGE Writes]
  dtlb2_hpage_writes
       [DTLB2 HPAGE Writes]
  dtlb2_misses
       [DTLB2 Misses]
  dtlb2_writes
       [DTLB2 Writes]
  itlb2_misses
       [ITLB2 Misses]
  itlb2_writes
       [ITLB2 Writes]
  l1c_tlb2_misses
       [L1C TLB2 Misses]
  .....

cfvn 3:
  cpu_cycles
       [CPU Cycles]
  instructions
       [Instructions]
  l1d_dir_writes
       [L1D Directory Writes]
  l1d_penalty_cycles
       [L1D Penalty Cycles]
  l1i_dir_writes
       [L1I Directory Writes]
  l1i_penalty_cycles
       [L1I Penalty Cycles]
  problem_state_cpu_cycles
       [Problem State CPU Cycles]
  problem_state_instructions
       [Problem State Instructions]
  ....

csvn generic:
  aes_blocked_cycles
       [AES Blocked Cycles]
  aes_blocked_functions
       [AES Blocked Functions]
  aes_cycles
       [AES Cycles]
  aes_functions
       [AES Functions]
  dea_blocked_cycles
       [DEA Blocked Cycles]
  dea_blocked_functions
       [DEA Blocked Functions]
  ....

Signed-off-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Link: http://lkml.kernel.org/r/20180416132314.33249-1-tmricht@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/pmu.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 064bdcb7bd78..61a5e5027338 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -562,6 +562,12 @@ static int is_pmu_core(const char *name)
 	if (stat(path, &st) == 0)
 		return 1;
 
+	/* Look for cpu sysfs (specific to s390) */
+	scnprintf(path, PATH_MAX, "%s/bus/event_source/devices/%s",
+		  sysfs, name);
+	if (stat(path, &st) == 0 && !strncmp(name, "cpum_", 5))
+		return 1;
+
 	return 0;
 }
 

From 78b562fbfa2cf0a9fcb23c3154756b690f4905c1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:50 +0200
Subject: [PATCH 180/288] perf: Return proper values for user stack errors

Return immediately when we find issue in the user stack checks. The
error value could get overwritten by following check for
PERF_SAMPLE_REGS_INTR.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Fixes: 60e2364e60e8 ("perf: Add ability to sample machine state on interrupt")
Link: http://lkml.kernel.org/r/20180415092352.12403-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1bae80aaabfb..67612ce359ad 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10209,9 +10209,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 		 * __u16 sample size limit.
 		 */
 		if (attr->sample_stack_user >= USHRT_MAX)
-			ret = -EINVAL;
+			return -EINVAL;
 		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
-			ret = -EINVAL;
+			return -EINVAL;
 	}
 
 	if (!attr->sample_max_stack)

From 5af44ca53d019de47efe6dbc4003dd518e5197ed Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:51 +0200
Subject: [PATCH 181/288] perf: Fix sample_max_stack maximum check

The syzbot hit KASAN bug in perf_callchain_store having the entry stored
behind the allocated bounds [1].

We miss the sample_max_stack check for the initial event that allocates
callchain buffers. This missing check allows to create an event with
sample_max_stack value bigger than the global sysctl maximum:

  # sysctl -a | grep perf_event_max_stack
  kernel.perf_event_max_stack = 127

  # perf record -vv -C 1 -e cycles/max-stack=256/ kill
  ...
  perf_event_attr:
    size                             112
    ...
    sample_max_stack                 256
  ------------------------------------------------------------
  sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 4

Note the '-C 1', which forces perf record to create just single event.
Otherwise it opens event for every cpu, then the sample_max_stack check
fails on the second event and all's fine.

The fix is to run the sample_max_stack check also for the first event
with callchains.

[1] https://marc.info/?l=linux-kernel&m=152352732920874&w=2

Reported-by: syzbot+7c449856228b63ac951e@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Fixes: 97c79a38cd45 ("perf core: Per event callchain limit")
Link: http://lkml.kernel.org/r/20180415092352.12403-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 772a43fea825..73cc26e321de 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -119,19 +119,22 @@ int get_callchain_buffers(int event_max_stack)
 		goto exit;
 	}
 
+	/*
+	 * If requesting per event more than the global cap,
+	 * return a different error to help userspace figure
+	 * this out.
+	 *
+	 * And also do it here so that we have &callchain_mutex held.
+	 */
+	if (event_max_stack > sysctl_perf_event_max_stack) {
+		err = -EOVERFLOW;
+		goto exit;
+	}
+
 	if (count > 1) {
 		/* If the allocation failed, give up */
 		if (!callchain_cpus_entries)
 			err = -ENOMEM;
-		/*
-		 * If requesting per event more than the global cap,
-		 * return a different error to help userspace figure
-		 * this out.
-		 *
-		 * And also do it here so that we have &callchain_mutex held.
-		 */
-		if (event_max_stack > sysctl_perf_event_max_stack)
-			err = -EOVERFLOW;
 		goto exit;
 	}
 

From bfb3d7b8b906b66551424d7636182126e1d134c8 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:52 +0200
Subject: [PATCH 182/288] perf: Remove superfluous allocation error check

If the get_callchain_buffers fails to allocate the buffer it will
decrease the nr_callchain_events right away.

There's no point of checking the allocation error for
nr_callchain_events > 1. Removing that check.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20180415092352.12403-3-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 73cc26e321de..c187aa3df3c8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -131,14 +131,8 @@ int get_callchain_buffers(int event_max_stack)
 		goto exit;
 	}
 
-	if (count > 1) {
-		/* If the allocation failed, give up */
-		if (!callchain_cpus_entries)
-			err = -ENOMEM;
-		goto exit;
-	}
-
-	err = alloc_callchain_buffers();
+	if (count == 1)
+		err = alloc_callchain_buffers();
 exit:
 	if (err)
 		atomic_dec(&nr_callchain_events);

From 1340ccfa9a9afefdbab90d7935d4ed19817e37c2 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Fri, 6 Apr 2018 17:21:30 -0700
Subject: [PATCH 183/288] x86,sched: Allow topologies where NUMA nodes share an
 LLC

Intel's Skylake Server CPUs have a different LLC topology than previous
generations. When in Sub-NUMA-Clustering (SNC) mode, the package is divided
into two "slices", each containing half the cores, half the LLC, and one
memory controller and each slice is enumerated to Linux as a NUMA
node. This is similar to how the cores and LLC were arranged for the
Cluster-On-Die (CoD) feature.

CoD allowed the same cache line to be present in each half of the LLC.
But, with SNC, each line is only ever present in *one* slice. This means
that the portion of the LLC *available* to a CPU depends on the data being
accessed:

    Remote socket: entire package LLC is shared
    Local socket->local slice: data goes into local slice LLC
    Local socket->remote slice: data goes into remote-slice LLC. Slightly
                    		higher latency than local slice LLC.

The biggest implication from this is that a process accessing all
NUMA-local memory only sees half the LLC capacity.

The CPU describes its cache hierarchy with the CPUID instruction. One of
the CPUID leaves enumerates the "logical processors sharing this
cache". This information is used for scheduling decisions so that tasks
move more freely between CPUs sharing the cache.

But, the CPUID for the SNC configuration discussed above enumerates the LLC
as being shared by the entire package. This is not 100% precise because the
entire cache is not usable by all accesses. But, it *is* the way the
hardware enumerates itself, and this is not likely to change.

The userspace visible impact of all the above is that the sysfs info
reports the entire LLC as being available to the entire package. As noted
above, this is not true for local socket accesses. This patch does not
correct the sysfs info. It is the same, pre and post patch.

The current code emits the following warning:

 sched: CPU #3's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.

The warning is coming from the topology_sane() check in smpboot.c because
the topology is not matching the expectations of the model for obvious
reasons.

To fix this, add a vendor and model specific check to never call
topology_sane() for these systems. Also, just like "Cluster-on-Die" disable
the "coregroup" sched_domain_topology_level and use NUMA information from
the SRAT alone.

This is OK at least on the hardware we are immediately concerned about
because the LLC sharing happens at both the slice and at the package level,
which are also NUMA boundaries.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: brice.goglin@gmail.com
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Link: https://lkml.kernel.org/r/20180407002130.GA18984@alison-desk.jf.intel.com
---
 arch/x86/kernel/smpboot.c | 45 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ff99e2b6fc54..45175b81dd5b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,6 +77,8 @@
 #include <asm/i8259.h>
 #include <asm/misc.h>
 #include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -390,15 +392,47 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+/*
+ * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ *
+ * These are Intel CPUs that enumerate an LLC that is shared by
+ * multiple NUMA nodes. The LLC on these systems is shared for
+ * off-package data access but private to the NUMA node (half
+ * of the package) for on-package access.
+ *
+ * CPUID (the source of the information about the LLC) can only
+ * enumerate the cache as being shared *or* unshared, but not
+ * this particular configuration. The CPU in this case enumerates
+ * the cache to be shared across the entire package (spanning both
+ * NUMA nodes).
+ */
+
+static const struct x86_cpu_id snc_cpu[] = {
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+	{}
+};
+
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
-	if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
-	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
-		return topology_sane(c, o, "llc");
+	/* Do not match if we do not have a valid APICID for cpu: */
+	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+		return false;
 
-	return false;
+	/* Do not match if LLC id does not match: */
+	if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+		return false;
+
+	/*
+	 * Allow the SNC topology without warning. Return of false
+	 * means 'c' does not share the LLC of 'o'. This will be
+	 * reflected to userspace.
+	 */
+	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+		return false;
+
+	return topology_sane(c, o, "llc");
 }
 
 /*
@@ -456,7 +490,8 @@ static struct sched_domain_topology_level x86_topology[] = {
 
 /*
  * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours and Intel Cluster-on-Die have this.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
  */
 static bool x86_has_numa_in_package;
 

From d6ef1f194b7569af8b8397876dc9ab07649d63cb Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 17 Apr 2018 15:27:16 +0200
Subject: [PATCH 184/288] x86/mm: Prevent kernel Oops in PTDUMP code with
 HIGHPTE=y

The walk_pte_level() function just uses __va to get the virtual address of
the PTE page, but that breaks when the PTE page is not in the direct
mapping with HIGHPTE=y.

The result is an unhandled kernel paging request at some random address
when accessing the current_kernel or current_user file.

Use the correct API to access PTE pages.

Fixes: fe770bf0310d ('x86: clean up the page table dumper and add 32-bit support')
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: jgross@suse.com
Cc: JBeulich@suse.com
Cc: hpa@zytor.com
Cc: aryabinin@virtuozzo.com
Cc: kirill.shutemov@linux.intel.com
Link: https://lkml.kernel.org/r/1523971636-4137-1-git-send-email-joro@8bytes.org
---
 arch/x86/mm/dump_pagetables.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 62a7e9f65dec..cc7ff5957194 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
+#include <linux/highmem.h>
 
 #include <asm/pgtable.h>
 
@@ -334,16 +335,16 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 			   pgprotval_t eff_in, unsigned long P)
 {
 	int i;
-	pte_t *start;
+	pte_t *pte;
 	pgprotval_t prot, eff;
 
-	start = (pte_t *)pmd_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PTE; i++) {
-		prot = pte_flags(*start);
-		eff = effective_prot(eff_in, prot);
 		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+		pte = pte_offset_map(&addr, st->current_address);
+		prot = pte_flags(*pte);
+		eff = effective_prot(eff_in, prot);
 		note_page(m, st, __pgprot(prot), eff, 5);
-		start++;
+		pte_unmap(pte);
 	}
 }
 #ifdef CONFIG_KASAN

From 05e489b1596f0aa1025a1fa572676631cd9665da Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 17 Apr 2018 14:25:58 +0800
Subject: [PATCH 185/288] VSOCK: make af_vsock.ko removable again

Commit c1eef220c1760762753b602c382127bfccee226d ("vsock: always call
vsock_init_tables()") introduced a module_init() function without a
corresponding module_exit() function.

Modules with an init function can only be removed if they also have an
exit function.  Therefore the vsock module was considered "permanent"
and could not be removed.

This patch adds an empty module_exit() function so that "rmmod vsock"
works.  No explicit cleanup is required because:

1. Transports call vsock_core_exit() upon exit and cannot be removed
   while sockets are still alive.
2. vsock_diag.ko does not perform any action that requires cleanup by
   vsock.ko.

Fixes: c1eef220c176 ("vsock: always call vsock_init_tables()")
Reported-by: Xiumei Mu <xmu@redhat.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/af_vsock.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index aac9b8f6552e..c1076c19b858 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2018,7 +2018,13 @@ const struct vsock_transport *vsock_core_get_transport(void)
 }
 EXPORT_SYMBOL_GPL(vsock_core_get_transport);
 
+static void __exit vsock_exit(void)
+{
+	/* Do nothing.  This function makes this module removable. */
+}
+
 module_init(vsock_init_tables);
+module_exit(vsock_exit);
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMware Virtual Socket Family");

From 77ac725e0c5b27c925e514b999cd46d01eedafd1 Mon Sep 17 00:00:00 2001
From: Nicolas Dechesne <nicolas.dechesne@linaro.org>
Date: Tue, 17 Apr 2018 14:03:26 +0200
Subject: [PATCH 186/288] net: qrtr: add MODULE_ALIAS_NETPROTO macro

To ensure that qrtr can be loaded automatically, when needed, if it is compiled
as module.

Signed-off-by: Nicolas Dechesne <nicolas.dechesne@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/qrtr/qrtr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index b33e5aeb4c06..2aa07b547b16 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1135,3 +1135,4 @@ module_exit(qrtr_proto_fini);
 
 MODULE_DESCRIPTION("Qualcomm IPC-router driver");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_NETPROTO(PF_QIPCRTR);

From 800cb2e553d44541b83aa3ec45d9839385fe8ab6 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Apr 2018 14:44:41 +0100
Subject: [PATCH 187/288] arm64: kasan: avoid pfn_to_nid() before page array is
 initialized

In arm64's kasan_init(), we use pfn_to_nid() to find the NUMA node a
span of memory is in, hoping to allocate shadow from the same NUMA node.
However, at this point, the page array has not been initialized, and
thus this is bogus.

Since commit:

  f165b378bbdf6c8a ("mm: uninitialized struct page poisoning sanity")

... accessing fields of the page array results in a boot time Oops(),
highlighting this problem:

[    0.000000] Unable to handle kernel paging request at virtual address dfff200000000000
[    0.000000] Mem abort info:
[    0.000000]   ESR = 0x96000004
[    0.000000]   Exception class = DABT (current EL), IL = 32 bits
[    0.000000]   SET = 0, FnV = 0
[    0.000000]   EA = 0, S1PTW = 0
[    0.000000] Data abort info:
[    0.000000]   ISV = 0, ISS = 0x00000004
[    0.000000]   CM = 0, WnR = 0
[    0.000000] [dfff200000000000] address between user and kernel address ranges
[    0.000000] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[    0.000000] Modules linked in:
[    0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.16.0-07317-gf165b378bbdf #42
[    0.000000] Hardware name: ARM Juno development board (r1) (DT)
[    0.000000] pstate: 80000085 (Nzcv daIf -PAN -UAO)
[    0.000000] pc : __asan_load8+0x8c/0xa8
[    0.000000] lr : __dump_page+0x3c/0x3b8
[    0.000000] sp : ffff2000099b7ca0
[    0.000000] x29: ffff2000099b7ca0 x28: ffff20000a1762c0
[    0.000000] x27: ffff7e0000000000 x26: ffff2000099dd000
[    0.000000] x25: ffff200009a3f960 x24: ffff200008f9c38c
[    0.000000] x23: ffff20000a9d3000 x22: ffff200009735430
[    0.000000] x21: fffffffffffffffe x20: ffff7e0001e50420
[    0.000000] x19: ffff7e0001e50400 x18: 0000000000001840
[    0.000000] x17: ffffffffffff8270 x16: 0000000000001840
[    0.000000] x15: 0000000000001920 x14: 0000000000000004
[    0.000000] x13: 0000000000000000 x12: 0000000000000800
[    0.000000] x11: 1ffff0012d0f89ff x10: ffff10012d0f89ff
[    0.000000] x9 : 0000000000000000 x8 : ffff8009687c5000
[    0.000000] x7 : 0000000000000000 x6 : ffff10000f282000
[    0.000000] x5 : 0000000000000040 x4 : fffffffffffffffe
[    0.000000] x3 : 0000000000000000 x2 : dfff200000000000
[    0.000000] x1 : 0000000000000005 x0 : 0000000000000000
[    0.000000] Process swapper (pid: 0, stack limit = 0x        (ptrval))
[    0.000000] Call trace:
[    0.000000]  __asan_load8+0x8c/0xa8
[    0.000000]  __dump_page+0x3c/0x3b8
[    0.000000]  dump_page+0xc/0x18
[    0.000000]  kasan_init+0x2e8/0x5a8
[    0.000000]  setup_arch+0x294/0x71c
[    0.000000]  start_kernel+0xdc/0x500
[    0.000000] Code: aa0403e0 9400063c 17ffffee d343fc00 (38e26800)
[    0.000000] ---[ end trace 67064f0e9c0cc338 ]---
[    0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[    0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

Let's fix this by using early_pfn_to_nid(), as other architectures do in
their kasan init code. Note that early_pfn_to_nid acquires the nid from
the memblock array, which we iterate over in kasan_init(), so this
should be fine.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 39d114ddc6822302 ("arm64: add KASAN support")
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/kasan_init.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index dabfc1ecda3d..12145874c02b 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -204,7 +204,7 @@ void __init kasan_init(void)
 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
 	kasan_map_populate(kimg_shadow_start, kimg_shadow_end,
-			   pfn_to_nid(virt_to_pfn(lm_alias(_text))));
+			   early_pfn_to_nid(virt_to_pfn(lm_alias(_text))));
 
 	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
 				   (void *)mod_shadow_start);
@@ -224,7 +224,7 @@ void __init kasan_init(void)
 
 		kasan_map_populate((unsigned long)kasan_mem_to_shadow(start),
 				   (unsigned long)kasan_mem_to_shadow(end),
-				   pfn_to_nid(virt_to_pfn(start)));
+				   early_pfn_to_nid(virt_to_pfn(start)));
 	}
 
 	/*

From daf70d89f80c6e1772233da9e020114b1254e7e0 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Tue, 17 Apr 2018 15:52:21 +0100
Subject: [PATCH 188/288] MIPS: memset.S: Fix return of __clear_user from
 Lpartial_fixup

The __clear_user function is defined to return the number of bytes that
could not be cleared. From the underlying memset / bzero implementation
this means setting register a2 to that number on return. Currently if a
page fault is triggered within the memset_partial block, the value
loaded into a2 on return is meaningless.

The label .Lpartial_fixup\@ is jumped to on page fault. In order to work
out how many bytes failed to copy, the exception handler should find how
many bytes left in the partial block (andi a2, STORMASK), add that to
the partial block end address (a2), and subtract the faulting address to
get the remainder. Currently it incorrectly subtracts the partial block
start address (t1), which has additionally been clobbered to generate a
jump target in memset_partial. Fix this by adding the block end address
instead.

This issue was found with the following test code:
      int j, k;
      for (j = 0; j < 512; j++) {
        if ((k = clear_user(NULL, j)) != j) {
           pr_err("clear_user (NULL %d) returned %d\n", j, k);
        }
      }
Which now passes on Creator Ci40 (MIPS32) and Cavium Octeon II (MIPS64).

Suggested-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: stable@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/19108/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/lib/memset.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
index 90bcdf1224ee..184819c1d5c8 100644
--- a/arch/mips/lib/memset.S
+++ b/arch/mips/lib/memset.S
@@ -252,7 +252,7 @@
 	PTR_L		t0, TI_TASK($28)
 	andi		a2, STORMASK
 	LONG_L		t0, THREAD_BUADDR(t0)
-	LONG_ADDU	a2, t1
+	LONG_ADDU	a2, a0
 	jr		ra
 	LONG_SUBU	a2, t0
 

From 4450dc0ae2c18a0ac6dce560215c7a1fa12122b5 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 5 Apr 2018 17:26:58 +0200
Subject: [PATCH 189/288] clockevents: Fix kernel messages split across
 multiple lines

Convert the clockevents driver from old-style printk() to pr_info() and
pr_cont(), to fix split kernel messages like below:

    Clockevents: could not switch to one-shot mode:
     dummy_timer is not functional.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: https://lkml.kernel.org/r/1522942018-14471-1-git-send-email-geert%2Brenesas@glider.be
---
 kernel/time/tick-oneshot.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index c1f518e7aa80..6fe615d57ebb 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
 		    !tick_device_is_functional(dev)) {
 
-		printk(KERN_INFO "Clockevents: "
-		       "could not switch to one-shot mode:");
+		pr_info("Clockevents: could not switch to one-shot mode:");
 		if (!dev) {
-			printk(" no tick device\n");
+			pr_cont(" no tick device\n");
 		} else {
 			if (!tick_device_is_functional(dev))
-				printk(" %s is not functional.\n", dev->name);
+				pr_cont(" %s is not functional.\n", dev->name);
 			else
-				printk(" %s does not support one-shot mode.\n",
-				       dev->name);
+				pr_cont(" %s does not support one-shot mode.\n",
+					dev->name);
 		}
 		return -EINVAL;
 	}

From f0ae6a0321222864ed8675a924cc8ee2cb042c31 Mon Sep 17 00:00:00 2001
From: "Liu, Changcheng" <changcheng.liu@intel.com>
Date: Thu, 12 Apr 2018 15:57:01 +0800
Subject: [PATCH 190/288] timers: Remove stale struct tvec_base forward
 declaration

struct tvec_base is a leftover of the original timer wheel implementation
and not longer used. Remove the forward declaration.

Signed-off-by: Liu Changcheng <changcheng.liu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Link: https://lkml.kernel.org/r/20180412075701.GA38952@sofia
---
 include/linux/timer.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 2448f9cc48a3..7b066fd38248 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -8,8 +8,6 @@
 #include <linux/debugobjects.h>
 #include <linux/stringify.h>
 
-struct tvec_base;
-
 struct timer_list {
 	/*
 	 * All fields that change during normal runtime grouped to the

From e142aa09ed88be98395dde7acb96fb2263566b68 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Fri, 13 Apr 2018 13:27:58 +0800
Subject: [PATCH 191/288] timekeeping: Remove __current_kernel_time()

The __current_kernel_time() function based on 'struct timespec' is no
longer recommended for new code, and the only user of this function has
been replaced by commit 6909e29fdefb ("kdb: use __ktime_get_real_seconds
instead of __current_kernel_time").

Remove the obsolete interface.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: sboyd@kernel.org
Cc: broonie@kernel.org
Cc: john.stultz@linaro.org
Link: https://lkml.kernel.org/r/1a9dbea7ee2cda7efe9ed330874075cf17fdbff6.1523596316.git.baolin.wang@linaro.org
---
 include/linux/timekeeping32.h | 3 ---
 kernel/time/timekeeping.c     | 7 -------
 2 files changed, 10 deletions(-)

diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h
index af4114d5dc17..3616b4becb59 100644
--- a/include/linux/timekeeping32.h
+++ b/include/linux/timekeeping32.h
@@ -9,9 +9,6 @@
 extern void do_gettimeofday(struct timeval *tv);
 unsigned long get_seconds(void);
 
-/* does not take xtime_lock */
-struct timespec __current_kernel_time(void);
-
 static inline struct timespec current_kernel_time(void)
 {
 	struct timespec64 now = current_kernel_time64();
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ca90219a1e73..dcf7f20fcd12 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2139,13 +2139,6 @@ unsigned long get_seconds(void)
 }
 EXPORT_SYMBOL(get_seconds);
 
-struct timespec __current_kernel_time(void)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-
-	return timespec64_to_timespec(tk_xtime(tk));
-}
-
 struct timespec64 current_kernel_time64(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;

From 7ce2367254e84753bceb07327aaf5c953cfce117 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 17 Apr 2018 18:46:14 +0900
Subject: [PATCH 192/288] vlan: Fix reading memory beyond skb->tail in
 skb_vlan_tagged_multi

Syzkaller spotted an old bug which leads to reading skb beyond tail by 4
bytes on vlan tagged packets.
This is caused because skb_vlan_tagged_multi() did not check
skb_headlen.

BUG: KMSAN: uninit-value in eth_type_vlan include/linux/if_vlan.h:283 [inline]
BUG: KMSAN: uninit-value in skb_vlan_tagged_multi include/linux/if_vlan.h:656 [inline]
BUG: KMSAN: uninit-value in vlan_features_check include/linux/if_vlan.h:672 [inline]
BUG: KMSAN: uninit-value in dflt_features_check net/core/dev.c:2949 [inline]
BUG: KMSAN: uninit-value in netif_skb_features+0xd1b/0xdc0 net/core/dev.c:3009
CPU: 1 PID: 3582 Comm: syzkaller435149 Not tainted 4.16.0+ #82
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676
  eth_type_vlan include/linux/if_vlan.h:283 [inline]
  skb_vlan_tagged_multi include/linux/if_vlan.h:656 [inline]
  vlan_features_check include/linux/if_vlan.h:672 [inline]
  dflt_features_check net/core/dev.c:2949 [inline]
  netif_skb_features+0xd1b/0xdc0 net/core/dev.c:3009
  validate_xmit_skb+0x89/0x1320 net/core/dev.c:3084
  __dev_queue_xmit+0x1cb2/0x2b60 net/core/dev.c:3549
  dev_queue_xmit+0x4b/0x60 net/core/dev.c:3590
  packet_snd net/packet/af_packet.c:2944 [inline]
  packet_sendmsg+0x7c57/0x8a10 net/packet/af_packet.c:2969
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg net/socket.c:640 [inline]
  sock_write_iter+0x3b9/0x470 net/socket.c:909
  do_iter_readv_writev+0x7bb/0x970 include/linux/fs.h:1776
  do_iter_write+0x30d/0xd40 fs/read_write.c:932
  vfs_writev fs/read_write.c:977 [inline]
  do_writev+0x3c9/0x830 fs/read_write.c:1012
  SYSC_writev+0x9b/0xb0 fs/read_write.c:1085
  SyS_writev+0x56/0x80 fs/read_write.c:1082
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x43ffa9
RSP: 002b:00007fff2cff3948 EFLAGS: 00000217 ORIG_RAX: 0000000000000014
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043ffa9
RDX: 0000000000000001 RSI: 0000000020000080 RDI: 0000000000000003
RBP: 00000000006cb018 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 00000000004018d0
R13: 0000000000401960 R14: 0000000000000000 R15: 0000000000000000

Uninit was created at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
  kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
  kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
  kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
  slab_post_alloc_hook mm/slab.h:445 [inline]
  slab_alloc_node mm/slub.c:2737 [inline]
  __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
  __kmalloc_reserve net/core/skbuff.c:138 [inline]
  __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
  alloc_skb include/linux/skbuff.h:984 [inline]
  alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
  sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
  packet_alloc_skb net/packet/af_packet.c:2803 [inline]
  packet_snd net/packet/af_packet.c:2894 [inline]
  packet_sendmsg+0x6444/0x8a10 net/packet/af_packet.c:2969
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg net/socket.c:640 [inline]
  sock_write_iter+0x3b9/0x470 net/socket.c:909
  do_iter_readv_writev+0x7bb/0x970 include/linux/fs.h:1776
  do_iter_write+0x30d/0xd40 fs/read_write.c:932
  vfs_writev fs/read_write.c:977 [inline]
  do_writev+0x3c9/0x830 fs/read_write.c:1012
  SYSC_writev+0x9b/0xb0 fs/read_write.c:1085
  SyS_writev+0x56/0x80 fs/read_write.c:1082
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Fixes: 58e998c6d239 ("offloading: Force software GSO for multiple vlan tags.")
Reported-and-tested-by: syzbot+0bbe42c764feafa82c5a@syzkaller.appspotmail.com
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 7 +++++--
 net/core/dev.c          | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index d11f41d5269f..78a5a90b4267 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -663,7 +663,7 @@ static inline bool skb_vlan_tagged(const struct sk_buff *skb)
  * Returns true if the skb is tagged with multiple vlan headers, regardless
  * of whether it is hardware accelerated or not.
  */
-static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb)
+static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
 {
 	__be16 protocol = skb->protocol;
 
@@ -673,6 +673,9 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb)
 		if (likely(!eth_type_vlan(protocol)))
 			return false;
 
+		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+			return false;
+
 		veh = (struct vlan_ethhdr *)skb->data;
 		protocol = veh->h_vlan_encapsulated_proto;
 	}
@@ -690,7 +693,7 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb)
  *
  * Returns features without unsafe ones if the skb has multiple tags.
  */
-static inline netdev_features_t vlan_features_check(const struct sk_buff *skb,
+static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
 						    netdev_features_t features)
 {
 	if (skb_vlan_tagged_multi(skb)) {
diff --git a/net/core/dev.c b/net/core/dev.c
index 969462ebb296..af0558b00c6c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2969,7 +2969,7 @@ netdev_features_t passthru_features_check(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(passthru_features_check);
 
-static netdev_features_t dflt_features_check(const struct sk_buff *skb,
+static netdev_features_t dflt_features_check(struct sk_buff *skb,
 					     struct net_device *dev,
 					     netdev_features_t features)
 {

From 89bda97b445bacab68e71507cc08ccacd6694474 Mon Sep 17 00:00:00 2001
From: Bert Kenward <bkenward@solarflare.com>
Date: Tue, 17 Apr 2018 13:32:39 +0100
Subject: [PATCH 193/288] sfc: check RSS is active for filter insert

For some firmware variants - specifically 'capture packed stream' - RSS
filters are not valid. We must check if RSS is actually active rather
than merely enabled.

Fixes: 42356d9a137b ("sfc: support RSS spreading of ethtool ntuple filters")
Signed-off-by: Bert Kenward <bkenward@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ef10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 36f24c7e553a..83ce229f4eb7 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -5264,7 +5264,7 @@ static int efx_ef10_filter_insert_addr_list(struct efx_nic *efx,
 		ids = vlan->uc;
 	}
 
-	filter_flags = efx_rss_enabled(efx) ? EFX_FILTER_FLAG_RX_RSS : 0;
+	filter_flags = efx_rss_active(&efx->rss_context) ? EFX_FILTER_FLAG_RX_RSS : 0;
 
 	/* Insert/renew filters */
 	for (i = 0; i < addr_count; i++) {
@@ -5333,7 +5333,7 @@ static int efx_ef10_filter_insert_def(struct efx_nic *efx,
 	int rc;
 	u16 *id;
 
-	filter_flags = efx_rss_enabled(efx) ? EFX_FILTER_FLAG_RX_RSS : 0;
+	filter_flags = efx_rss_active(&efx->rss_context) ? EFX_FILTER_FLAG_RX_RSS : 0;
 
 	efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0);
 

From 9c438d7a3a52dcc2b9ed095cb87d3a5e83cf7e60 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 17 Apr 2018 12:07:06 -0700
Subject: [PATCH 194/288] KEYS: DNS: limit the length of option strings

Adding a dns_resolver key whose payload contains a very long option name
resulted in that string being printed in full.  This hit the WARN_ONCE()
in set_precision() during the printk(), because printk() only supports a
precision of up to 32767 bytes:

    precision 1000000 too large
    WARNING: CPU: 0 PID: 752 at lib/vsprintf.c:2189 vsnprintf+0x4bc/0x5b0

Fix it by limiting option strings (combined name + value) to a much more
reasonable 128 bytes.  The exact limit is arbitrary, but currently the
only recognized option is formatted as "dnserror=%lu" which fits well
within this limit.

Also ratelimit the printks.

Reproducer:

    perl -e 'print "#", "A" x 1000000, "\x00"' | keyctl padd dns_resolver desc @s

This bug was found using syzkaller.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dns_resolver/dns_key.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8396705deffc..40c851693f77 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,9 +91,9 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 
 			next_opt = memchr(opt, '#', end - opt) ?: end;
 			opt_len = next_opt - opt;
-			if (!opt_len) {
-				printk(KERN_WARNING
-				       "Empty option to dns_resolver key\n");
+			if (opt_len <= 0 || opt_len > 128) {
+				pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
+						    opt_len);
 				return -EINVAL;
 			}
 
@@ -127,10 +127,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 			}
 
 		bad_option_value:
-			printk(KERN_WARNING
-			       "Option '%*.*s' to dns_resolver key:"
-			       " bad/missing value\n",
-			       opt_nlen, opt_nlen, opt);
+			pr_warn_ratelimited("Option '%*.*s' to dns_resolver key: bad/missing value\n",
+					    opt_nlen, opt_nlen, opt);
 			return -EINVAL;
 		} while (opt = next_opt + 1, opt < end);
 	}

From 0d568cd34eb04acf05c26f360d1a0f071f0bb636 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 13 Apr 2018 10:13:29 -0500
Subject: [PATCH 195/288] cifs: smb2ops: Fix NULL check in smb2_query_symlink

The current code null checks variable err_buf, which is always null
when it is checked, hence utf16_path is free'd and the function
returns -ENOENT everytime it is called, making it impossible for the
execution path to reach the following code:

err_buf = err_iov.iov_base;

Fix this by null checking err_iov.iov_base instead of err_buf. Also,
notice that err_buf no longer needs to be initialized to NULL.

Addresses-Coverity-ID: 1467876 ("Logically dead code")
Fixes: 2d636199e400 ("cifs: Change SMB2_open to return an iov for the error parameter")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
---
 fs/cifs/smb2ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b4ae932ea134..38ebf3f357d2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1452,7 +1452,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 	struct cifs_open_parms oparms;
 	struct cifs_fid fid;
 	struct kvec err_iov = {NULL, 0};
-	struct smb2_err_rsp *err_buf = NULL;
+	struct smb2_err_rsp *err_buf;
 	struct smb2_symlink_err_rsp *symlink;
 	unsigned int sub_len;
 	unsigned int sub_offset;
@@ -1476,7 +1476,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 
 	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov);
 
-	if (!rc || !err_buf) {
+	if (!rc || !err_iov.iov_base) {
 		kfree(utf16_path);
 		return -ENOENT;
 	}

From a5240cbde22c86c606c6462d32aea0648c21fdc3 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Sun, 15 Apr 2018 00:58:25 +0530
Subject: [PATCH 196/288] fs: cifs: Adding new return type vm_fault_t

Use new return type vm_fault_t for page_mkwrite
handler.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4bcd4e838b47..23fd430fe74a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3462,7 +3462,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
  * If the page is mmap'ed into a process' page tables, then we need to make
  * sure that it doesn't change while being written back.
  */
-static int
+static vm_fault_t
 cifs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;

From 8bf24e8319613bbe950d4188682b3a0d9441b76b Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 10 Apr 2018 16:20:53 +1000
Subject: [PATCH 197/288] selftests/filesystems: Don't run dnotify_test by
 default

In commit ce290a19609d ("selftests: add devpts selftests"), the
filesystems directory was added to the top-level selftests Makefile.

That had the effect of causing the existing dnotify_test in the
filesystems directory to now be run as part of the default selftests
test-run. Unfortunately dnotify_test is actually an infinite loop.

Fix it by moving dnotify_test to TEST_GEN_PROGS_EXTENDED, which says
that it's a generated file (ie. built) but should not be run as part
of the default test suite run (it's an "extended" test).

While we're here cleanup a few other things, devpts_pts should be in
TEST_GEN_PROGS to indicate that it's built, and with the above two
changes we no longer need a custom all or clean rule.

Fixes: ce290a19609d ("selftests: add devpts selftests")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Christian brauner <christian.brauner@ubuntu.com>
Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
---
 tools/testing/selftests/filesystems/Makefile | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 4e6d09fb166f..5c7d7001ad37 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-TEST_PROGS := dnotify_test devpts_pts
-all: $(TEST_PROGS)
+
+TEST_GEN_PROGS := devpts_pts
+TEST_GEN_PROGS_EXTENDED := dnotify_test
 
 include ../lib.mk
-
-clean:
-	rm -fr $(TEST_PROGS)

From fcf1fadf4c65eea6c519c773d2d9901e8ad94f5f Mon Sep 17 00:00:00 2001
From: Xidong Wang <wangxidong_97@163.com>
Date: Wed, 4 Apr 2018 10:38:24 +0100
Subject: [PATCH 198/288] drm/i915: Do no use kfree() to free a
 kmem_cache_alloc() return value

Along the eb_lookup_vmas() error path, the return value from
kmem_cache_alloc() was freed using kfree(). Fix it to use the proper
kmem_cache_free() instead.

Fixes: d1b48c1e7184 ("drm/i915: Replace execbuf vma ht with an idr")
Signed-off-by: Xidong Wang <wangxidong_97@163.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: <stable@vger.kernel.org> # v4.14+
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180404093824.9313-1-chris@chris-wilson.co.uk
(cherry picked from commit 6be1187dbffa0027ea379c53f7ca0c782515c610)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8c170db8495d..0414228cd2b5 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -728,7 +728,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 
 		err = radix_tree_insert(handles_vma, handle, vma);
 		if (unlikely(err)) {
-			kfree(lut);
+			kmem_cache_free(eb->i915->luts, lut);
 			goto err_obj;
 		}
 

From e6be6bd85654dba55b97758f937c46835d961a44 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 10 Apr 2018 12:27:04 +0100
Subject: [PATCH 199/288] drm/i915/pmu: Inspect runtime PM state more carefully
 while estimating RC6

While thinking about sporadic failures of perf_pmu/rc6-runtime-pm* tests
on some CI machines I have concluded that: a) the PMU readout of RC6 can
race against runtime PM transitions, and b) there are other reasons than
being runtime suspended which can cause intel_runtime_pm_get_if_in_use to
fail.

Therefore when estimating RC6 the code needs to assert we are indeed in
suspended state, and if not, the best we can do is return the last known
RC6 value.

Without this check we can calculate the estimated value based on un-
initialized or inappropriate internal state, which can result in over-
estimation, or in any case incorrect value being returned.

v2:
 * Re-arrange the code a bit to avoid second unlock and return branch.
   (Chris Wilson)

v3:
 * Insert some strategic blank lines and improve commit msg.
   (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Fixes: 1fe699e30113 ("drm/i915/pmu: Fix sleep under atomic in RC6 readout")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105010
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Imre Deak <imre.deak@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180410112704.24462-1-tvrtko.ursulin@linux.intel.com
(cherry picked from commit 2924bdee21edd6785a4df1b4d17fd3cb265fddd9)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c | 39 +++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index d8feb9053e0c..f0519e31543a 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -473,20 +473,37 @@ static u64 get_rc6(struct drm_i915_private *i915)
 		spin_lock_irqsave(&i915->pmu.lock, flags);
 		spin_lock(&kdev->power.lock);
 
-		if (!i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur)
-			i915->pmu.suspended_jiffies_last =
-						kdev->power.suspended_jiffies;
+		/*
+		 * After the above branch intel_runtime_pm_get_if_in_use failed
+		 * to get the runtime PM reference we cannot assume we are in
+		 * runtime suspend since we can either: a) race with coming out
+		 * of it before we took the power.lock, or b) there are other
+		 * states than suspended which can bring us here.
+		 *
+		 * We need to double-check that we are indeed currently runtime
+		 * suspended and if not we cannot do better than report the last
+		 * known RC6 value.
+		 */
+		if (kdev->power.runtime_status == RPM_SUSPENDED) {
+			if (!i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur)
+				i915->pmu.suspended_jiffies_last =
+						  kdev->power.suspended_jiffies;
 
-		val = kdev->power.suspended_jiffies -
-		      i915->pmu.suspended_jiffies_last;
-		val += jiffies - kdev->power.accounting_timestamp;
+			val = kdev->power.suspended_jiffies -
+			      i915->pmu.suspended_jiffies_last;
+			val += jiffies - kdev->power.accounting_timestamp;
+
+			val = jiffies_to_nsecs(val);
+			val += i915->pmu.sample[__I915_SAMPLE_RC6].cur;
+
+			i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val;
+		} else if (i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur) {
+			val = i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur;
+		} else {
+			val = i915->pmu.sample[__I915_SAMPLE_RC6].cur;
+		}
 
 		spin_unlock(&kdev->power.lock);
-
-		val = jiffies_to_nsecs(val);
-		val += i915->pmu.sample[__I915_SAMPLE_RC6].cur;
-		i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val;
-
 		spin_unlock_irqrestore(&i915->pmu.lock, flags);
 	}
 

From a3520b8992e57bc94ab6ec9f95f09c6c932555fd Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 11 Apr 2018 16:15:18 +0300
Subject: [PATCH 200/288] drm/i915/bios: filter out invalid DDC pins from VBT
 child devices

The VBT contains the DDC pin to use for specific ports. Alas, sometimes
the field appears to contain bogus data, and while we check for it later
on in intel_gmbus_get_adapter() we fail to check the returned NULL on
errors. Oops results.

The simplest approach seems to be to catch and ignore the bogus DDC pins
already at the VBT parsing phase, reverting to fixed per port default
pins. This doesn't guarantee display working, but at least it prevents
the oops. And we continue to be fuzzed by VBT.

One affected machine is Dell Latitude 5590 where a BIOS upgrade added
invalid DDC pins.

Typical backtrace:

[   35.461411] WARN_ON(!intel_gmbus_is_valid_pin(dev_priv, pin))
[   35.461432] WARNING: CPU: 6 PID: 411 at drivers/gpu/drm/i915/intel_i2c.c:844 intel_gmbus_get_adapter+0x32/0x37 [i915]
[   35.461437] Modules linked in: i915 ahci libahci dm_snapshot dm_bufio dm_raid raid456 async_raid6_recov async_pq raid6_pq async_xor xor async_memcpy async_tx
[   35.461445] CPU: 6 PID: 411 Comm: kworker/u16:2 Not tainted 4.16.0-rc7.x64-g1cda370ffded #1
[   35.461447] Hardware name: Dell Inc. Latitude 5590/0MM81M, BIOS 1.1.9 03/13/2018
[   35.461450] Workqueue: events_unbound async_run_entry_fn
[   35.461465] RIP: 0010:intel_gmbus_get_adapter+0x32/0x37 [i915]
[   35.461467] RSP: 0018:ffff9b4e43d47c40 EFLAGS: 00010286
[   35.461469] RAX: 0000000000000000 RBX: ffff98f90639f800 RCX: ffffffffae051960
[   35.461471] RDX: 0000000000000001 RSI: 0000000000000092 RDI: 0000000000000246
[   35.461472] RBP: ffff98f905410000 R08: 0000004d062a83f6 R09: 00000000000003bd
[   35.461474] R10: 0000000000000031 R11: ffffffffad4eda58 R12: ffff98f905410000
[   35.461475] R13: ffff98f9064c1000 R14: ffff9b4e43d47cf0 R15: ffff98f905410000
[   35.461477] FS:  0000000000000000(0000) GS:ffff98f92e580000(0000) knlGS:0000000000000000
[   35.461479] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   35.461481] CR2: 00007f5682359008 CR3: 00000001b700c005 CR4: 00000000003606e0
[   35.461483] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   35.461484] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   35.461486] Call Trace:
[   35.461501]  intel_hdmi_set_edid+0x37/0x27f [i915]
[   35.461515]  intel_hdmi_detect+0x7c/0x97 [i915]
[   35.461518]  drm_helper_probe_single_connector_modes+0xe1/0x6c0
[   35.461521]  drm_setup_crtcs+0x129/0xa6a
[   35.461523]  ? __switch_to_asm+0x34/0x70
[   35.461525]  ? __switch_to_asm+0x34/0x70
[   35.461527]  ? __switch_to_asm+0x40/0x70
[   35.461528]  ? __switch_to_asm+0x34/0x70
[   35.461529]  ? __switch_to_asm+0x40/0x70
[   35.461531]  ? __switch_to_asm+0x34/0x70
[   35.461532]  ? __switch_to_asm+0x40/0x70
[   35.461534]  ? __switch_to_asm+0x34/0x70
[   35.461536]  __drm_fb_helper_initial_config_and_unlock+0x34/0x46f
[   35.461538]  ? __switch_to_asm+0x40/0x70
[   35.461541]  ? _cond_resched+0x10/0x33
[   35.461557]  intel_fbdev_initial_config+0xf/0x1c [i915]
[   35.461560]  async_run_entry_fn+0x2e/0xf5
[   35.461563]  process_one_work+0x15b/0x364
[   35.461565]  worker_thread+0x2c/0x3a0
[   35.461567]  ? process_one_work+0x364/0x364
[   35.461568]  kthread+0x10c/0x122
[   35.461570]  ? _kthread_create_on_node+0x5d/0x5d
[   35.461572]  ret_from_fork+0x35/0x40
[   35.461574] Code: 74 16 89 f6 48 8d 04 b6 48 c1 e0 05 48 29 f0 48 8d 84 c7 e8 11 00 00 c3 48 c7 c6 b0 19 1e c0 48 c7 c7 64 8a 1c c0 e8 47 88 ed ec <0f> 0b 31 c0 c3 8b 87 a4 04 00 00 80 e4 fc 09 c6 89 b7 a4 04 00
[   35.461604] WARNING: CPU: 6 PID: 411 at drivers/gpu/drm/i915/intel_i2c.c:844 intel_gmbus_get_adapter+0x32/0x37 [i915]
[   35.461606] ---[ end trace 4fe1e63e2dd93373 ]---
[   35.461609] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[   35.461613] IP: i2c_transfer+0x4/0x86
[   35.461614] PGD 0 P4D 0
[   35.461616] Oops: 0000 [#1] SMP PTI
[   35.461618] Modules linked in: i915 ahci libahci dm_snapshot dm_bufio dm_raid raid456 async_raid6_recov async_pq raid6_pq async_xor xor async_memcpy async_tx
[   35.461624] CPU: 6 PID: 411 Comm: kworker/u16:2 Tainted: G        W        4.16.0-rc7.x64-g1cda370ffded #1
[   35.461625] Hardware name: Dell Inc. Latitude 5590/0MM81M, BIOS 1.1.9 03/13/2018
[   35.461628] Workqueue: events_unbound async_run_entry_fn
[   35.461630] RIP: 0010:i2c_transfer+0x4/0x86
[   35.461631] RSP: 0018:ffff9b4e43d47b30 EFLAGS: 00010246
[   35.461633] RAX: ffff9b4e43d47b6e RBX: 0000000000000005 RCX: 0000000000000001
[   35.461635] RDX: 0000000000000002 RSI: ffff9b4e43d47b80 RDI: 0000000000000000
[   35.461636] RBP: ffff9b4e43d47bd8 R08: 0000004d062a83f6 R09: 00000000000003bd
[   35.461638] R10: 0000000000000031 R11: ffffffffad4eda58 R12: 0000000000000002
[   35.461639] R13: 0000000000000001 R14: ffff9b4e43d47b6f R15: ffff9b4e43d47c07
[   35.461641] FS:  0000000000000000(0000) GS:ffff98f92e580000(0000) knlGS:0000000000000000
[   35.461643] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   35.461645] CR2: 0000000000000010 CR3: 00000001b700c005 CR4: 00000000003606e0
[   35.461646] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   35.461647] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   35.461649] Call Trace:
[   35.461652]  drm_do_probe_ddc_edid+0xb3/0x128
[   35.461654]  drm_get_edid+0xe5/0x38d
[   35.461669]  intel_hdmi_set_edid+0x45/0x27f [i915]
[   35.461684]  intel_hdmi_detect+0x7c/0x97 [i915]
[   35.461687]  drm_helper_probe_single_connector_modes+0xe1/0x6c0
[   35.461689]  drm_setup_crtcs+0x129/0xa6a
[   35.461691]  ? __switch_to_asm+0x34/0x70
[   35.461693]  ? __switch_to_asm+0x34/0x70
[   35.461694]  ? __switch_to_asm+0x40/0x70
[   35.461696]  ? __switch_to_asm+0x34/0x70
[   35.461697]  ? __switch_to_asm+0x40/0x70
[   35.461698]  ? __switch_to_asm+0x34/0x70
[   35.461700]  ? __switch_to_asm+0x40/0x70
[   35.461701]  ? __switch_to_asm+0x34/0x70
[   35.461703]  __drm_fb_helper_initial_config_and_unlock+0x34/0x46f
[   35.461705]  ? __switch_to_asm+0x40/0x70
[   35.461707]  ? _cond_resched+0x10/0x33
[   35.461724]  intel_fbdev_initial_config+0xf/0x1c [i915]
[   35.461727]  async_run_entry_fn+0x2e/0xf5
[   35.461729]  process_one_work+0x15b/0x364
[   35.461731]  worker_thread+0x2c/0x3a0
[   35.461733]  ? process_one_work+0x364/0x364
[   35.461734]  kthread+0x10c/0x122
[   35.461736]  ? _kthread_create_on_node+0x5d/0x5d
[   35.461738]  ret_from_fork+0x35/0x40
[   35.461739] Code: 5c fa e1 ad 48 89 df e8 ea fb ff ff e9 2a ff ff ff 0f 1f 44 00 00 31 c0 e9 43 fd ff ff 31 c0 45 31 e4 e9 c5 fd ff ff 41 54 55 53 <48> 8b 47 10 48 83 78 10 00 74 70 41 89 d4 48 89 f5 48 89 fb 65
[   35.461756] RIP: i2c_transfer+0x4/0x86 RSP: ffff9b4e43d47b30
[   35.461757] CR2: 0000000000000010
[   35.461759] ---[ end trace 4fe1e63e2dd93374 ]---

Based on a patch by Fei Li.

v2: s/reverting/sticking/ (Chris)

Cc: stable@vger.kernel.org
Cc: Fei Li <fei.li@intel.com>
Co-developed-by: Fei Li <fei.li@intel.com>
Reported-by: Pavel Nakonechnyi <zorg1331@gmail.com>
Reported-and-tested-by: Seweryn Kokot <sewkokot@gmail.com>
Reported-and-tested-by: Laszlo Valko <valko@linux.karinthy.hu>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105549
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105961
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180411131519.9091-1-jani.nikula@intel.com
(cherry picked from commit f212bf9abe5de9f938fecea7df07046e74052dde)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_bios.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index c5c7530ba157..447b721c3be9 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1256,7 +1256,6 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
 		return;
 
 	aux_channel = child->aux_channel;
-	ddc_pin = child->ddc_pin;
 
 	is_dvi = child->device_type & DEVICE_TYPE_TMDS_DVI_SIGNALING;
 	is_dp = child->device_type & DEVICE_TYPE_DISPLAYPORT_OUTPUT;
@@ -1303,9 +1302,15 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
 		DRM_DEBUG_KMS("Port %c is internal DP\n", port_name(port));
 
 	if (is_dvi) {
-		info->alternate_ddc_pin = map_ddc_pin(dev_priv, ddc_pin);
-
-		sanitize_ddc_pin(dev_priv, port);
+		ddc_pin = map_ddc_pin(dev_priv, child->ddc_pin);
+		if (intel_gmbus_is_valid_pin(dev_priv, ddc_pin)) {
+			info->alternate_ddc_pin = ddc_pin;
+			sanitize_ddc_pin(dev_priv, port);
+		} else {
+			DRM_DEBUG_KMS("Port %c has invalid DDC pin %d, "
+				      "sticking to defaults\n",
+				      port_name(port), ddc_pin);
+		}
 	}
 
 	if (is_dp) {

From 4a0559ed99189f259e92ed34f60dd51688f1bc40 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 14 Apr 2018 10:12:33 +0100
Subject: [PATCH 201/288] drm/i915: Call i915_perf_fini() on init_hw error
 unwind

We have to cleanup after i915_perf_init(), even on the error path, as it
passes a pointer into the module to the sysfs core. If we fail to
unregister the sysctl table, we leave a dangling pointer which then may
explode anytime later.

Fixes: 9f9b2792b6d3 ("drm/i915/perf: reuse timestamp frequency from device info")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180414091233.32224-1-chris@chris-wilson.co.uk
(cherry picked from commit 9f172f6fbd243759c808d97bd83c95e49325b2c9)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 07c07d55398b..be8555049c93 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1102,30 +1102,32 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 
 	ret = i915_ggtt_probe_hw(dev_priv);
 	if (ret)
-		return ret;
+		goto err_perf;
 
-	/* WARNING: Apparently we must kick fbdev drivers before vgacon,
-	 * otherwise the vga fbdev driver falls over. */
+	/*
+	 * WARNING: Apparently we must kick fbdev drivers before vgacon,
+	 * otherwise the vga fbdev driver falls over.
+	 */
 	ret = i915_kick_out_firmware_fb(dev_priv);
 	if (ret) {
 		DRM_ERROR("failed to remove conflicting framebuffer drivers\n");
-		goto out_ggtt;
+		goto err_ggtt;
 	}
 
 	ret = i915_kick_out_vgacon(dev_priv);
 	if (ret) {
 		DRM_ERROR("failed to remove conflicting VGA console\n");
-		goto out_ggtt;
+		goto err_ggtt;
 	}
 
 	ret = i915_ggtt_init_hw(dev_priv);
 	if (ret)
-		return ret;
+		goto err_ggtt;
 
 	ret = i915_ggtt_enable_hw(dev_priv);
 	if (ret) {
 		DRM_ERROR("failed to enable GGTT\n");
-		goto out_ggtt;
+		goto err_ggtt;
 	}
 
 	pci_set_master(pdev);
@@ -1136,7 +1138,7 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 		if (ret) {
 			DRM_ERROR("failed to set DMA mask\n");
 
-			goto out_ggtt;
+			goto err_ggtt;
 		}
 	}
 
@@ -1154,7 +1156,7 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 		if (ret) {
 			DRM_ERROR("failed to set DMA mask\n");
 
-			goto out_ggtt;
+			goto err_ggtt;
 		}
 	}
 
@@ -1187,13 +1189,14 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 
 	ret = intel_gvt_init(dev_priv);
 	if (ret)
-		goto out_ggtt;
+		goto err_ggtt;
 
 	return 0;
 
-out_ggtt:
+err_ggtt:
 	i915_ggtt_cleanup_hw(dev_priv);
-
+err_perf:
+	i915_perf_fini(dev_priv);
 	return ret;
 }
 

From b4615730530be85fc45ab4631c2ad6d8e2d0b97d Mon Sep 17 00:00:00 2001
From: Gaurav K Singh <gaurav.k.singh@intel.com>
Date: Tue, 17 Apr 2018 23:52:18 +0530
Subject: [PATCH 202/288] drm/i915/audio: Fix audio detection issue on GLK

On Geminilake, sometimes audio card is not getting
detected after reboot. This is a spurious issue happening on
Geminilake. HW codec and HD audio controller link was going
out of sync for which there was a fix in i915 driver but
was not getting invoked for GLK. Extending this fix to GLK as well.

Tested by Du,Wenkai on GLK board.

Bspec: 21829

v2: Instead of checking GEN9_BC, BXT and GLK macros, use IS_GEN9 macro (Jani N)

Cc: <stable@vger.kernel.org> # b651bd2a3ae3 ("drm/i915/audio: Fix audio enumeration issue on BXT")
Cc: <stable@vger.kernel.org>
Signed-off-by: Gaurav K Singh <gaurav.k.singh@intel.com>
Reviewed-by: Abhay Kumar <abhay.Kumar@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1523989338-29677-1-git-send-email-gaurav.k.singh@intel.com
(cherry picked from commit 8221229046e862977ae93ec9d34aa583fbd10397)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_audio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c
index 709d6ca68074..3ea566f99450 100644
--- a/drivers/gpu/drm/i915/intel_audio.c
+++ b/drivers/gpu/drm/i915/intel_audio.c
@@ -729,7 +729,7 @@ static void i915_audio_component_codec_wake_override(struct device *kdev,
 	struct drm_i915_private *dev_priv = kdev_to_i915(kdev);
 	u32 tmp;
 
-	if (!IS_GEN9_BC(dev_priv))
+	if (!IS_GEN9(dev_priv))
 		return;
 
 	i915_audio_component_get_power(kdev);

From 7eb2c4dd54ff841f2fe509a84973eb25fa20bda2 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Mon, 16 Apr 2018 18:53:09 +0300
Subject: [PATCH 203/288] drm/i915: Fix LSPCON TMDS output buffer enabling from
 low-power state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LSPCON adapters in low-power state may ignore the first I2C write during
TMDS output buffer enabling, resulting in a blank screen even with an
otherwise enabled pipe. Fix this by reading back and validating the
written value a few times.

The problem was noticed on GLK machines with an onboard LSPCON adapter
after entering/exiting DC5 power state. Doing an I2C read of the adapter
ID as the first transaction - instead of the I2C write to enable the
TMDS buffers - returns the correct value. Based on this we assume that
the transaction itself is sent properly, it's only the adapter that is
not ready for some reason to accept this first write after waking from
low-power state. In my case the second I2C write attempt always
succeeded.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105854
Cc: Clinton Taylor <clinton.a.taylor@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Imre Deak <imre.deak@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180416155309.11100-1-imre.deak@intel.com
---
 drivers/gpu/drm/drm_dp_dual_mode_helper.c | 39 +++++++++++++++++++----
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/drm_dp_dual_mode_helper.c
index 02a50929af67..e7f4fe2848a5 100644
--- a/drivers/gpu/drm/drm_dp_dual_mode_helper.c
+++ b/drivers/gpu/drm/drm_dp_dual_mode_helper.c
@@ -350,19 +350,44 @@ int drm_dp_dual_mode_set_tmds_output(enum drm_dp_dual_mode_type type,
 {
 	uint8_t tmds_oen = enable ? 0 : DP_DUAL_MODE_TMDS_DISABLE;
 	ssize_t ret;
+	int retry;
 
 	if (type < DRM_DP_DUAL_MODE_TYPE2_DVI)
 		return 0;
 
-	ret = drm_dp_dual_mode_write(adapter, DP_DUAL_MODE_TMDS_OEN,
-				     &tmds_oen, sizeof(tmds_oen));
-	if (ret) {
-		DRM_DEBUG_KMS("Failed to %s TMDS output buffers\n",
-			      enable ? "enable" : "disable");
-		return ret;
+	/*
+	 * LSPCON adapters in low-power state may ignore the first write, so
+	 * read back and verify the written value a few times.
+	 */
+	for (retry = 0; retry < 3; retry++) {
+		uint8_t tmp;
+
+		ret = drm_dp_dual_mode_write(adapter, DP_DUAL_MODE_TMDS_OEN,
+					     &tmds_oen, sizeof(tmds_oen));
+		if (ret) {
+			DRM_DEBUG_KMS("Failed to %s TMDS output buffers (%d attempts)\n",
+				      enable ? "enable" : "disable",
+				      retry + 1);
+			return ret;
+		}
+
+		ret = drm_dp_dual_mode_read(adapter, DP_DUAL_MODE_TMDS_OEN,
+					    &tmp, sizeof(tmp));
+		if (ret) {
+			DRM_DEBUG_KMS("I2C read failed during TMDS output buffer %s (%d attempts)\n",
+				      enable ? "enabling" : "disabling",
+				      retry + 1);
+			return ret;
+		}
+
+		if (tmp == tmds_oen)
+			return 0;
 	}
 
-	return 0;
+	DRM_DEBUG_KMS("I2C write value mismatch during TMDS output buffer %s\n",
+		      enable ? "enabling" : "disabling");
+
+	return -EIO;
 }
 EXPORT_SYMBOL(drm_dp_dual_mode_set_tmds_output);
 

From 8b8b5903d50d6e4c5fd06272022b25a11c3214cf Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 6 Mar 2018 15:43:54 +0100
Subject: [PATCH 204/288] dt-bindings: thermal: remove no longer needed samsung
 thermal properties

Remove documentation for longer needed samsung thermal properties.

Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 .../bindings/thermal/exynos-thermal.txt       | 23 ++++---------------
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
index 1b596fd38dc4..b957acff57aa 100644
--- a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
@@ -49,19 +49,6 @@ on the SoC (only first trip points defined in DT will be configured):
  - samsung,exynos5433-tmu: 8
  - samsung,exynos7-tmu: 8
 
-Following properties are mandatory (depending on SoC):
-- samsung,tmu_gain: Gain value for internal TMU operation.
-- samsung,tmu_reference_voltage: Value of TMU IP block's reference voltage
-- samsung,tmu_noise_cancel_mode: Mode for noise cancellation
-- samsung,tmu_efuse_value: Default level of temperature - it is needed when
-			   in factory fusing produced wrong value
-- samsung,tmu_min_efuse_value: Minimum temperature fused value
-- samsung,tmu_max_efuse_value: Maximum temperature fused value
-- samsung,tmu_first_point_trim: First point trimming value
-- samsung,tmu_second_point_trim: Second point trimming value
-- samsung,tmu_default_temp_offset: Default temperature offset
-- samsung,tmu_cal_type: Callibration type
-
 ** Optional properties:
 
 - vtmu-supply: This entry is optional and provides the regulator node supplying
@@ -78,7 +65,7 @@ Example 1):
 		clocks = <&clock 383>;
 		clock-names = "tmu_apbif";
 		vtmu-supply = <&tmu_regulator_node>;
-		#include "exynos4412-tmu-sensor-conf.dtsi"
+		#thermal-sensor-cells = <0>;
 	};
 
 Example 2):
@@ -89,7 +76,7 @@ Example 2):
 		interrupts = <0 58 0>;
 		clocks = <&clock 21>;
 		clock-names = "tmu_apbif";
-		#include "exynos5440-tmu-sensor-conf.dtsi"
+		#thermal-sensor-cells = <0>;
 	};
 
 Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
@@ -99,7 +86,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 184 0>;
 		clocks = <&clock 318>, <&clock 318>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
-		#include "exynos4412-tmu-sensor-conf.dtsi"
+		#thermal-sensor-cells = <0>;
 	};
 
 	tmu_cpu3: tmu@1006c000 {
@@ -108,7 +95,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 185 0>;
 		clocks = <&clock 318>, <&clock 319>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
-		#include "exynos4412-tmu-sensor-conf.dtsi"
+		#thermal-sensor-cells = <0>;
 	};
 
 	tmu_gpu: tmu@100a0000 {
@@ -117,7 +104,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 215 0>;
 		clocks = <&clock 319>, <&clock 318>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
-		#include "exynos4412-tmu-sensor-conf.dtsi"
+		#thermal-sensor-cells = <0>;
 	};
 
 Note: For multi-instance tmu each instance should have an alias correctly

From e04907dbc25930b88ee2328fe692c776f63ddf2c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 9 Feb 2018 14:28:10 +0530
Subject: [PATCH 205/288] dt-bindings: thermal: Remove
 "cooling-{min|max}-level" properties

The "cooling-min-level" and "cooling-max-level" properties are not
parsed by any part of kernel currently and the max cooling state of a
CPU cooling device is found by referring to the cpufreq table instead.

Remove the unused bindings.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 .../devicetree/bindings/thermal/thermal.txt      | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt
index 1719d47a5e2f..cc553f0952c5 100644
--- a/Documentation/devicetree/bindings/thermal/thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/thermal.txt
@@ -55,8 +55,7 @@ of heat dissipation). For example a fan's cooling states correspond to
 the different fan speeds possible. Cooling states are referred to by
 single unsigned integers, where larger numbers mean greater heat
 dissipation. The precise set of cooling states associated with a device
-(as referred to by the cooling-min-level and cooling-max-level
-properties) should be defined in a particular device's binding.
+should be defined in a particular device's binding.
 For more examples of cooling devices, refer to the example sections below.
 
 Required properties:
@@ -69,15 +68,6 @@ Required properties:
 			See Cooling device maps section below for more details
 			on how consumers refer to cooling devices.
 
-Optional properties:
-- cooling-min-level:	An integer indicating the smallest
-  Type: unsigned	cooling state accepted. Typically 0.
-  Size: one cell
-
-- cooling-max-level:	An integer indicating the largest
-  Type: unsigned	cooling state accepted.
-  Size: one cell
-
 * Trip points
 
 The trip node is a node to describe a point in the temperature domain
@@ -226,8 +216,6 @@ cpus {
 			396000  950000
 			198000  850000
 		>;
-		cooling-min-level = <0>;
-		cooling-max-level = <3>;
 		#cooling-cells = <2>; /* min followed by max */
 	};
 	...
@@ -241,8 +229,6 @@ cpus {
 	 */
 	fan0: fan@48 {
 		...
-		cooling-min-level = <0>;
-		cooling-max-level = <9>;
 		#cooling-cells = <2>; /* min followed by max */
 	};
 };

From b2d71b3cda19831ec67f49d7c6ba0214d9367b29 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Apr 2018 16:45:01 +0100
Subject: [PATCH 206/288] arm64: signal: don't force known signals to SIGKILL

Since commit:

  a7e6f1ca90354a31 ("arm64: signal: Force SIGKILL for unknown signals in force_signal_inject")

... any signal which is not SIGKILL will be upgraded to a SIGKILL be
force_signal_inject(). This includes signals we do expect, such as
SIGILL triggered by do_undefinstr().

Fix the check to use a logical AND rather than a logical OR, permitting
signals whose layout is SIL_FAULT.

Fixes: a7e6f1ca90354a31 ("arm64: signal: Force SIGKILL for unknown signals in force_signal_inject")
Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index ba964da31a25..1cb2749a72bf 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -366,7 +366,7 @@ void force_signal_inject(int signal, int code, unsigned long address)
 	}
 
 	/* Force signals we don't understand to SIGKILL */
-	if (WARN_ON(signal != SIGKILL ||
+	if (WARN_ON(signal != SIGKILL &&
 		    siginfo_layout(signal, code) != SIL_FAULT)) {
 		signal = SIGKILL;
 	}

From 44f06ba8297c7e9dfd0e49b40cbe119113cca094 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 Apr 2018 17:22:23 +0200
Subject: [PATCH 207/288] udf: Fix leak of UTF-16 surrogates into encoded
 strings

OSTA UDF specification does not mention whether the CS0 charset in case
of two bytes per character encoding should be treated in UTF-16 or
UCS-2. The sample code in the standard does not treat UTF-16 surrogates
in any special way but on systems such as Windows which work in UTF-16
internally, filenames would be treated as being in UTF-16 effectively.
In Linux it is more difficult to handle characters outside of Base
Multilingual plane (beyond 0xffff) as NLS framework works with 2-byte
characters only. Just make sure we don't leak UTF-16 surrogates into the
resulting string when loading names from the filesystem for now.

CC: stable@vger.kernel.org # >= v4.6
Reported-by: Mingye Wang <arthur200126@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index f897e55f2cd0..16a8ad21b77e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,6 +28,9 @@
 
 #include "udf_sb.h"
 
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+
 static int udf_uni2char_utf8(wchar_t uni,
 			     unsigned char *out,
 			     int boundlen)
@@ -37,6 +40,9 @@ static int udf_uni2char_utf8(wchar_t uni,
 	if (boundlen <= 0)
 		return -ENAMETOOLONG;
 
+	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
+		return -EINVAL;
+
 	if (uni < 0x80) {
 		out[u_len++] = (unsigned char)uni;
 	} else if (uni < 0x800) {

From a514d63882c3d2063b21b865447266ebcb18b04c Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 22 Dec 2017 16:06:39 +0800
Subject: [PATCH 208/288] btrfs: qgroup: Commit transaction in advance to
 reduce early EDQUOT

Unlike previous method that tries to commit transaction inside
qgroup_reserve(), this time we will try to commit transaction using
fs_info->transaction_kthread to avoid nested transaction and no need to
worry about locking context.

Since it's an asynchronous function call and we won't wait for
transaction commit, unlike previous method, we must call it before we
hit the qgroup limit.

So this patch will use the ratio and size of qgroup meta_pertrans
reservation as indicator to check if we should trigger a transaction
commit.  (meta_prealloc won't be cleaned in transaction committ, it's
useless anyway)

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       |  6 ++++++
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/qgroup.c      | 43 ++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.c |  1 +
 fs/btrfs/transaction.h | 14 ++++++++++++++
 5 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5474ef14d6e6..ec84e2dabb04 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -714,6 +714,12 @@ struct btrfs_delayed_root;
  */
 #define BTRFS_FS_EXCL_OP			16
 
+/*
+ * To info transaction_kthread we need an immediate commit so it doesn't
+ * need to wait for commit_interval
+ */
+#define BTRFS_FS_NEED_ASYNC_COMMIT		17
+
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ac8b1d21baf..60caa68c3618 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1824,6 +1824,7 @@ static int transaction_kthread(void *arg)
 
 		now = get_seconds();
 		if (cur->state < TRANS_STATE_BLOCKED &&
+		    !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
 		    (now < cur->start_time ||
 		     now - cur->start_time < fs_info->commit_interval)) {
 			spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 09c7e4fd550f..9fb758d5077a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/btrfs.h>
+#include <linux/sizes.h>
 
 #include "ctree.h"
 #include "transaction.h"
@@ -2375,8 +2376,21 @@ out:
 	return ret;
 }
 
-static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
+/*
+ * Two limits to commit transaction in advance.
+ *
+ * For RATIO, it will be 1/RATIO of the remaining limit
+ * (excluding data and prealloc meta) as threshold.
+ * For SIZE, it will be in byte unit as threshold.
+ */
+#define QGROUP_PERTRANS_RATIO		32
+#define QGROUP_PERTRANS_SIZE		SZ_32M
+static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
+				const struct btrfs_qgroup *qg, u64 num_bytes)
 {
+	u64 limit;
+	u64 threshold;
+
 	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
 	    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
 		return false;
@@ -2385,6 +2399,31 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
 	    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
 		return false;
 
+	/*
+	 * Even if we passed the check, it's better to check if reservation
+	 * for meta_pertrans is pushing us near limit.
+	 * If there is too much pertrans reservation or it's near the limit,
+	 * let's try commit transaction to free some, using transaction_kthread
+	 */
+	if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
+			      BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
+		if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+			limit = qg->max_excl;
+		else
+			limit = qg->max_rfer;
+		threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
+			    qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
+			    QGROUP_PERTRANS_RATIO;
+		threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+
+		/*
+		 * Use transaction_kthread to commit transaction, so we no
+		 * longer need to bother nested transaction nor lock context.
+		 */
+		if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+			btrfs_commit_transaction_locksafe(fs_info);
+	}
+
 	return true;
 }
 
@@ -2434,7 +2473,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
 
 		qg = unode_aux_to_qgroup(unode);
 
-		if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+		if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
 			ret = -EDQUOT;
 			goto out;
 		}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 63fdcab64b01..c944b4769e3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2267,6 +2267,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 */
 	cur_trans->state = TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
+	clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
 
 	spin_lock(&fs_info->trans_lock);
 	list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c88fccd80bc5..d8c0826bc2c7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -199,6 +199,20 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
 int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 				   int wait_for_unblock);
+
+/*
+ * Try to commit transaction asynchronously, so this is safe to call
+ * even holding a spinlock.
+ *
+ * It's done by informing transaction_kthread to commit transaction without
+ * waiting for commit interval.
+ */
+static inline void btrfs_commit_transaction_locksafe(
+		struct btrfs_fs_info *fs_info)
+{
+	set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
+	wake_up_process(fs_info->transaction_kthread);
+}
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
 void btrfs_throttle(struct btrfs_fs_info *fs_info);

From ff6bc37eb7f6e7b052e50c13a480e1080b3ec07a Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 21 Dec 2017 13:42:04 +0800
Subject: [PATCH 209/288] btrfs: qgroup: Use independent and accurate per inode
 qgroup rsv

Unlike reservation calculation used in inode rsv for metadata, qgroup
doesn't really need to care about things like csum size or extent usage
for the whole tree COW.

Qgroups care more about net change of the extent usage.
That's to say, if we're going to insert one file extent, it will mostly
find its place in COWed tree block, leaving no change in extent usage.
Or causing a leaf split, resulting in one new net extent and increasing
qgroup number by nodesize.
Or in an even more rare case, increase the tree level, increasing qgroup
number by 2 * nodesize.

So here instead of using the complicated calculation for extent
allocator, which cares more about accuracy and no error, qgroup doesn't
need that over-estimated reservation.

This patch will maintain 2 new members in btrfs_block_rsv structure for
qgroup, using much smaller calculation for qgroup rsv, reducing false
EDQUOT.

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ctree.h       | 19 ++++++++++++++
 fs/btrfs/extent-tree.c | 57 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ec84e2dabb04..2771cc56a622 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -459,6 +459,25 @@ struct btrfs_block_rsv {
 	unsigned short full;
 	unsigned short type;
 	unsigned short failfast;
+
+	/*
+	 * Qgroup equivalent for @size @reserved
+	 *
+	 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+	 * about things like csum size nor how many tree blocks it will need to
+	 * reserve.
+	 *
+	 * Qgroup cares more about net change of the extent usage.
+	 *
+	 * So for one newly inserted file extent, in worst case it will cause
+	 * leaf split and level increase, nodesize for each file extent is
+	 * already too much.
+	 *
+	 * In short, qgroup_size/reserved is the upper limit of possible needed
+	 * qgroup metadata reservation.
+	 */
+	u64 qgroup_rsv_size;
+	u64 qgroup_rsv_reserved;
 };
 
 /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5bdb5636d552..055494ddcace 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5560,14 +5560,18 @@ again:
 
 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_rsv *block_rsv,
-				    struct btrfs_block_rsv *dest, u64 num_bytes)
+				    struct btrfs_block_rsv *dest, u64 num_bytes,
+				    u64 *qgroup_to_release_ret)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
+	u64 qgroup_to_release = 0;
 	u64 ret;
 
 	spin_lock(&block_rsv->lock);
-	if (num_bytes == (u64)-1)
+	if (num_bytes == (u64)-1) {
 		num_bytes = block_rsv->size;
+		qgroup_to_release = block_rsv->qgroup_rsv_size;
+	}
 	block_rsv->size -= num_bytes;
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
@@ -5576,6 +5580,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 	} else {
 		num_bytes = 0;
 	}
+	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+				    block_rsv->qgroup_rsv_size;
+		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+	} else {
+		qgroup_to_release = 0;
+	}
 	spin_unlock(&block_rsv->lock);
 
 	ret = num_bytes;
@@ -5598,6 +5609,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 			space_info_add_old_bytes(fs_info, space_info,
 						 num_bytes);
 	}
+	if (qgroup_to_release_ret)
+		*qgroup_to_release_ret = qgroup_to_release;
 	return ret;
 }
 
@@ -5739,17 +5752,21 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 	struct btrfs_root *root = inode->root;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 num_bytes = 0;
+	u64 qgroup_num_bytes = 0;
 	int ret = -ENOSPC;
 
 	spin_lock(&block_rsv->lock);
 	if (block_rsv->reserved < block_rsv->size)
 		num_bytes = block_rsv->size - block_rsv->reserved;
+	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
+				   block_rsv->qgroup_rsv_reserved;
 	spin_unlock(&block_rsv->lock);
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
 	if (ret)
 		return ret;
 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
@@ -5757,7 +5774,13 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
 					      btrfs_ino(inode), num_bytes, 1);
-	}
+
+		/* Don't forget to increase qgroup_rsv_reserved */
+		spin_lock(&block_rsv->lock);
+		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+		spin_unlock(&block_rsv->lock);
+	} else
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
 	return ret;
 }
 
@@ -5778,20 +5801,23 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 released = 0;
+	u64 qgroup_to_release = 0;
 
 	/*
 	 * Since we statically set the block_rsv->size we just want to say we
 	 * are releasing 0 bytes, and then we'll just get the reservation over
 	 * the size free'd.
 	 */
-	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
+					   &qgroup_to_release);
 	if (released > 0)
 		trace_btrfs_space_reservation(fs_info, "delalloc",
 					      btrfs_ino(inode), released, 0);
 	if (qgroup_free)
-		btrfs_qgroup_free_meta_prealloc(inode->root, released);
+		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
 	else
-		btrfs_qgroup_convert_reserved_meta(inode->root, released);
+		btrfs_qgroup_convert_reserved_meta(inode->root,
+						   qgroup_to_release);
 }
 
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5803,7 +5829,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 	if (global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
-	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
+	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5883,7 +5909,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
-				(u64)-1);
+				(u64)-1, NULL);
 	WARN_ON(fs_info->trans_block_rsv.size > 0);
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5907,7 +5933,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 
 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
-				trans->chunk_bytes_reserved);
+				trans->chunk_bytes_reserved, NULL);
 	trans->chunk_bytes_reserved = 0;
 }
 
@@ -6012,6 +6038,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 reserve_size = 0;
+	u64 qgroup_rsv_size = 0;
 	u64 csum_leaves;
 	unsigned outstanding_extents;
 
@@ -6024,9 +6051,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 						 inode->csum_bytes);
 	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
 						       csum_leaves);
+	/*
+	 * For qgroup rsv, the calculation is very simple:
+	 * account one nodesize for each outstanding extent
+	 *
+	 * This is overestimating in most cases.
+	 */
+	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
 
 	spin_lock(&block_rsv->lock);
 	block_rsv->size = reserve_size;
+	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
 	spin_unlock(&block_rsv->lock);
 }
 
@@ -8405,7 +8440,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 }
 
 /*

From f218ea6c4792e0fabba0195f2f866d0a3b58431e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 17 Apr 2018 16:52:45 +0800
Subject: [PATCH 210/288] btrfs: delayed-inode: Remove wrong qgroup meta
 reservation calls

Commit 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for
delayed inode and item") merged into mainline was not latest version
submitted to the mail list in Dec 2017.

Which lacks the following fixes:

1) Remove btrfs_qgroup_convert_reserved_meta() call in
   btrfs_delayed_item_release_metadata()
2) Remove btrfs_qgroup_reserve_meta_prealloc() call in
   btrfs_delayed_inode_reserve_metadata()

Those fixes will resolve unexpected EDQUOT problems.

Fixes: 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 06ec8ab6d9ba..a8d492dbd3e7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -556,6 +556,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 	dst_rsv = &fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+	/*
+	 * Here we migrate space rsv from transaction rsv, since have already
+	 * reserved space when starting a transaction.  So no need to reserve
+	 * qgroup space here.
+	 */
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 	if (!ret) {
 		trace_btrfs_space_reservation(fs_info, "delayed_item",
@@ -577,7 +583,10 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 		return;
 
 	rsv = &fs_info->delayed_block_rsv;
-	btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved);
+	/*
+	 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
+	 * to release/reserve qgroup space.
+	 */
 	trace_btrfs_space_reservation(fs_info, "delayed_item",
 				      item->key.objectid, item->bytes_reserved,
 				      0);
@@ -602,9 +611,6 @@ static int btrfs_delayed_inode_reserve_metadata(
 
 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
-	if (ret < 0)
-		return ret;
 	/*
 	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
 	 * which doesn't reserve space for speed.  This is a problem since we
@@ -616,6 +622,10 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+		ret = btrfs_qgroup_reserve_meta_prealloc(root,
+				fs_info->nodesize, true);
+		if (ret < 0)
+			return ret;
 		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
 					  BTRFS_RESERVE_NO_FLUSH);
 		/*
@@ -634,6 +644,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 						      "delayed_inode",
 						      btrfs_ino(inode),
 						      num_bytes, 1);
+		} else {
+			btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
 		}
 		return ret;
 	}

From 336a8bb8e36a273a802a54b2e673c777c9c62fb1 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 17 Apr 2018 18:43:58 +0800
Subject: [PATCH 211/288] btrfs: Fix wrong btrfs_delalloc_release_extents
 parameter

Commit 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type
for delalloc") merged into mainline is not the latest version submitted
to mail list in Dec 2017.

It has a fatal wrong @qgroup_free parameter, which results increasing
qgroup metadata pertrans reserved space, and causing a lot of early EDQUOT.

Fix it by applying the correct diff on top of current branch.

Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0167a9c97c9c..f660ba1e5e58 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ again:
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     lockstart, lockend, &cached_state);
 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
-					       (ret != 0));
+					       true);
 		if (ret) {
 			btrfs_drop_pages(pages, num_pages);
 			break;

From b32e56e5a87a1f9243db92bc7a5df0ffb4627cfb Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 11 Apr 2018 15:17:59 +1000
Subject: [PATCH 212/288] powerpc/xive: Fix trying to "push" an already active
 pool VP

When setting up a CPU, we "push" (activate) a pool VP for it.

However it's an error to do so if it already has an active
pool VP.

This happens when doing soft CPU hotplug on powernv since we
don't tear down the CPU on unplug. The HW flags the error which
gets captured by the diagnostics.

Fix this by making sure to "pull" out any already active pool
first.

Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/xive/native.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index d22aeb0b69e1..b48454be5b98 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -389,6 +389,10 @@ static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
 	if (xive_pool_vps == XIVE_INVALID_VP)
 		return;
 
+	/* Check if pool VP already active, if it is, pull it */
+	if (in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2) & TM_QW2W2_VP)
+		in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
+
 	/* Enable the pool VP */
 	vp = xive_pool_vps + cpu;
 	pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);

From be47e41d77fba5bc17e9fb5f1c99217bb6691989 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Tue, 17 Apr 2018 21:25:42 +0200
Subject: [PATCH 213/288] tipc: fix use-after-free in tipc_nametbl_stop

When we delete a service item in tipc_nametbl_stop() we loop over
all service ranges in the service's RB tree, and for each service
range we loop over its pertaining publications while calling
tipc_service_remove_publ() for each of them.

However, tipc_service_remove_publ() has the side effect that it also
removes the comprising service range item when there are no publications
left. This leads to a "use-after-free" access when the inner loop
continues to the next iteration, since the range item holding the list
we are looping no longer exists.

We fix this by moving the delete of the service range item outside
the said function. Instead, we now let the two functions calling it
test if the list is empty and perform the removal when that is the
case.

Reported-by: syzbot+d64b64afc55660106556@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 4068eaad61a6..dd1c4fa2eb78 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -241,7 +241,8 @@ err:
 static struct publication *tipc_service_remove_publ(struct net *net,
 						    struct tipc_service *sc,
 						    u32 lower, u32 upper,
-						    u32 node, u32 key)
+						    u32 node, u32 key,
+						    struct service_range **rng)
 {
 	struct tipc_subscription *sub, *tmp;
 	struct service_range *sr;
@@ -275,19 +276,15 @@ static struct publication *tipc_service_remove_publ(struct net *net,
 
 	list_del(&p->all_publ);
 	list_del(&p->local_publ);
-
-	/* Remove service range item if this was its last publication */
-	if (list_empty(&sr->all_publ)) {
+	if (list_empty(&sr->all_publ))
 		last = true;
-		rb_erase(&sr->tree_node, &sc->ranges);
-		kfree(sr);
-	}
 
 	/* Notify any waiting subscriptions */
 	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
 		tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
 					p->port, p->node, p->scope, last);
 	}
+	*rng = sr;
 	return p;
 }
 
@@ -379,13 +376,20 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 node, u32 key)
 {
 	struct tipc_service *sc = tipc_service_find(net, type);
+	struct service_range *sr = NULL;
 	struct publication *p = NULL;
 
 	if (!sc)
 		return NULL;
 
 	spin_lock_bh(&sc->lock);
-	p = tipc_service_remove_publ(net, sc, lower, upper, node, key);
+	p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+
+	/* Remove service range item if this was its last publication */
+	if (sr && list_empty(&sr->all_publ)) {
+		rb_erase(&sr->tree_node, &sc->ranges);
+		kfree(sr);
+	}
 
 	/* Delete service item if this no more publications and subscriptions */
 	if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
@@ -747,16 +751,17 @@ int tipc_nametbl_init(struct net *net)
 static void tipc_service_delete(struct net *net, struct tipc_service *sc)
 {
 	struct service_range *sr, *tmpr;
-	struct publication *p, *tmpb;
+	struct publication *p, *tmp;
 
 	spin_lock_bh(&sc->lock);
 	rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
-		list_for_each_entry_safe(p, tmpb,
-					 &sr->all_publ, all_publ) {
+		list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
 			tipc_service_remove_publ(net, sc, p->lower, p->upper,
-						 p->node, p->key);
+						 p->node, p->key, &sr);
 			kfree_rcu(p, rcu);
 		}
+		rb_erase(&sr->tree_node, &sc->ranges);
+		kfree(sr);
 	}
 	hlist_del_init_rcu(&sc->service_list);
 	spin_unlock_bh(&sc->lock);

From 36a50a989ee8267588de520b8704b85f045a3220 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Tue, 17 Apr 2018 21:58:27 +0200
Subject: [PATCH 214/288] tipc: fix infinite loop when dumping link monitor
 summary

When configuring the number of used bearers to MAX_BEARER and issuing
command "tipc link monitor summary", the command enters infinite loop
in user space.

This issue happens because function tipc_nl_node_dump_monitor() returns
the wrong 'prev_bearer' value when all potential monitors have been
scanned.

The correct behavior is to always try to scan all monitors until either
the netlink message is full, in which case we return the bearer identity
of the affected monitor, or we continue through the whole bearer array
until we can return MAX_BEARERS. This solution also caters for the case
where there may be gaps in the bearer array.

Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/monitor.c |  2 +-
 net/tipc/node.c    | 11 ++++-------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 32dc33a94bc7..5453e564da82 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -777,7 +777,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
 
 	ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
 	if (ret || !mon)
-		return -EINVAL;
+		return 0;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
 			  NLM_F_MULTI, TIPC_NL_MON_GET);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c77dd2f3c589..6f98b56dd48e 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -2232,8 +2232,8 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net *net = sock_net(skb->sk);
 	u32 prev_bearer = cb->args[0];
 	struct tipc_nl_msg msg;
+	int bearer_id;
 	int err;
-	int i;
 
 	if (prev_bearer == MAX_BEARERS)
 		return 0;
@@ -2243,16 +2243,13 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
 	msg.seq = cb->nlh->nlmsg_seq;
 
 	rtnl_lock();
-	for (i = prev_bearer; i < MAX_BEARERS; i++) {
-		prev_bearer = i;
+	for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) {
 		err = __tipc_nl_add_monitor(net, &msg, prev_bearer);
 		if (err)
-			goto out;
+			break;
 	}
-
-out:
 	rtnl_unlock();
-	cb->args[0] = prev_bearer;
+	cb->args[0] = bearer_id;
 
 	return skb->len;
 }

From 81c895072d29cd70eea5be1a8587cd6461c3715a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Tue, 17 Apr 2018 22:46:38 +0200
Subject: [PATCH 215/288] tun: fix vlan packet truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bogus trimming in tun_net_xmit() causes truncated vlan packets.

skb->len is correct whether or not skb_vlan_tag_present() is true. There
is no more reason to adjust the skb length on xmit in this driver than
any other driver. tun_put_user() adds 4 bytes to the total for tagged
packets because it transmits the tag inline to userspace.  This is
similar to a nic transmitting the tag inline on the wire.

Reproducing the bug by sending any tagged packet through back-to-back
connected tap interfaces:

 socat TUN,tun-type=tap,iff-up,tun-name=in TUN,tun-type=tap,iff-up,tun-name=out &
 ip link add link in name in.20 type vlan id 20
 ip addr add 10.9.9.9/24 dev in.20
 ip link set in.20 up
 tshark -nxxi in -f arp -c1 2>/dev/null &
 tshark -nxxi out -f arp -c1 2>/dev/null &
 ping -c 1 10.9.9.5 >/dev/null 2>&1

The output from the 'in' and 'out' interfaces are different when the
bug is present:

 Capturing on 'in'
 0000  ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14   ......v.v7......
 0010  08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a   ..........v.v7..
 0020  0a 09 09 09 00 00 00 00 00 00 0a 09 09 05         ..............

 Capturing on 'out'
 0000  ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14   ......v.v7......
 0010  08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a   ..........v.v7..
 0020  0a 09 09 09 00 00 00 00 00 00                     ..........

Fixes: aff3d70a07ff ("tun: allow to attach ebpf socket filter")
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 28583aa0c17d..ef33950a45d9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1102,12 +1102,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 
 	len = run_ebpf_filter(tun, skb, len);
-
-	/* Trim extra bytes since we may insert vlan proto & TCI
-	 * in tun_put_user().
-	 */
-	len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0;
-	if (len <= 0 || pskb_trim(skb, len))
+	if (len == 0 || pskb_trim(skb, len))
 		goto drop;
 
 	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))

From 4e5c01a7c746130033eb6e7153b346b8ca61e7a5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 16 Apr 2018 16:39:38 -0300
Subject: [PATCH 216/288] perf trace: Support MAP_FIXED_NOREPLACE

Introduced in a4ff8e8620d3 ("mm: introduce MAP_FIXED_NOREPLACE"), and
now that we have that define in the just syncronized
tools/arch/*/include/uapi/asm/mman.h files, add support for it.

This should really transition to autogeneration of string tables as
done for various other things:

  $ ls /tmp/build/perf/trace/beauty/generated/*.c
  arch_errno_name_array.c kcmp_type_array.c madvise_behavior_array.c
  pkey_alloc_access_rights_array.c prctl_option_array.c
  $ head /tmp/build/perf/trace/beauty/generated/madvise_behavior_array.c
  static const char *madvise_advices[] = {
	[0] = "NORMAL",
	[1] = "RANDOM",
	[2] = "SEQUENTIAL",
	[3] = "WILLNEED",
	[4] = "DONTNEED",
	[8] = "FREE",
	[9] = "REMOVE",
	[10] = "DONTFORK",
	[11] = "DOFORK",
  $

Till then, add support for this the old way.

Also it has to be ifdef'ed, because arches like mips still don't define
it. The proper solution will be to have per-arch tables for these
values to support cross-analysis.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-td9t5vhjltqnlzaurkkgq8cn@git.kernel.org
Signef-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/trace/beauty/mmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c
index 417e3ecfe9d7..9f68077b241b 100644
--- a/tools/perf/trace/beauty/mmap.c
+++ b/tools/perf/trace/beauty/mmap.c
@@ -54,6 +54,9 @@ static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
 	P_MMAP_FLAG(EXECUTABLE);
 	P_MMAP_FLAG(FILE);
 	P_MMAP_FLAG(FIXED);
+#ifdef MAP_FIXED_NOREPLACE
+	P_MMAP_FLAG(FIXED_NOREPLACE);
+#endif
 	P_MMAP_FLAG(GROWSDOWN);
 	P_MMAP_FLAG(HUGETLB);
 	P_MMAP_FLAG(LOCKED);

From a7e9eab3dbd35268c16244557a4155a2d9a641c3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 6 Apr 2018 13:38:09 -0700
Subject: [PATCH 217/288] perf mem: Allow all record/report options

For perf mem report / perf mem record, pass all unknown options
through to the underlying report/record commands. This makes things
like

perf mem record -a sleep 1

work. Matches how c2c and other tools work.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20180406203812.3087-2-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-mem.txt | 3 +++
 tools/perf/builtin-mem.c              | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index b0211410969b..8806ed5f3802 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -67,6 +67,9 @@ OPTIONS
 --phys-data::
 	Record/Report sample physical addresses
 
+In addition, for report all perf report options are valid, and for record
+all perf record options.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 506564651cda..57393e94d156 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -83,7 +83,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 	};
 
 	argc = parse_options(argc, argv, options, record_mem_usage,
-			     PARSE_OPT_STOP_AT_NON_OPTION);
+			     PARSE_OPT_KEEP_UNKNOWN);
 
 	rec_argc = argc + 9; /* max number of arguments */
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
@@ -436,7 +436,7 @@ int cmd_mem(int argc, const char **argv)
 	}
 
 	argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands,
-					mem_usage, PARSE_OPT_STOP_AT_NON_OPTION);
+					mem_usage, PARSE_OPT_KEEP_UNKNOWN);
 
 	if (!argc || !(strncmp(argv[0], "rec", 3) || mem.operation))
 		usage_with_options(mem_usage, mem_options);

From 6a02f06edea5a5910c787fd6c49b0552e8080e5d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 6 Apr 2018 13:38:10 -0700
Subject: [PATCH 218/288] perf hists browser: Clarify top/report browser help

Clarify in the browser help that ESC in tui mode may go back to the
previous screen instead of just exiting (was not clear to me)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20180406203812.3087-3-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/browsers/hists.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 0eec06c105c6..e5f247247daa 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -2714,7 +2714,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 	"h/?/F1        Show this window\n"				\
 	"UP/DOWN/PGUP\n"						\
 	"PGDN/SPACE    Navigate\n"					\
-	"q/ESC/CTRL+C  Exit browser\n\n"				\
+	"q/ESC/CTRL+C  Exit browser or go back to previous screen\n\n"	\
 	"For multiple event sessions:\n\n"				\
 	"TAB/UNTAB     Switch events\n\n"				\
 	"For symbolic views (--sort has sym):\n\n"			\

From ec3948451e0ba317e66873b48d6cc51d701d4eb0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 6 Apr 2018 13:38:11 -0700
Subject: [PATCH 219/288] perf record: Remove misleading error suggestion

When perf record encounters an error setting up an event it suggests
to enable CONFIG_PERF_EVENTS. This is misleading because:

- Usually it is enabled (it is really hard to disable on x86)

- The problem is usually somewhere else, e.g. the CPU is not supported
or an invalid configuration has been used.

Remove the misleading suggestion.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20180406203812.3087-4-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 1ac8d9236efd..66b62570c855 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2894,8 +2894,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 
 	return scnprintf(msg, size,
 	"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
-	"/bin/dmesg may provide additional information.\n"
-	"No CONFIG_PERF_EVENTS=y kernel support configured?",
+	"/bin/dmesg | grep -i perf may provide additional information.\n",
 			 err, str_error_r(err, sbuf, sizeof(sbuf)),
 			 perf_evsel__name(evsel));
 }

From ccbb6afe0890b09cc828373a9a5fffab40ec85df Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 6 Apr 2018 13:38:12 -0700
Subject: [PATCH 220/288] perf record: Remove suggestion to enable APIC

'perf record' suggests to enable the APIC on errors.

APIC is practically always used today and the problem is usually
somewhere else.

Just remove the outdated suggestion.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20180406203812.3087-5-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 66b62570c855..3e87486c28fe 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2870,8 +2870,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 #if defined(__i386__) || defined(__x86_64__)
 		if (evsel->attr.type == PERF_TYPE_HARDWARE)
 			return scnprintf(msg, size, "%s",
-	"No hardware sampling interrupt available.\n"
-	"No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.");
+	"No hardware sampling interrupt available.\n");
 #endif
 		break;
 	case EBUSY:

From 66f5a0779af2a4f28b1832f6c20bc9d3b1ab1886 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Date: Tue, 17 Apr 2018 09:43:44 +0530
Subject: [PATCH 221/288] perf tools: Add '\n' at the end of parse-options
 error messages

Few error messages does not have '\n' at the end and thus next prompt
gets printed in the same line. Ex,

  linux~$ perf buildid-cache -verbose --add ./a.out
   Error: did you mean `--verbose` (with two dashes ?)linux~$

Fix it.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Krister Johansen <kjlx@templeofstupid.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Sihyeon Jang <uneedsihyeon@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180417041346.5617-2-ravi.bangoria@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/subcmd/parse-options.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
index f6a1babcbac4..cb7154eccbdc 100644
--- a/tools/lib/subcmd/parse-options.c
+++ b/tools/lib/subcmd/parse-options.c
@@ -433,7 +433,7 @@ match:
 
 	if (ambiguous_option) {
 		 fprintf(stderr,
-			 " Error: Ambiguous option: %s (could be --%s%s or --%s%s)",
+			 " Error: Ambiguous option: %s (could be --%s%s or --%s%s)\n",
 			 arg,
 			 (ambiguous_flags & OPT_UNSET) ?  "no-" : "",
 			 ambiguous_option->long_name,
@@ -458,7 +458,7 @@ static void check_typos(const char *arg, const struct option *options)
 		return;
 
 	if (strstarts(arg, "no-")) {
-		fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
+		fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)\n", arg);
 		exit(129);
 	}
 
@@ -466,7 +466,7 @@ static void check_typos(const char *arg, const struct option *options)
 		if (!options->long_name)
 			continue;
 		if (strstarts(options->long_name, arg)) {
-			fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
+			fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)\n", arg);
 			exit(129);
 		}
 	}

From 518c6021e9b82696819b380cd173e76cac55a01e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 17 Apr 2018 14:47:36 -0300
Subject: [PATCH 222/288] perf tests mmap: Show which tracepoint is failing

In the 'perf test "mmap interface"' we try creating events for several
tracepoints, but when perf_evsel__new() fails we're not showing which
one is failing, fix that to help diagnosing problems, such as the
syscall tracepoints ones being found and fixes in this merge window.

Now the failing tests shows:

  # perf test -v "mmap interface"
 4: Read samples using the mmap interface                 :
  --- start ---
  test child forked, pid 14311
  <SNIP>
  perf_evsel__new(sys_enter_getppid)
  test child finished with -1
  ---- end ----
  Read samples using the mmap interface: FAILED!
  #

Now to check why the syscalls:sys_enter_getppid is failing...

  # ls -la /sys/kernel/debug/tracing/events/syscalls/sys_enter_getppid
  ls: cannot access '/sys/kernel/debug/tracing/events/syscalls/sys_enter_getppid': No such file or directory
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-44xk0ycdzrfzx1o9rklf5itl@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/mmap-basic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index bb8e6bcb0d96..0919b0793e5b 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -75,7 +75,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse
 		snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
 		evsels[i] = perf_evsel__newtp("syscalls", name);
 		if (IS_ERR(evsels[i])) {
-			pr_debug("perf_evsel__new\n");
+			pr_debug("perf_evsel__new(%s)\n", name);
 			goto out_delete_evlist;
 		}
 

From eccb1b936363c62544bccb5bbb75afec9536f7e3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 18 Apr 2018 12:59:03 -0300
Subject: [PATCH 223/288] perf test BPF: Fixup BPF test using epoll_pwait
 syscall function probe

Since e145242ea0df ("syscalls/core, syscalls/x86: Clean up syscall stub
naming convention") changed the main syscall function for 'epoll_pwait'
to something other than the expected 'SyS_epoll_pwait the' 'perf test
BPF' entries started failing, fix it by using something called from the
main syscall function instead, 'epoll_wait', which should keep this test
working in older kernels too.

Before:

  # perf test BPF
  40: BPF filter                           :
  40.1: Basic BPF filtering                : FAILED!
  40.2: BPF pinning                        : Skip
  40.3: BPF prologue generation            : Skip
  40.4: BPF relocation checker             : Skip

If we use -v for that test we see the problem:

    Probe point 'SyS_epoll_pwait' not found.

After:

  # perf test BPF
  40: BPF filter                           :
  40.1: Basic BPF filtering                : Ok
  40.2: BPF pinning                        : Ok
  40.3: BPF prologue generation            : Ok
  40.4: BPF relocation checker             : Ok
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/r/tip-y24nmn70cs2am8jh4i344dng@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/bpf-script-example.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/bpf-script-example.c b/tools/perf/tests/bpf-script-example.c
index e4123c1b0e88..1ca5106df5f1 100644
--- a/tools/perf/tests/bpf-script-example.c
+++ b/tools/perf/tests/bpf-script-example.c
@@ -31,7 +31,7 @@ struct bpf_map_def SEC("maps") flip_table = {
 	.max_entries = 1,
 };
 
-SEC("func=SyS_epoll_pwait")
+SEC("func=do_epoll_wait")
 int bpf_func__SyS_epoll_pwait(void *ctx)
 {
 	int ind =0;

From c96eebf07692e53bf4dd5987510d8b550e793598 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Tue, 17 Apr 2018 16:40:00 +0100
Subject: [PATCH 224/288] MIPS: memset.S: Fix clobber of v1 in last_fixup

The label .Llast_fixup\@ is jumped to on page fault within the final
byte set loop of memset (on < MIPSR6 architectures). For some reason, in
this fault handler, the v1 register is randomly set to a2 & STORMASK.
This clobbers v1 for the calling function. This can be observed with the
following test code:

static int __init __attribute__((optimize("O0"))) test_clear_user(void)
{
  register int t asm("v1");
  char *test;
  int j, k;

  pr_info("\n\n\nTesting clear_user\n");
  test = vmalloc(PAGE_SIZE);

  for (j = 256; j < 512; j++) {
    t = 0xa5a5a5a5;
    if ((k = clear_user(test + PAGE_SIZE - 256, j)) != j - 256) {
        pr_err("clear_user (%px %d) returned %d\n", test + PAGE_SIZE - 256, j, k);
    }
    if (t != 0xa5a5a5a5) {
       pr_err("v1 was clobbered to 0x%x!\n", t);
    }
  }

  return 0;
}
late_initcall(test_clear_user);

Which demonstrates that v1 is indeed clobbered (MIPS64):

Testing clear_user
v1 was clobbered to 0x1!
v1 was clobbered to 0x2!
v1 was clobbered to 0x3!
v1 was clobbered to 0x4!
v1 was clobbered to 0x5!
v1 was clobbered to 0x6!
v1 was clobbered to 0x7!

Since the number of bytes that could not be set is already contained in
a2, the andi placing a value in v1 is not necessary and actively
harmful in clobbering v1.

Reported-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: stable@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/19109/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/lib/memset.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
index 184819c1d5c8..f7327979a8f8 100644
--- a/arch/mips/lib/memset.S
+++ b/arch/mips/lib/memset.S
@@ -258,7 +258,7 @@
 
 .Llast_fixup\@:
 	jr		ra
-	andi		v1, a2, STORMASK
+	 nop
 
 .Lsmall_fixup\@:
 	PTR_SUBU	a2, t1, a0

From b3d7e55c3f886493235bfee08e1e5a4a27cbcce8 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Tue, 17 Apr 2018 16:40:01 +0100
Subject: [PATCH 225/288] MIPS: uaccess: Add micromips clobbers to bzero
 invocation

The micromips implementation of bzero additionally clobbers registers t7
& t8. Specify this in the clobbers list when invoking bzero.

Fixes: 26c5e07d1478 ("MIPS: microMIPS: Optimise 'memset' core library function.")
Reported-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # 3.10+
Patchwork: https://patchwork.linux-mips.org/patch/19110/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/include/asm/uaccess.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index b71306947290..06629011a434 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -654,6 +654,13 @@ __clear_user(void __user *addr, __kernel_size_t size)
 {
 	__kernel_size_t res;
 
+#ifdef CONFIG_CPU_MICROMIPS
+/* micromips memset / bzero also clobbers t7 & t8 */
+#define bzero_clobbers "$4", "$5", "$6", __UA_t0, __UA_t1, "$15", "$24", "$31"
+#else
+#define bzero_clobbers "$4", "$5", "$6", __UA_t0, __UA_t1, "$31"
+#endif /* CONFIG_CPU_MICROMIPS */
+
 	if (eva_kernel_access()) {
 		__asm__ __volatile__(
 			"move\t$4, %1\n\t"
@@ -663,7 +670,7 @@ __clear_user(void __user *addr, __kernel_size_t size)
 			"move\t%0, $6"
 			: "=r" (res)
 			: "r" (addr), "r" (size)
-			: "$4", "$5", "$6", __UA_t0, __UA_t1, "$31");
+			: bzero_clobbers);
 	} else {
 		might_fault();
 		__asm__ __volatile__(
@@ -674,7 +681,7 @@ __clear_user(void __user *addr, __kernel_size_t size)
 			"move\t%0, $6"
 			: "=r" (res)
 			: "r" (addr), "r" (size)
-			: "$4", "$5", "$6", __UA_t0, __UA_t1, "$31");
+			: bzero_clobbers);
 	}
 
 	return res;

From 92d32170847bfff2dd08af2c016085779f2fd2a1 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 16 Apr 2018 21:10:14 +0200
Subject: [PATCH 226/288] btrfs: fix unaligned access in readdir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The last update to readdir introduced a temporary buffer to store the
emitted readdir data, but as there are file names of variable length,
there's a lot of unaligned access.

This was observed on a sparc64 machine:

  Kernel unaligned access at TPC[102f3080] btrfs_real_readdir+0x51c/0x718 [btrfs]

Fixes: 23b5ec74943 ("btrfs: fix readdir deadlock with pagefault")
CC: stable@vger.kernel.org # 4.14+
Reported-and-tested-by: René Rebe <rene@exactcode.com>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e064c49c9a9a..d241285a0d2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/uio.h>
 #include <linux/magic.h>
 #include <linux/iversion.h>
+#include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -5905,11 +5906,13 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
 		struct dir_entry *entry = addr;
 		char *name = (char *)(entry + 1);
 
-		ctx->pos = entry->offset;
-		if (!dir_emit(ctx, name, entry->name_len, entry->ino,
-			      entry->type))
+		ctx->pos = get_unaligned(&entry->offset);
+		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
+					 get_unaligned(&entry->ino),
+					 get_unaligned(&entry->type)))
 			return 1;
-		addr += sizeof(struct dir_entry) + entry->name_len;
+		addr += sizeof(struct dir_entry) +
+			get_unaligned(&entry->name_len);
 		ctx->pos++;
 	}
 	return 0;
@@ -5999,14 +6002,15 @@ again:
 		}
 
 		entry = addr;
-		entry->name_len = name_len;
+		put_unaligned(name_len, &entry->name_len);
 		name_ptr = (char *)(entry + 1);
 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
 				   name_len);
-		entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+		put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+				&entry->type);
 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
-		entry->ino = location.objectid;
-		entry->offset = found_key.offset;
+		put_unaligned(location.objectid, &entry->ino);
+		put_unaligned(found_key.offset, &entry->offset);
 		entries++;
 		addr += sizeof(struct dir_entry) + name_len;
 		total_len += sizeof(struct dir_entry) + name_len;

From 64e86fec54069266ba32be551d7b7f75e88ab60c Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 17 Apr 2018 17:40:00 -0600
Subject: [PATCH 227/288] net: qualcomm: rmnet: Fix warning seen with fill_info

When the last rmnet device attached to a real device is removed, the
real device is unregistered from rmnet. As a result, the real device
lookup fails resulting in a warning when the fill_info handler is
called as part of the rmnet device unregistration.

Fix this by returning the rmnet flags as 0 when no real device is
present.

WARNING: CPU: 0 PID: 1779 at net/core/rtnetlink.c:3254
rtmsg_ifinfo_build_skb+0xca/0x10d
Modules linked in:
CPU: 0 PID: 1779 Comm: ip Not tainted 4.16.0-11872-g7ce2367 #1
Stack:
 7fe655f0 60371ea3 00000000 00000000
 60282bc6 6006b116 7fe65600 60371ee8
 7fe65660 6003a68c 00000000 900000000
Call Trace:
 [<6006b116>] ? printk+0x0/0x94
 [<6001f375>] show_stack+0xfe/0x158
 [<60371ea3>] ? dump_stack_print_info+0xe8/0xf1
 [<60282bc6>] ? rtmsg_ifinfo_build_skb+0xca/0x10d
 [<6006b116>] ? printk+0x0/0x94
 [<60371ee8>] dump_stack+0x2a/0x2c
 [<6003a68c>] __warn+0x10e/0x13e
 [<6003a82c>] warn_slowpath_null+0x48/0x4f
 [<60282bc6>] rtmsg_ifinfo_build_skb+0xca/0x10d
 [<60282c4d>] rtmsg_ifinfo_event.part.37+0x1e/0x43
 [<60282c2f>] ? rtmsg_ifinfo_event.part.37+0x0/0x43
 [<60282d03>] rtmsg_ifinfo+0x24/0x28
 [<60264e86>] dev_close_many+0xba/0x119
 [<60282cdf>] ? rtmsg_ifinfo+0x0/0x28
 [<6027c225>] ? rtnl_is_locked+0x0/0x1c
 [<6026ca67>] rollback_registered_many+0x1ae/0x4ae
 [<600314be>] ? unblock_signals+0x0/0xae
 [<6026cdc0>] ? unregister_netdevice_queue+0x19/0xec
 [<6026ceec>] unregister_netdevice_many+0x21/0xa1
 [<6027c765>] rtnl_delete_link+0x3e/0x4e
 [<60280ecb>] rtnl_dellink+0x262/0x29c
 [<6027c241>] ? rtnl_get_link+0x0/0x3e
 [<6027f867>] rtnetlink_rcv_msg+0x235/0x274

Fixes: be81a85f5f87 ("net: qualcomm: rmnet: Implement fill_info")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index d33988570217..5f4e447c5dce 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -350,15 +350,16 @@ static int rmnet_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	real_dev = priv->real_dev;
 
-	if (!rmnet_is_real_dev_registered(real_dev))
-		return -ENODEV;
-
 	if (nla_put_u16(skb, IFLA_RMNET_MUX_ID, priv->mux_id))
 		goto nla_put_failure;
 
-	port = rmnet_get_port_rtnl(real_dev);
+	if (rmnet_is_real_dev_registered(real_dev)) {
+		port = rmnet_get_port_rtnl(real_dev);
+		f.flags = port->data_format;
+	} else {
+		f.flags = 0;
+	}
 
-	f.flags = port->data_format;
 	f.mask  = ~0;
 
 	if (nla_put(skb, IFLA_RMNET_FLAGS, sizeof(f), &f))

From 13a83eac373c49c0a081cbcd137e79210fe78acd Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 11 Apr 2018 13:37:58 +1000
Subject: [PATCH 228/288] powerpc/eeh: Fix enabling bridge MMIO windows

On boot we save the configuration space of PCIe bridges. We do this so
when we get an EEH event and everything gets reset that we can restore
them.

Unfortunately we save this state before we've enabled the MMIO space
on the bridges. Hence if we have to reset the bridge when we come back
MMIO is not enabled and we end up taking an PE freeze when the driver
starts accessing again.

This patch forces the memory/MMIO and bus mastering on when restoring
bridges on EEH. Ideally we'd do this correctly by saving the
configuration space writes later, but that will have to come later in
a larger EEH rewrite. For now we have this simple fix.

The original bug can be triggered on a boston machine by doing:
  echo 0x8000000000000000 > /sys/kernel/debug/powerpc/PCI0001/err_injct_outbound
On boston, this PHB has a PCIe switch on it.  Without this patch,
you'll see two EEH events, 1 expected and 1 the failure we are fixing
here. The second EEH event causes the anything under the PHB to
disappear (i.e. the i40e eth).

With this patch, only 1 EEH event occurs and devices properly recover.

Fixes: 652defed4875 ("powerpc/eeh: Check PCIe link after reset")
Cc: stable@vger.kernel.org # v3.11+
Reported-by: Pridhiviraj Paidipeddi <ppaidipe@linux.vnet.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/eeh_pe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 2d4956e97aa9..ee5a67d57aab 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -807,7 +807,8 @@ static void eeh_restore_bridge_bars(struct eeh_dev *edev)
 	eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]);
 
 	/* PCI Command: 0x4 */
-	eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1]);
+	eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1] |
+			      PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
 
 	/* Check the PCIe link is ready */
 	eeh_bridge_check_link(edev);

From ab60ee7bf9a84954f50a66a3d835860e80f99b7f Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Tue, 17 Apr 2018 12:17:05 -0700
Subject: [PATCH 229/288] cifs: smbd: Check for iov length on sending the last
 iov

When sending the last iov that breaks into smaller buffers to fit the
transfer size, it's necessary to check if this is the last iov.

If this is the latest iov, stop and proceed to send pages.

Signed-off-by: Long Li <longli@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/smbdirect.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5008af546dd1..d611ed0537fd 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -2194,6 +2194,8 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
 						goto done;
 				}
 				i++;
+				if (i == rqst->rq_nvec)
+					break;
 			}
 			start = i;
 			buflen = 0;

From 56376c5864f8ff4ba7c78a80ae857eee3b1d23d8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 19 Apr 2018 16:22:20 +1000
Subject: [PATCH 230/288] powerpc/kvm: Fix lockups when running KVM guests on
 Power8

When running KVM guests on Power8 we can see a lockup where one CPU
stops responding. This often leads to a message such as:

  watchdog: CPU 136 detected hard LOCKUP on other CPUs 72
  Task dump for CPU 72:
  qemu-system-ppc R  running task    10560 20917  20908 0x00040004

And then backtraces on other CPUs, such as:

  Task dump for CPU 48:
  ksmd            R  running task    10032  1519      2 0x00000804
  Call Trace:
    ...
    --- interrupt: 901 at smp_call_function_many+0x3c8/0x460
        LR = smp_call_function_many+0x37c/0x460
    pmdp_invalidate+0x100/0x1b0
    __split_huge_pmd+0x52c/0xdb0
    try_to_unmap_one+0x764/0x8b0
    rmap_walk_anon+0x15c/0x370
    try_to_unmap+0xb4/0x170
    split_huge_page_to_list+0x148/0xa30
    try_to_merge_one_page+0xc8/0x990
    try_to_merge_with_ksm_page+0x74/0xf0
    ksm_scan_thread+0x10ec/0x1ac0
    kthread+0x160/0x1a0
    ret_from_kernel_thread+0x5c/0x78

This is caused by commit 8c1c7fb0b5ec ("powerpc/64s/idle: avoid sync
for KVM state when waking from idle"), which added a check in
pnv_powersave_wakeup() to see if the kvm_hstate.hwthread_state is
already set to KVM_HWTHREAD_IN_KERNEL, and if so to skip the store and
test of kvm_hstate.hwthread_req.

The problem is that the primary does not set KVM_HWTHREAD_IN_KVM when
entering the guest, so it can then come out to cede with
KVM_HWTHREAD_IN_KERNEL set. It can then go idle in kvm_do_nap after
setting hwthread_req to 1, but because hwthread_state is still
KVM_HWTHREAD_IN_KERNEL we will skip the test of hwthread_req when we
wake up from idle and won't go to kvm_start_guest. From there the
thread will return somewhere garbage and crash.

Fix it by skipping the store of hwthread_state, but not the test of
hwthread_req, when coming out of idle. It's OK to skip the sync in
that case because hwthread_req will have been set on the same thread,
so there is no synchronisation required.

Fixes: 8c1c7fb0b5ec ("powerpc/64s/idle: avoid sync for KVM state when waking from idle")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 79d005445c6c..e734f6e45abc 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -553,12 +553,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	lbz	r0,HSTATE_HWTHREAD_STATE(r13)
 	cmpwi	r0,KVM_HWTHREAD_IN_KERNEL
-	beq	1f
+	beq	0f
 	li	r0,KVM_HWTHREAD_IN_KERNEL
 	stb	r0,HSTATE_HWTHREAD_STATE(r13)
 	/* Order setting hwthread_state vs. testing hwthread_req */
 	sync
-	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+0:	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r0,0
 	beq	1f
 	b	kvm_start_guest

From b658912cb023cd6f8e46963d29779903d3c10538 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 19 Apr 2018 09:25:15 +0200
Subject: [PATCH 231/288] HID: i2c-hid: fix inverted return value from
 i2c_hid_command()

i2c_hid_command() returns non-zero in error cases (the actual
errno). Error handling in for I2C_HID_QUIRK_RESEND_REPORT_DESCR
case in i2c_hid_resume() had the check inverted; fix that.

Fixes: 3e83eda467 ("HID: i2c-hid: Fix resume issue on Raydium touchscreen device")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/i2c-hid/i2c-hid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c
index 615a91ac93bd..963328674e93 100644
--- a/drivers/hid/i2c-hid/i2c-hid.c
+++ b/drivers/hid/i2c-hid/i2c-hid.c
@@ -1229,7 +1229,7 @@ static int i2c_hid_resume(struct device *dev)
 	 */
 	if (ihid->quirks & I2C_HID_QUIRK_RESEND_REPORT_DESCR) {
 		ret = i2c_hid_command(client, &hid_report_descr_cmd, NULL, 0);
-		if (!ret)
+		if (ret)
 			return ret;
 	}
 

From 3ce0d5aa265bcc0a4b281cb0cabf92491276101b Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Thu, 19 Apr 2018 13:29:04 +0800
Subject: [PATCH 232/288] ALSA: hda/realtek - set PINCFG_HEADSET_MIC to
 parse_flags

Otherwise, the pin will be regarded as microphone, and the jack name
is "Mic Phantom", it is always on in the pulseaudio even nothing is
plugged into the jack. So the UI is confusing to users since the
microphone always shows up in the UI even there is no microphone
plugged.

After adding this flag, the jack name is "Headset Mic Phantom", then
the pulseaudio can handle its detection correctly.

Fixes: f0ba9d699e5c ("ALSA: hda/realtek - Fix Dell headset Mic can't record")
Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index aef1f52db7d9..c3b63b7a4ba4 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6370,6 +6370,8 @@ static const struct hda_fixup alc269_fixups[] = {
 			{ 0x19, 0x01a1913c }, /* use as headset mic, without its own jack detect */
 			{ }
 		},
+		.chained = true,
+		.chain_id = ALC269_FIXUP_HEADSET_MIC
 	},
 };
 

From a3dafb2200bf3c13905a088e82ae11f1eb275a83 Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Thu, 19 Apr 2018 13:29:05 +0800
Subject: [PATCH 233/288] ALSA: hda/realtek - adjust the location of one mic

There are two front mics on this machine, if we don't adjust the
location for one of them, they will have the same mixer name,
pulseaudio can't handle this situation.

After applying this FIXUP, they will have different mixer name,
then pulseaudio can handle them correctly.

Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index c3b63b7a4ba4..fc77bf7a1544 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6575,6 +6575,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
+	SND_PCI_QUIRK(0x17aa, 0x3138, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
 	SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
 	SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),

From c3bca5d450b620dd3d36e14b5e1f43639fd47d6b Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 17 Apr 2018 14:57:42 -0700
Subject: [PATCH 234/288] posix-cpu-timers: Ensure set_process_cpu_timer is
 always evaluated

Commit a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer()
more robust") moved the check into the 'if' statement. Unfortunately,
it did so on the right side of an && which means that it may get short
circuited and never evaluated. This is easily reproduced with:

$ cat loop.c
void main() {
  struct rlimit res;
  /* set the CPU time limit */
  getrlimit(RLIMIT_CPU,&res);
  res.rlim_cur = 2;
  res.rlim_max = 2;
  setrlimit(RLIMIT_CPU,&res);

  while (1);
}

Which will hang forever instead of being killed. Fix this by pulling the
evaluation out of the if statement but checking the return value instead.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1568337
Fixes: a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer() more robust")
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: "Max R . P . Grossmann" <m@max.pm>
Cc: John Stultz <john.stultz@linaro.org>
Link: https://lkml.kernel.org/r/20180417215742.2521-1-labbott@redhat.com
---
 kernel/time/posix-cpu-timers.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2541bd89f20e..5a6251ac6f7a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   u64 *newval, u64 *oldval)
 {
 	u64 now;
+	int ret;
 
 	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
+	ret = cpu_timer_sample_group(clock_idx, tsk, &now);
 
-	if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
+	if (oldval && ret != -EINVAL) {
 		/*
 		 * We are setting itimer. The *oldval is absolute and we update
 		 * it to be relative, *newval argument is relative and we update

From 7407188489c62a7b5694bc75a6db2b82af94c9a5 Mon Sep 17 00:00:00 2001
From: Anson Huang <Anson.Huang@nxp.com>
Date: Thu, 19 Apr 2018 14:04:43 +0800
Subject: [PATCH 235/288] clocksource/imx-tpm: Correct -ETIME return condition
 check

The additional brakects added to tpm_set_next_event's return value
computation causes (int) forced type conversion NOT taking effect, and the
incorrect value return will cause various system timer issue, like RCU
stall etc..

Remove the additional brackets to make sure tpm_set_next_event always
returns correct value.

Fixes: 059ab7b82eec ("clocksource/drivers/imx-tpm: Add imx tpm timer support")
Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dong Aisheng <Aisheng.dong@nxp.com>
Cc: stable@vger.kernel.org
Cc: daniel.lezcano@linaro.org
Cc: Linux-imx@nxp.com
Link: https://lkml.kernel.org/r/1524117883-2484-1-git-send-email-Anson.Huang@nxp.com
---
 drivers/clocksource/timer-imx-tpm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/timer-imx-tpm.c b/drivers/clocksource/timer-imx-tpm.c
index 05d97a6871d8..6c8318470b48 100644
--- a/drivers/clocksource/timer-imx-tpm.c
+++ b/drivers/clocksource/timer-imx-tpm.c
@@ -114,7 +114,7 @@ static int tpm_set_next_event(unsigned long delta,
 	 * of writing CNT registers which may cause the min_delta event got
 	 * missed, so we need add a ETIME check here in case it happened.
 	 */
-	return (int)((next - now) <= 0) ? -ETIME : 0;
+	return (int)(next - now) <= 0 ? -ETIME : 0;
 }
 
 static int tpm_set_state_oneshot(struct clock_event_device *evt)

From 0cbc94daa55441c21999e96a07061952d873dcb7 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Wed, 18 Apr 2018 20:20:57 +0200
Subject: [PATCH 236/288] mmc: renesas_sdhi_internal_dmac: limit DMA RX for old
 SoCs

Early revisions of certain SoCs cannot do multiple DMA RX streams in
parallel. To avoid data corruption, only allow one DMA RX channel and
fall back to PIO, if needed.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Tested-by: Nguyen Viet Dung <dung.nguyen.aj@renesas.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Cc: stable@vger.kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/renesas_sdhi_internal_dmac.c | 39 ++++++++++++++++---
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c
index 8e0acd197c43..6af946d16d24 100644
--- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c
+++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
@@ -62,6 +63,17 @@
  *   need a custom accessor.
  */
 
+static unsigned long global_flags;
+/*
+ * Workaround for avoiding to use RX DMAC by multiple channels.
+ * On R-Car H3 ES1.* and M3-W ES1.0, when multiple SDHI channels use
+ * RX DMAC simultaneously, sometimes hundreds of bytes data are not
+ * stored into the system memory even if the DMAC interrupt happened.
+ * So, this driver then uses one RX DMAC channel only.
+ */
+#define SDHI_INTERNAL_DMAC_ONE_RX_ONLY	0
+#define SDHI_INTERNAL_DMAC_RX_IN_USE	1
+
 /* Definitions for sampling clocks */
 static struct renesas_sdhi_scc rcar_gen3_scc_taps[] = {
 	{
@@ -126,6 +138,9 @@ renesas_sdhi_internal_dmac_abort_dma(struct tmio_mmc_host *host) {
 	renesas_sdhi_internal_dmac_dm_write(host, DM_CM_RST,
 					    RST_RESERVED_BITS | val);
 
+	if (host->data && host->data->flags & MMC_DATA_READ)
+		clear_bit(SDHI_INTERNAL_DMAC_RX_IN_USE, &global_flags);
+
 	renesas_sdhi_internal_dmac_enable_dma(host, true);
 }
 
@@ -155,6 +170,9 @@ renesas_sdhi_internal_dmac_start_dma(struct tmio_mmc_host *host,
 	if (data->flags & MMC_DATA_READ) {
 		dtran_mode |= DTRAN_MODE_CH_NUM_CH1;
 		dir = DMA_FROM_DEVICE;
+		if (test_bit(SDHI_INTERNAL_DMAC_ONE_RX_ONLY, &global_flags) &&
+		    test_and_set_bit(SDHI_INTERNAL_DMAC_RX_IN_USE, &global_flags))
+			goto force_pio;
 	} else {
 		dtran_mode |= DTRAN_MODE_CH_NUM_CH0;
 		dir = DMA_TO_DEVICE;
@@ -208,6 +226,9 @@ static void renesas_sdhi_internal_dmac_complete_tasklet_fn(unsigned long arg)
 	renesas_sdhi_internal_dmac_enable_dma(host, false);
 	dma_unmap_sg(&host->pdev->dev, host->sg_ptr, host->sg_len, dir);
 
+	if (dir == DMA_FROM_DEVICE)
+		clear_bit(SDHI_INTERNAL_DMAC_RX_IN_USE, &global_flags);
+
 	tmio_mmc_do_data_irq(host);
 out:
 	spin_unlock_irq(&host->lock);
@@ -251,18 +272,24 @@ static const struct tmio_mmc_dma_ops renesas_sdhi_internal_dmac_dma_ops = {
  * implementation as others may use a different implementation.
  */
 static const struct soc_device_attribute gen3_soc_whitelist[] = {
-        { .soc_id = "r8a7795", .revision = "ES1.*" },
-        { .soc_id = "r8a7795", .revision = "ES2.0" },
-        { .soc_id = "r8a7796", .revision = "ES1.0" },
-        { .soc_id = "r8a77995", .revision = "ES1.0" },
-        { /* sentinel */ }
+	{ .soc_id = "r8a7795", .revision = "ES1.*",
+	  .data = (void *)BIT(SDHI_INTERNAL_DMAC_ONE_RX_ONLY) },
+	{ .soc_id = "r8a7795", .revision = "ES2.0" },
+	{ .soc_id = "r8a7796", .revision = "ES1.0",
+	  .data = (void *)BIT(SDHI_INTERNAL_DMAC_ONE_RX_ONLY) },
+	{ .soc_id = "r8a77995", .revision = "ES1.0" },
+	{ /* sentinel */ }
 };
 
 static int renesas_sdhi_internal_dmac_probe(struct platform_device *pdev)
 {
-	if (!soc_device_match(gen3_soc_whitelist))
+	const struct soc_device_attribute *soc = soc_device_match(gen3_soc_whitelist);
+
+	if (!soc)
 		return -ENODEV;
 
+	global_flags |= (unsigned long)soc->data;
+
 	return renesas_sdhi_probe(pdev, &renesas_sdhi_internal_dmac_dma_ops);
 }
 

From d78fd7255881645fc645e23145d469385227170d Mon Sep 17 00:00:00 2001
From: Harry Wentland <harry.wentland@amd.com>
Date: Thu, 12 Apr 2018 16:37:09 -0400
Subject: [PATCH 237/288] drm/amd/display: Don't program bypass on linear
 regamma LUT

Even though this is required for degamma since DCE HW only supports a
couple predefined LUTs we can just program the LUT directly for regamma.

This fixes dark screens which occurs when we program regamma to bypass
while degamma is using srgb LUT.

Signed-off-by: Harry Wentland <harry.wentland@amd.com>
Reviewed-by: Leo Li <sunpeng.li@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
index f6cb502c303f..25f064c01038 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
@@ -138,13 +138,6 @@ int amdgpu_dm_set_regamma_lut(struct dm_crtc_state *crtc)
 	lut = (struct drm_color_lut *)blob->data;
 	lut_size = blob->length / sizeof(struct drm_color_lut);
 
-	if (__is_lut_linear(lut, lut_size)) {
-		/* Set to bypass if lut is set to linear */
-		stream->out_transfer_func->type = TF_TYPE_BYPASS;
-		stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
-		return 0;
-	}
-
 	gamma = dc_create_gamma();
 	if (!gamma)
 		return -ENOMEM;

From 84f8508f717268c333de8e472f351d6a7a487e51 Mon Sep 17 00:00:00 2001
From: Rex Zhu <Rex.Zhu@amd.com>
Date: Tue, 17 Apr 2018 17:26:26 +0800
Subject: [PATCH 238/288] drm/amd/pp: Fix bug voltage can't be OD separately on
 VI

Make sure to update the MCLK and SCLK flags when setting the VDDC
flags due to dependencies.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Rex Zhu <Rex.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c
index add90675fd2a..26fbeafc3c96 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c
@@ -4743,23 +4743,27 @@ static void smu7_check_dpm_table_updated(struct pp_hwmgr *hwmgr)
 
 	for (i=0; i < dep_table->count; i++) {
 		if (dep_table->entries[i].vddc != odn_dep_table->entries[i].vddc) {
-			data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_VDDC;
-			break;
+			data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_VDDC | DPMTABLE_OD_UPDATE_MCLK;
+			return;
 		}
 	}
-	if (i == dep_table->count)
+	if (i == dep_table->count && data->need_update_smu7_dpm_table & DPMTABLE_OD_UPDATE_VDDC) {
 		data->need_update_smu7_dpm_table &= ~DPMTABLE_OD_UPDATE_VDDC;
+		data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_MCLK;
+	}
 
 	dep_table = table_info->vdd_dep_on_sclk;
 	odn_dep_table = (struct phm_ppt_v1_clock_voltage_dependency_table *)&(odn_table->vdd_dependency_on_sclk);
 	for (i=0; i < dep_table->count; i++) {
 		if (dep_table->entries[i].vddc != odn_dep_table->entries[i].vddc) {
-			data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_VDDC;
-			break;
+			data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_VDDC | DPMTABLE_OD_UPDATE_SCLK;
+			return;
 		}
 	}
-	if (i == dep_table->count)
+	if (i == dep_table->count && data->need_update_smu7_dpm_table & DPMTABLE_OD_UPDATE_VDDC) {
 		data->need_update_smu7_dpm_table &= ~DPMTABLE_OD_UPDATE_VDDC;
+		data->need_update_smu7_dpm_table |= DPMTABLE_OD_UPDATE_SCLK;
+	}
 }
 
 static int smu7_odn_edit_dpm_table(struct pp_hwmgr *hwmgr,

From cc9e992dfb5bb48f59f3fbc1268d3f38d2c86ef3 Mon Sep 17 00:00:00 2001
From: Kenneth Feng <kenneth.feng@amd.com>
Date: Tue, 17 Apr 2018 21:49:51 +0800
Subject: [PATCH 239/288] drm/amd/powerplay: header file interface to SMU
 update

update vega12 smu interface.

Signed-off-by: Kenneth Feng <kenneth.feng@amd.com>
Reviewed-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/powerplay/inc/vega12/smu9_driver_if.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/inc/vega12/smu9_driver_if.h b/drivers/gpu/drm/amd/powerplay/inc/vega12/smu9_driver_if.h
index fb696e3d06cf..2f8a3b983cce 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/vega12/smu9_driver_if.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/vega12/smu9_driver_if.h
@@ -412,8 +412,10 @@ typedef struct {
   QuadraticInt_t    ReservedEquation2;
   QuadraticInt_t    ReservedEquation3;
 
+	uint16_t     MinVoltageUlvGfx;
+	uint16_t     MinVoltageUlvSoc;
 
-  uint32_t     Reserved[15];
+	uint32_t     Reserved[14];
 
 
 

From 8a9fd8323087e794f1d3cd4850b393ced048bc73 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Wed, 18 Apr 2018 16:05:18 -0600
Subject: [PATCH 240/288] coresight: Move to SPDX identifier

Move CoreSight headers to the SPDX identifier.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1524089118-27595-1-git-send-email-mathieu.poirier@linaro.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/coresight-pmu.h                   | 13 +------------
 tools/include/linux/coresight-pmu.h             | 13 +------------
 tools/perf/arch/arm/util/auxtrace.c             | 13 +------------
 tools/perf/arch/arm/util/cs-etm.c               | 13 +------------
 tools/perf/arch/arm/util/cs-etm.h               | 13 +------------
 tools/perf/arch/arm/util/pmu.c                  | 13 +------------
 tools/perf/util/cs-etm-decoder/cs-etm-decoder.c |  3 +--
 tools/perf/util/cs-etm.c                        |  3 +--
 tools/perf/util/cs-etm.h                        | 13 +------------
 9 files changed, 9 insertions(+), 88 deletions(-)

diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index edfeaba95429..a1a959ba24ff 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -1,18 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _LINUX_CORESIGHT_PMU_H
diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h
index edfeaba95429..a1a959ba24ff 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -1,18 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _LINUX_CORESIGHT_PMU_H
diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
index fa639e3e52ac..1ce6bdbda561 100644
--- a/tools/perf/arch/arm/util/auxtrace.c
+++ b/tools/perf/arch/arm/util/auxtrace.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <stdbool.h>
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 5c655ad4621e..2f595cd73da6 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <api/fs/fs.h>
diff --git a/tools/perf/arch/arm/util/cs-etm.h b/tools/perf/arch/arm/util/cs-etm.h
index 5256741be549..1a12e64f5127 100644
--- a/tools/perf/arch/arm/util/cs-etm.h
+++ b/tools/perf/arch/arm/util/cs-etm.h
@@ -1,18 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef INCLUDE__PERF_CS_ETM_H__
diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c
index ac4dffc807b8..e047571e6080 100644
--- a/tools/perf/arch/arm/util/pmu.c
+++ b/tools/perf/arch/arm/util/pmu.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <string.h>
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 640af88331b4..c8b98fa22997 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * SPDX-License-Identifier: GPL-2.0
- *
  * Copyright(C) 2015-2018 Linaro Limited.
  *
  * Author: Tor Jeremiassen <tor@ti.com>
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 1b0d422373be..40020b1ca54f 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * SPDX-License-Identifier: GPL-2.0
- *
  * Copyright(C) 2015-2018 Linaro Limited.
  *
  * Author: Tor Jeremiassen <tor@ti.com>
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 5864d5dca616..37f8d48179ca 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -1,18 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright(C) 2015 Linaro Limited. All rights reserved.
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef INCLUDE__UTIL_PERF_CS_ETM_H__

From 8a56ef4f3ffba9ebf4967b61ef600b0a7ba10f11 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 19 Apr 2018 18:16:15 +0200
Subject: [PATCH 241/288] ALSA: rawmidi: Fix missing input substream checks in
 compat ioctls

Some rawmidi compat ioctls lack of the input substream checks
(although they do check only for rfile->output).  This many eventually
lead to an Oops as NULL substream is passed to the rawmidi core
functions.

Fix it by adding the proper checks before each function call.

The bug was spotted by syzkaller.

Reported-by: syzbot+f7a0348affc3b67bc617@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/rawmidi_compat.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/sound/core/rawmidi_compat.c b/sound/core/rawmidi_compat.c
index f69764d7cdd7..e30e30ba6e39 100644
--- a/sound/core/rawmidi_compat.c
+++ b/sound/core/rawmidi_compat.c
@@ -36,8 +36,6 @@ static int snd_rawmidi_ioctl_params_compat(struct snd_rawmidi_file *rfile,
 	struct snd_rawmidi_params params;
 	unsigned int val;
 
-	if (rfile->output == NULL)
-		return -EINVAL;
 	if (get_user(params.stream, &src->stream) ||
 	    get_user(params.buffer_size, &src->buffer_size) ||
 	    get_user(params.avail_min, &src->avail_min) ||
@@ -46,8 +44,12 @@ static int snd_rawmidi_ioctl_params_compat(struct snd_rawmidi_file *rfile,
 	params.no_active_sensing = val;
 	switch (params.stream) {
 	case SNDRV_RAWMIDI_STREAM_OUTPUT:
+		if (!rfile->output)
+			return -EINVAL;
 		return snd_rawmidi_output_params(rfile->output, &params);
 	case SNDRV_RAWMIDI_STREAM_INPUT:
+		if (!rfile->input)
+			return -EINVAL;
 		return snd_rawmidi_input_params(rfile->input, &params);
 	}
 	return -EINVAL;
@@ -67,16 +69,18 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile,
 	int err;
 	struct snd_rawmidi_status status;
 
-	if (rfile->output == NULL)
-		return -EINVAL;
 	if (get_user(status.stream, &src->stream))
 		return -EFAULT;
 
 	switch (status.stream) {
 	case SNDRV_RAWMIDI_STREAM_OUTPUT:
+		if (!rfile->output)
+			return -EINVAL;
 		err = snd_rawmidi_output_status(rfile->output, &status);
 		break;
 	case SNDRV_RAWMIDI_STREAM_INPUT:
+		if (!rfile->input)
+			return -EINVAL;
 		err = snd_rawmidi_input_status(rfile->input, &status);
 		break;
 	default:
@@ -112,16 +116,18 @@ static int snd_rawmidi_ioctl_status_x32(struct snd_rawmidi_file *rfile,
 	int err;
 	struct snd_rawmidi_status status;
 
-	if (rfile->output == NULL)
-		return -EINVAL;
 	if (get_user(status.stream, &src->stream))
 		return -EFAULT;
 
 	switch (status.stream) {
 	case SNDRV_RAWMIDI_STREAM_OUTPUT:
+		if (!rfile->output)
+			return -EINVAL;
 		err = snd_rawmidi_output_status(rfile->output, &status);
 		break;
 	case SNDRV_RAWMIDI_STREAM_INPUT:
+		if (!rfile->input)
+			return -EINVAL;
 		err = snd_rawmidi_input_status(rfile->input, &status);
 		break;
 	default:

From bb9aaaa1849eed763c6b7f20227a8a03300d4421 Mon Sep 17 00:00:00 2001
From: sunlianwen <sunlw.fnst@cn.fujitsu.com>
Date: Wed, 18 Apr 2018 09:22:39 +0800
Subject: [PATCH 242/288] net: change the comment of dev_mc_init

The comment of dev_mc_init() is wrong. which use dev_mc_flush
instead of dev_mc_init.

Signed-off-by: Lianwen Sun <sunlw.fnst@cn.fujitsu.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_addr_lists.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e3e6a3e2ca22..d884d8f5f0e5 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -839,7 +839,7 @@ void dev_mc_flush(struct net_device *dev)
 EXPORT_SYMBOL(dev_mc_flush);
 
 /**
- *	dev_mc_flush - Init multicast address list
+ *	dev_mc_init - Init multicast address list
  *	@dev: device
  *
  *	Init multicast address list.

From da42bb271305d68df6cbf99eed90542f1f1ee1c9 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 18 Apr 2018 11:14:44 +0200
Subject: [PATCH 243/288] net: mvpp2: Fix DMA address mask size

PPv2 TX/RX descriptors uses 40bits DMA addresses, but 41 bits masks were
used (GENMASK_ULL(40, 0)).

This commit fixes that by using the correct mask.

Fixes: e7c5359f2eed ("net: mvpp2: introduce PPv2.2 HW descriptors and adapt accessors")
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 9deb79b6dcc8..4202f9b5b966 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -916,6 +916,8 @@ static struct {
 
 #define MVPP2_MIB_COUNTERS_STATS_DELAY		(1 * HZ)
 
+#define MVPP2_DESC_DMA_MASK	DMA_BIT_MASK(40)
+
 /* Definitions */
 
 /* Shared Packet Processor resources */
@@ -1429,7 +1431,7 @@ static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return tx_desc->pp21.buf_dma_addr;
 	else
-		return tx_desc->pp22.buf_dma_addr_ptp & GENMASK_ULL(40, 0);
+		return tx_desc->pp22.buf_dma_addr_ptp & MVPP2_DESC_DMA_MASK;
 }
 
 static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
@@ -1447,7 +1449,7 @@ static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
 	} else {
 		u64 val = (u64)addr;
 
-		tx_desc->pp22.buf_dma_addr_ptp &= ~GENMASK_ULL(40, 0);
+		tx_desc->pp22.buf_dma_addr_ptp &= ~MVPP2_DESC_DMA_MASK;
 		tx_desc->pp22.buf_dma_addr_ptp |= val;
 		tx_desc->pp22.packet_offset = offset;
 	}
@@ -1507,7 +1509,7 @@ static dma_addr_t mvpp2_rxdesc_dma_addr_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return rx_desc->pp21.buf_dma_addr;
 	else
-		return rx_desc->pp22.buf_dma_addr_key_hash & GENMASK_ULL(40, 0);
+		return rx_desc->pp22.buf_dma_addr_key_hash & MVPP2_DESC_DMA_MASK;
 }
 
 static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
@@ -1516,7 +1518,7 @@ static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
 	if (port->priv->hw_version == MVPP21)
 		return rx_desc->pp21.buf_cookie;
 	else
-		return rx_desc->pp22.buf_cookie_misc & GENMASK_ULL(40, 0);
+		return rx_desc->pp22.buf_cookie_misc & MVPP2_DESC_DMA_MASK;
 }
 
 static size_t mvpp2_rxdesc_size_get(struct mvpp2_port *port,
@@ -8789,7 +8791,7 @@ static int mvpp2_probe(struct platform_device *pdev)
 	}
 
 	if (priv->hw_version == MVPP22) {
-		err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(40));
+		err = dma_set_mask(&pdev->dev, MVPP2_DESC_DMA_MASK);
 		if (err)
 			goto err_mg_clk;
 		/* Sadly, the BM pools all share the same register to

From 565020aaeebfa7c8b3ec077bee38f4c15acc9905 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Wed, 18 Apr 2018 10:57:55 +0100
Subject: [PATCH 244/288] net: stmmac: Disable ACS Feature for GMAC >= 4

ACS Feature is currently enabled for GMAC >= 4 but the llc_snap status
is never checked in descriptor rx_status callback. This will cause
stmmac to always strip packets even that ACS feature is already
stripping them.

Lets be safe and disable the ACS feature for GMAC >= 4 and always strip
the packets for this GMAC version.

Fixes: 477286b53f55 ("stmmac: add GMAC4 core support")
Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 -------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++++-
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index c7bff596c665..dedd40613090 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -347,7 +347,7 @@ enum power_event {
 #define MTL_RX_OVERFLOW_INT		BIT(16)
 
 /* Default operating mode of the MAC */
-#define GMAC_CORE_INIT (GMAC_CONFIG_JD | GMAC_CONFIG_PS | GMAC_CONFIG_ACS | \
+#define GMAC_CORE_INIT (GMAC_CONFIG_JD | GMAC_CONFIG_PS | \
 			GMAC_CONFIG_BE | GMAC_CONFIG_DCRS)
 
 /* To dump the core regs excluding  the Address Registers */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index a3af92ebbca8..517b1f6736a8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -31,13 +31,6 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	value |= GMAC_CORE_INIT;
 
-	/* Clear ACS bit because Ethernet switch tagging formats such as
-	 * Broadcom tags can look like invalid LLC/SNAP packets and cause the
-	 * hardware to truncate packets on reception.
-	 */
-	if (netdev_uses_dsa(dev))
-		value &= ~GMAC_CONFIG_ACS;
-
 	if (mtu > 1500)
 		value |= GMAC_CONFIG_2K;
 	if (mtu > 2000)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 9a16931ce39d..b65e2d144698 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3495,8 +3495,13 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 
 			/* ACS is set; GMAC core strips PAD/FCS for IEEE 802.3
 			 * Type frames (LLC/LLC-SNAP)
+			 *
+			 * llc_snap is never checked in GMAC >= 4, so this ACS
+			 * feature is always disabled and packets need to be
+			 * stripped manually.
 			 */
-			if (unlikely(status != llc_snap))
+			if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00) ||
+			    unlikely(status != llc_snap))
 				frame_len -= ETH_FCS_LEN;
 
 			if (netif_msg_rx_status(priv)) {

From 5e84b38b07e676fcd3ab6e296780b4f77a29d09f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 18 Apr 2018 12:00:08 +0100
Subject: [PATCH 245/288] net: caif: fix spelling mistake "UKNOWN" -> "UNKNOWN"

Trivial fix to spelling mistake

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/chnl_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 53ecda10b790..13e2ae6be620 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -174,7 +174,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
 		flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
 		flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
 		flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
-		 "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND");
+		 "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND");
 
 
 

From 4ec7eb3ff6eb5c9af3a84288a8d808a857fbc22b Mon Sep 17 00:00:00 2001
From: Pawel Dembicki <paweldembicki@gmail.com>
Date: Wed, 18 Apr 2018 16:03:24 +0200
Subject: [PATCH 246/288] net: qmi_wwan: add Wistron Neweb D19Q1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This modem is embedded on dlink dwr-960 router.
The oem configuration states:

T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0
D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1
P: Vendor=1435 ProdID=d191 Rev=ff.ff
S: Manufacturer=Android
S: Product=Android
S: SerialNumber=0123456789ABCDEF
C:* #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none)
E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none)
E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none)
E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms
E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms
E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 5 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=(none)
E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=125us

Tested on openwrt distribution

Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index ca066b785e9f..c853e7410f5a 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1107,6 +1107,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x1435, 0xd181, 3)},	/* Wistron NeWeb D18Q1 */
 	{QMI_FIXED_INTF(0x1435, 0xd181, 4)},	/* Wistron NeWeb D18Q1 */
 	{QMI_FIXED_INTF(0x1435, 0xd181, 5)},	/* Wistron NeWeb D18Q1 */
+	{QMI_FIXED_INTF(0x1435, 0xd191, 4)},	/* Wistron NeWeb D19Q1 */
 	{QMI_FIXED_INTF(0x16d8, 0x6003, 0)},	/* CMOTech 6003 */
 	{QMI_FIXED_INTF(0x16d8, 0x6007, 0)},	/* CMOTech CHE-628S */
 	{QMI_FIXED_INTF(0x16d8, 0x6008, 0)},	/* CMOTech CMU-301 */

From f3335545b34315fc42cc03a83165bdd26d956584 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 18 Apr 2018 16:55:05 +0100
Subject: [PATCH 247/288] atm: iphase: fix spelling mistake: "Tansmit" ->
 "Transmit"

Trivial fix to spelling mistake in message text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/iphase.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 44abb8a0a5e5..be076606d30e 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -671,7 +671,7 @@ static void ia_tx_poll (IADEV *iadev) {
           if ((vcc->pop) && (skb1->len != 0))
           {
              vcc->pop(vcc, skb1);
-             IF_EVENT(printk("Tansmit Done - skb 0x%lx return\n",
+             IF_EVENT(printk("Transmit Done - skb 0x%lx return\n",
                                                           (long)skb1);)
           }
           else 
@@ -1665,7 +1665,7 @@ static void tx_intr(struct atm_dev *dev)
 	status = readl(iadev->seg_reg+SEG_INTR_STATUS_REG);  
         if (status & TRANSMIT_DONE){
 
-           IF_EVENT(printk("Tansmit Done Intr logic run\n");)
+           IF_EVENT(printk("Transmit Done Intr logic run\n");)
            spin_lock_irqsave(&iadev->tx_lock, flags);
            ia_tx_poll(iadev);
            spin_unlock_irqrestore(&iadev->tx_lock, flags);

From 02b94fc70ffe320a7799c35e09372809e40b7131 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Wed, 18 Apr 2018 10:14:13 -0600
Subject: [PATCH 248/288] MAINTAINERS: Direct networking documentation changes
 to netdev

Networking docs changes go through the networking tree, so patch the
MAINTAINERS file to direct authors to the right place.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b60179d948bb..1624f7d92364 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9751,6 +9751,7 @@ F:	include/uapi/linux/net_namespace.h
 F:	tools/testing/selftests/net/
 F:	lib/net_utils.c
 F:	lib/random32.c
+F:	Documentation/networking/
 
 NETWORKING [IPSEC]
 M:	Steffen Klassert <steffen.klassert@secunet.com>

From f7e43672683b097bb074a8fe7af9bc600a23f231 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 18 Apr 2018 11:51:56 -0700
Subject: [PATCH 249/288] llc: hold llc_sap before release_sock()

syzbot reported we still access llc->sap in llc_backlog_rcv()
after it is freed in llc_sap_remove_socket():

Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
 llc_conn_ac_send_sabme_cmd_p_set_x+0x3a8/0x460 net/llc/llc_c_ac.c:785
 llc_exec_conn_trans_actions net/llc/llc_conn.c:475 [inline]
 llc_conn_service net/llc/llc_conn.c:400 [inline]
 llc_conn_state_process+0x4e1/0x13a0 net/llc/llc_conn.c:75
 llc_backlog_rcv+0x195/0x1e0 net/llc/llc_conn.c:891
 sk_backlog_rcv include/net/sock.h:909 [inline]
 __release_sock+0x12f/0x3a0 net/core/sock.c:2335
 release_sock+0xa4/0x2b0 net/core/sock.c:2850
 llc_ui_release+0xc8/0x220 net/llc/af_llc.c:204

llc->sap is refcount'ed and llc_sap_remove_socket() is paired
with llc_sap_add_socket(). This can be amended by holding its refcount
before llc_sap_remove_socket() and releasing it after release_sock().

Reported-by: <syzbot+6e181fc95081c2cf9051@syzkaller.appspotmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 01dcc0823d1f..6d29b2b94e84 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -189,6 +189,7 @@ static int llc_ui_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct llc_sock *llc;
+	struct llc_sap *sap;
 
 	if (unlikely(sk == NULL))
 		goto out;
@@ -199,9 +200,15 @@ static int llc_ui_release(struct socket *sock)
 		llc->laddr.lsap, llc->daddr.lsap);
 	if (!llc_send_disc(sk))
 		llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+	sap = llc->sap;
+	/* Hold this for release_sock(), so that llc_backlog_rcv() could still
+	 * use it.
+	 */
+	llc_sap_hold(sap);
 	if (!sock_flag(sk, SOCK_ZAPPED))
 		llc_sap_remove_socket(llc->sap, sk);
 	release_sock(sk);
+	llc_sap_put(sap);
 	if (llc->dev)
 		dev_put(llc->dev);
 	sock_put(sk);

From 65ec0bd1c7c14522670a5294de35710fb577a7fd Mon Sep 17 00:00:00 2001
From: Ronak Doshi <doshir@vmware.com>
Date: Wed, 18 Apr 2018 12:48:04 -0700
Subject: [PATCH 250/288] vmxnet3: fix incorrect dereference when rxvlan is
 disabled

vmxnet3_get_hdr_len() is used to calculate the header length which in
turn is used to calculate the gso_size for skb. When rxvlan offload is
disabled, vlan tag is present in the header and the function references
ip header from sizeof(ethhdr) and leads to incorrect pointer reference.

This patch fixes this issue by taking sizeof(vlan_ethhdr) into account
if vlan tag is present and correctly references the ip hdr.

Signed-off-by: Ronak Doshi <doshir@vmware.com>
Acked-by: Guolin Yang <gyang@vmware.com>
Acked-by: Louis Luo <llouis@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 17 +++++++++++++----
 drivers/net/vmxnet3/vmxnet3_int.h |  4 ++--
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index e04937f44f33..9ebe2a689966 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1218,6 +1218,7 @@ vmxnet3_get_hdr_len(struct vmxnet3_adapter *adapter, struct sk_buff *skb,
 	union {
 		void *ptr;
 		struct ethhdr *eth;
+		struct vlan_ethhdr *veth;
 		struct iphdr *ipv4;
 		struct ipv6hdr *ipv6;
 		struct tcphdr *tcp;
@@ -1228,16 +1229,24 @@ vmxnet3_get_hdr_len(struct vmxnet3_adapter *adapter, struct sk_buff *skb,
 	if (unlikely(sizeof(struct iphdr) + sizeof(struct tcphdr) > maplen))
 		return 0;
 
+	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+	    skb->protocol == cpu_to_be16(ETH_P_8021AD))
+		hlen = sizeof(struct vlan_ethhdr);
+	else
+		hlen = sizeof(struct ethhdr);
+
 	hdr.eth = eth_hdr(skb);
 	if (gdesc->rcd.v4) {
-		BUG_ON(hdr.eth->h_proto != htons(ETH_P_IP));
-		hdr.ptr += sizeof(struct ethhdr);
+		BUG_ON(hdr.eth->h_proto != htons(ETH_P_IP) &&
+		       hdr.veth->h_vlan_encapsulated_proto != htons(ETH_P_IP));
+		hdr.ptr += hlen;
 		BUG_ON(hdr.ipv4->protocol != IPPROTO_TCP);
 		hlen = hdr.ipv4->ihl << 2;
 		hdr.ptr += hdr.ipv4->ihl << 2;
 	} else if (gdesc->rcd.v6) {
-		BUG_ON(hdr.eth->h_proto != htons(ETH_P_IPV6));
-		hdr.ptr += sizeof(struct ethhdr);
+		BUG_ON(hdr.eth->h_proto != htons(ETH_P_IPV6) &&
+		       hdr.veth->h_vlan_encapsulated_proto != htons(ETH_P_IPV6));
+		hdr.ptr += hlen;
 		/* Use an estimated value, since we also need to handle
 		 * TSO case.
 		 */
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 59ec34052a65..a3326463b71f 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.13.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.14.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040d00
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040e00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */

From ab913455dd59b81204b6a0d387a44697b0e0bd85 Mon Sep 17 00:00:00 2001
From: Olivier Gayot <olivier.gayot@sigexec.com>
Date: Wed, 18 Apr 2018 22:03:06 +0200
Subject: [PATCH 251/288] docs: ip-sysctl.txt: fix name of some ipv6 variables

The name of the following proc/sysctl entries were incorrectly
documented:

    /proc/sys/net/ipv6/conf/<interface>/max_dst_opts_number
    /proc/sys/net/ipv6/conf/<interface>/max_hbt_opts_number
    /proc/sys/net/ipv6/conf/<interface>/max_dst_opts_length
    /proc/sys/net/ipv6/conf/<interface>/max_hbt_length

Their name was set to the name of the symbol in the .data field of the
control table instead of their .proc name.

Signed-off-by: Olivier Gayot <olivier.gayot@sigexec.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5dc1a040a2f1..b583a73cf95f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1390,26 +1390,26 @@ mld_qrv - INTEGER
 	Default: 2 (as specified by RFC3810 9.1)
 	Minimum: 1 (as specified by RFC6636 4.5)
 
-max_dst_opts_cnt - INTEGER
+max_dst_opts_number - INTEGER
 	Maximum number of non-padding TLVs allowed in a Destination
 	options extension header. If this value is less than zero
 	then unknown options are disallowed and the number of known
 	TLVs allowed is the absolute value of this number.
 	Default: 8
 
-max_hbh_opts_cnt - INTEGER
+max_hbh_opts_number - INTEGER
 	Maximum number of non-padding TLVs allowed in a Hop-by-Hop
 	options extension header. If this value is less than zero
 	then unknown options are disallowed and the number of known
 	TLVs allowed is the absolute value of this number.
 	Default: 8
 
-max dst_opts_len - INTEGER
+max_dst_opts_length - INTEGER
 	Maximum length allowed for a Destination options extension
 	header.
 	Default: INT_MAX (unlimited)
 
-max hbh_opts_len - INTEGER
+max_hbh_length - INTEGER
 	Maximum length allowed for a Hop-by-Hop options extension
 	header.
 	Default: INT_MAX (unlimited)

From f4ea89110df237da6fbcaab76af431e85f07d904 Mon Sep 17 00:00:00 2001
From: dann frazier <dann.frazier@canonical.com>
Date: Wed, 18 Apr 2018 21:55:41 -0600
Subject: [PATCH 252/288] net: hns: Avoid action name truncation

When longer interface names are used, the action names exposed in
/proc/interrupts and /proc/irq/* maybe truncated. For example, when
using the predictable name algorithm in systemd on a HiSilicon D05,
I see:

  ubuntu@d05-3:~$  grep enahisic2i0-tx /proc/interrupts | sed 's/.* //'
  enahisic2i0-tx0
  enahisic2i0-tx1
  [...]
  enahisic2i0-tx8
  enahisic2i0-tx9
  enahisic2i0-tx1
  enahisic2i0-tx1
  enahisic2i0-tx1
  enahisic2i0-tx1
  enahisic2i0-tx1
  enahisic2i0-tx1

Increase the max ring name length to allow for an interface name
of IFNAMSIZE. After this change, I now see:

  $ grep enahisic2i0-tx /proc/interrupts | sed 's/.* //'
  enahisic2i0-tx0
  enahisic2i0-tx1
  enahisic2i0-tx2
  [...]
  enahisic2i0-tx8
  enahisic2i0-tx9
  enahisic2i0-tx10
  enahisic2i0-tx11
  enahisic2i0-tx12
  enahisic2i0-tx13
  enahisic2i0-tx14
  enahisic2i0-tx15

Signed-off-by: dann frazier <dann.frazier@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns/hnae.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 3e62692af011..fa5b30f547f6 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -87,7 +87,7 @@ do { \
 
 #define HNAE_AE_REGISTER 0x1
 
-#define RCB_RING_NAME_LEN 16
+#define RCB_RING_NAME_LEN (IFNAMSIZ + 4)
 
 #define HNAE_LOWEST_LATENCY_COAL_PARAM	30
 #define HNAE_LOW_LATENCY_COAL_PARAM	80

From 12e571693837d6164bda61e316b1944972ee0d97 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 19 Apr 2018 08:30:48 +0300
Subject: [PATCH 253/288] virtio_net: split out ctrl buffer

When sending control commands, virtio net sets up several buffers for
DMA. The buffers are all part of the net device which means it's
actually allocated by kvmalloc so it's in theory (on extreme memory
pressure) possible to get a vmalloc'ed buffer which on some platforms
means we can't DMA there.

Fix up by moving the DMA buffers into a separate structure.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 68 +++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dfe78308044d..9a675250107f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -147,6 +147,17 @@ struct receive_queue {
 	struct xdp_rxq_info xdp_rxq;
 };
 
+/* Control VQ buffers: protected by the rtnl lock */
+struct control_buf {
+	struct virtio_net_ctrl_hdr hdr;
+	virtio_net_ctrl_ack status;
+	struct virtio_net_ctrl_mq mq;
+	u8 promisc;
+	u8 allmulti;
+	u16 vid;
+	u64 offloads;
+};
+
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
@@ -192,14 +203,7 @@ struct virtnet_info {
 	struct hlist_node node;
 	struct hlist_node node_dead;
 
-	/* Control VQ buffers: protected by the rtnl lock */
-	struct virtio_net_ctrl_hdr ctrl_hdr;
-	virtio_net_ctrl_ack ctrl_status;
-	struct virtio_net_ctrl_mq ctrl_mq;
-	u8 ctrl_promisc;
-	u8 ctrl_allmulti;
-	u16 ctrl_vid;
-	u64 ctrl_offloads;
+	struct control_buf *ctrl;
 
 	/* Ethtool settings */
 	u8 duplex;
@@ -1461,25 +1465,25 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 	/* Caller should know better */
 	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
 
-	vi->ctrl_status = ~0;
-	vi->ctrl_hdr.class = class;
-	vi->ctrl_hdr.cmd = cmd;
+	vi->ctrl->status = ~0;
+	vi->ctrl->hdr.class = class;
+	vi->ctrl->hdr.cmd = cmd;
 	/* Add header */
-	sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
+	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
 	sgs[out_num++] = &hdr;
 
 	if (out)
 		sgs[out_num++] = out;
 
 	/* Add return status. */
-	sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
+	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
 	sgs[out_num] = &stat;
 
 	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
 	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
 
 	if (unlikely(!virtqueue_kick(vi->cvq)))
-		return vi->ctrl_status == VIRTIO_NET_OK;
+		return vi->ctrl->status == VIRTIO_NET_OK;
 
 	/* Spin for a response, the kick causes an ioport write, trapping
 	 * into the hypervisor, so the request should be handled immediately.
@@ -1488,7 +1492,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 	       !virtqueue_is_broken(vi->cvq))
 		cpu_relax();
 
-	return vi->ctrl_status == VIRTIO_NET_OK;
+	return vi->ctrl->status == VIRTIO_NET_OK;
 }
 
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
@@ -1600,8 +1604,8 @@ static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
 		return 0;
 
-	vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
-	sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
+	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
+	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
 				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
@@ -1660,22 +1664,22 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
 		return;
 
-	vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
-	vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
+	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
+	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
 
-	sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
+	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
 		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
-			 vi->ctrl_promisc ? "en" : "dis");
+			 vi->ctrl->promisc ? "en" : "dis");
 
-	sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
+	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
 		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
-			 vi->ctrl_allmulti ? "en" : "dis");
+			 vi->ctrl->allmulti ? "en" : "dis");
 
 	uc_count = netdev_uc_count(dev);
 	mc_count = netdev_mc_count(dev);
@@ -1721,8 +1725,8 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg;
 
-	vi->ctrl_vid = vid;
-	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
+	vi->ctrl->vid = vid;
+	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
 				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
@@ -1736,8 +1740,8 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg;
 
-	vi->ctrl_vid = vid;
-	sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
+	vi->ctrl->vid = vid;
+	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
 				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
@@ -2133,9 +2137,9 @@ static int virtnet_restore_up(struct virtio_device *vdev)
 static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
 {
 	struct scatterlist sg;
-	vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads);
+	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
 
-	sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads));
+	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
 				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
@@ -2358,6 +2362,7 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 
 	kfree(vi->rq);
 	kfree(vi->sq);
+	kfree(vi->ctrl);
 }
 
 static void _free_receive_bufs(struct virtnet_info *vi)
@@ -2550,6 +2555,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 {
 	int i;
 
+	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
+	if (!vi->ctrl)
+		goto err_ctrl;
 	vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
 	if (!vi->sq)
 		goto err_sq;
@@ -2578,6 +2586,8 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 err_rq:
 	kfree(vi->sq);
 err_sq:
+	kfree(vi->ctrl);
+err_ctrl:
 	return -ENOMEM;
 }
 

From d7fad4c840f33a6bd333dd7fbb3006edbcf0017a Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 19 Apr 2018 08:30:49 +0300
Subject: [PATCH 254/288] virtio_net: fix adding vids on big-endian

Programming vids (adding or removing them) still passes
guest-endian values in the DMA buffer. That's wrong
if guest is big-endian and when virtio 1 is enabled.

Note: this is on top of a previous patch:
	virtio_net: split out ctrl buffer

Fixes: 9465a7a6f ("virtio_net: enable v1.0 support")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9a675250107f..16b0c7db431b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -154,7 +154,7 @@ struct control_buf {
 	struct virtio_net_ctrl_mq mq;
 	u8 promisc;
 	u8 allmulti;
-	u16 vid;
+	__virtio16 vid;
 	u64 offloads;
 };
 
@@ -1725,7 +1725,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg;
 
-	vi->ctrl->vid = vid;
+	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
 	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
@@ -1740,7 +1740,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg;
 
-	vi->ctrl->vid = vid;
+	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
 	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,

From f4ee703ace847f299da00944d57db7ff91786d0b Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 19 Apr 2018 08:30:50 +0300
Subject: [PATCH 255/288] virtio_net: sparse annotation fix

offloads is a buffer in virtio format, should use
the __virtio64 tag.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 16b0c7db431b..770422e953f7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -155,7 +155,7 @@ struct control_buf {
 	u8 promisc;
 	u8 allmulti;
 	__virtio16 vid;
-	u64 offloads;
+	__virtio64 offloads;
 };
 
 struct virtnet_info {

From a60faa60da891e311e19fd3e88d611863f431130 Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Thu, 19 Apr 2018 03:16:16 -0400
Subject: [PATCH 256/288] bnxt_en: Fix memory fault in bnxt_ethtool_init()

In some firmware images, the length of BNX_DIR_TYPE_PKG_LOG nvram type
could be greater than the fixed buffer length of 4096 bytes allocated by
the driver.  This was causing HWRM_NVM_READ to copy more data to the buffer
than the allocated size, causing general protection fault.

Fix the issue by allocating the exact buffer length returned by
HWRM_NVM_FIND_DIR_ENTRY, instead of 4096.  Move the kzalloc() call
into the bnxt_get_pkgver() function.

Fixes: 3ebf6f0a09a2 ("bnxt_en: Add installed-package firmware version reporting via Ethtool GDRVINFO")
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 49 ++++++++++---------
 .../ethernet/broadcom/bnxt/bnxt_nvm_defs.h    |  2 -
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1f622ca2a64f..8ba14ae00e8f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -1927,22 +1927,39 @@ static char *bnxt_parse_pkglog(int desired_field, u8 *data, size_t datalen)
 	return retval;
 }
 
-static char *bnxt_get_pkgver(struct net_device *dev, char *buf, size_t buflen)
+static void bnxt_get_pkgver(struct net_device *dev)
 {
+	struct bnxt *bp = netdev_priv(dev);
 	u16 index = 0;
-	u32 datalen;
+	char *pkgver;
+	u32 pkglen;
+	u8 *pkgbuf;
+	int len;
 
 	if (bnxt_find_nvram_item(dev, BNX_DIR_TYPE_PKG_LOG,
 				 BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE,
-				 &index, NULL, &datalen) != 0)
-		return NULL;
+				 &index, NULL, &pkglen) != 0)
+		return;
 
-	memset(buf, 0, buflen);
-	if (bnxt_get_nvram_item(dev, index, 0, datalen, buf) != 0)
-		return NULL;
+	pkgbuf = kzalloc(pkglen, GFP_KERNEL);
+	if (!pkgbuf) {
+		dev_err(&bp->pdev->dev, "Unable to allocate memory for pkg version, length = %u\n",
+			pkglen);
+		return;
+	}
 
-	return bnxt_parse_pkglog(BNX_PKG_LOG_FIELD_IDX_PKG_VERSION, buf,
-		datalen);
+	if (bnxt_get_nvram_item(dev, index, 0, pkglen, pkgbuf))
+		goto err;
+
+	pkgver = bnxt_parse_pkglog(BNX_PKG_LOG_FIELD_IDX_PKG_VERSION, pkgbuf,
+				   pkglen);
+	if (pkgver && *pkgver != 0 && isdigit(*pkgver)) {
+		len = strlen(bp->fw_ver_str);
+		snprintf(bp->fw_ver_str + len, FW_VER_STR_LEN - len - 1,
+			 "/pkg %s", pkgver);
+	}
+err:
+	kfree(pkgbuf);
 }
 
 static int bnxt_get_eeprom(struct net_device *dev,
@@ -2615,22 +2632,10 @@ void bnxt_ethtool_init(struct bnxt *bp)
 	struct hwrm_selftest_qlist_input req = {0};
 	struct bnxt_test_info *test_info;
 	struct net_device *dev = bp->dev;
-	char *pkglog;
 	int i, rc;
 
-	pkglog = kzalloc(BNX_PKG_LOG_MAX_LENGTH, GFP_KERNEL);
-	if (pkglog) {
-		char *pkgver;
-		int len;
+	bnxt_get_pkgver(dev);
 
-		pkgver = bnxt_get_pkgver(dev, pkglog, BNX_PKG_LOG_MAX_LENGTH);
-		if (pkgver && *pkgver != 0 && isdigit(*pkgver)) {
-			len = strlen(bp->fw_ver_str);
-			snprintf(bp->fw_ver_str + len, FW_VER_STR_LEN - len - 1,
-				 "/pkg %s", pkgver);
-		}
-		kfree(pkglog);
-	}
 	if (bp->hwrm_spec_code < 0x10704 || !BNXT_SINGLE_PF(bp))
 		return;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h
index 73f2249555b5..83444811d3c6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h
@@ -59,8 +59,6 @@ enum bnxt_nvm_directory_type {
 #define BNX_DIR_ATTR_NO_CHKSUM			(1 << 0)
 #define BNX_DIR_ATTR_PROP_STREAM		(1 << 1)
 
-#define BNX_PKG_LOG_MAX_LENGTH			4096
-
 enum bnxnvm_pkglog_field_index {
 	BNX_PKG_LOG_FIELD_IDX_INSTALLED_TIMESTAMP	= 0,
 	BNX_PKG_LOG_FIELD_IDX_PKG_DESCRIPTION		= 1,

From 1255fcb2a655f05e02f3a74675a6d6525f187afd Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Thu, 19 Apr 2018 15:56:40 +0200
Subject: [PATCH 257/288] net/smc: fix shutdown in state SMC_LISTEN

Calling shutdown with SHUT_RD and SHUT_RDWR for a listening SMC socket
crashes, because
   commit 127f49705823 ("net/smc: release clcsock from tcp_listen_worker")
releases the internal clcsock in smc_close_active() and sets smc->clcsock
to NULL.
For SHUT_RD the smc_close_active() call is removed.
For SHUT_RDWR the kernel_sock_shutdown() call is omitted, since the
clcsock is already released.

Fixes: 127f49705823 ("net/smc: release clcsock from tcp_listen_worker")
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reported-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5f8046c62d90..f5d4b69dbabc 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1259,14 +1259,12 @@ static int smc_shutdown(struct socket *sock, int how)
 		rc = smc_close_shutdown_write(smc);
 		break;
 	case SHUT_RD:
-		if (sk->sk_state == SMC_LISTEN)
-			rc = smc_close_active(smc);
-		else
-			rc = 0;
-			/* nothing more to do because peer is not involved */
+		rc = 0;
+		/* nothing more to do because peer is not involved */
 		break;
 	}
-	rc1 = kernel_sock_shutdown(smc->clcsock, how);
+	if (smc->clcsock)
+		rc1 = kernel_sock_shutdown(smc->clcsock, how);
 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
 	sk->sk_shutdown |= how + 1;
 

From df3f126482dba8e00cdbfc8fc44a05a5a35b1704 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 16 Apr 2018 11:58:16 -0500
Subject: [PATCH 258/288] libnvdimm, of_pmem: use dev_to_node() instead of
 of_node_to_nid()

Remove the direct dependency on of_node_to_nid() by using dev_to_node()
instead. Any DT platform device will have its NUMA node id set when the
device is created.

With this, commit 291717b6fbdb ("libnvdimm, of_pmem: workaround OF_NUMA=n
build error") can be reverted.

Fixes: 717197608952 ("libnvdimm: Add device-tree based driver")
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: linux-nvdimm@lists.01.org
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/of_pmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
index 85013bad35de..0a701837dfc0 100644
--- a/drivers/nvdimm/of_pmem.c
+++ b/drivers/nvdimm/of_pmem.c
@@ -67,7 +67,7 @@ static int of_pmem_region_probe(struct platform_device *pdev)
 		 */
 		memset(&ndr_desc, 0, sizeof(ndr_desc));
 		ndr_desc.attr_groups = region_attr_groups;
-		ndr_desc.numa_node = of_node_to_nid(np);
+		ndr_desc.numa_node = dev_to_node(&pdev->dev);
 		ndr_desc.res = &pdev->resource[i];
 		ndr_desc.of_node = np;
 		set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);

From f22acf82746dd3d75e06d7c5801926d3b9c50169 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 19 Apr 2018 15:07:42 -0700
Subject: [PATCH 259/288] Revert "libnvdimm, of_pmem: workaround OF_NUMA=n
 build error"

With commit df3f126482db ("libnvdimm, of_pmem: use dev_to_node() instead
of of_node_to_nid()") it is now possible to allow of_pmem to be built as
a module as originally implemented.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 85997184e047..9d36473dc2a2 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -103,8 +103,7 @@ config NVDIMM_DAX
 	  Select Y if unsure
 
 config OF_PMEM
-	# FIXME: make tristate once OF_NUMA dependency removed
-	bool "Device-tree support for persistent memory regions"
+	tristate "Device-tree support for persistent memory regions"
 	depends on OF
 	default LIBNVDIMM
 	help

From ef8423022324cf79bd1b41d8707c766461e7e555 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 19 Apr 2018 13:39:43 -0700
Subject: [PATCH 260/288] device-dax: allow MAP_SYNC to succeed

MAP_SYNC is a nop for device-dax. Allow MAP_SYNC to succeed on device-dax
to eliminate special casing between device-dax and fs-dax as to when the
flag can be specified. Device-dax users already implicitly assume that they do
not need to call fsync(), and this enables them to explicitly check for this
capability.

Cc: <stable@vger.kernel.org>
Fixes: b6fb293f2497 ("mm: Define MAP_SYNC and VM_SYNC flags")
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index be8606457f27..aff2c1594220 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -19,6 +19,7 @@
 #include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/mman.h>
 #include "dax-private.h"
 #include "dax.h"
 
@@ -540,6 +541,7 @@ static const struct file_operations dax_fops = {
 	.release = dax_release,
 	.get_unmapped_area = dax_get_unmapped_area,
 	.mmap = dax_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 };
 
 static void dev_dax_release(struct device *dev)

From c5794510d7b5f210f05531ff9e82432cf7244367 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 13 Apr 2018 13:47:40 -0700
Subject: [PATCH 261/288] MAINTAINERS: Add backup maintainers for libnvdimm and
 DAX

Adding additional maintainers to libnvdimm related code and DAX.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..60f89cdc565f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4245,6 +4245,9 @@ F:	include/trace/events/fs_dax.h
 
 DEVICE DIRECT ACCESS (DAX)
 M:	Dan Williams <dan.j.williams@intel.com>
+M:	Dave Jiang <dave.jiang@intel.com>
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+M:	Vishal Verma <vishal.l.verma@intel.com>
 L:	linux-nvdimm@lists.01.org
 S:	Supported
 F:	drivers/dax/
@@ -8048,6 +8051,9 @@ F:	tools/lib/lockdep/
 
 LIBNVDIMM BLK: MMIO-APERTURE DRIVER
 M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+M:	Dan Williams <dan.j.williams@intel.com>
+M:	Vishal Verma <vishal.l.verma@intel.com>
+M:	Dave Jiang <dave.jiang@intel.com>
 L:	linux-nvdimm@lists.01.org
 Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:	Supported
@@ -8056,6 +8062,9 @@ F:	drivers/nvdimm/region_devs.c
 
 LIBNVDIMM BTT: BLOCK TRANSLATION TABLE
 M:	Vishal Verma <vishal.l.verma@intel.com>
+M:	Dan Williams <dan.j.williams@intel.com>
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+M:	Dave Jiang <dave.jiang@intel.com>
 L:	linux-nvdimm@lists.01.org
 Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:	Supported
@@ -8063,6 +8072,9 @@ F:	drivers/nvdimm/btt*
 
 LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER
 M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+M:	Dan Williams <dan.j.williams@intel.com>
+M:	Vishal Verma <vishal.l.verma@intel.com>
+M:	Dave Jiang <dave.jiang@intel.com>
 L:	linux-nvdimm@lists.01.org
 Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:	Supported
@@ -8078,6 +8090,9 @@ F:	Documentation/devicetree/bindings/pmem/pmem-region.txt
 
 LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM
 M:	Dan Williams <dan.j.williams@intel.com>
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+M:	Vishal Verma <vishal.l.verma@intel.com>
+M:	Dave Jiang <dave.jiang@intel.com>
 L:	linux-nvdimm@lists.01.org
 Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git

From 16a34adb9392b2fe4195267475ab5b472e55292c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Apr 2018 22:03:08 -0400
Subject: [PATCH 262/288] Don't leak MNT_INTERNAL away from internal mounts

We want it only for the stuff created by SB_KERNMOUNT mounts, *not* for
their copies.  As it is, creating a deep stack of bindings of /proc/*/ns/*
somewhere in a new namespace and exiting yields a stack overflow.

Cc: stable@kernel.org
Reported-by: Alexander Aring <aring@mojatatu.com>
Bisected-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Tested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Tested-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index e398f32d7541..8634d565b858 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1089,7 +1089,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
+	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
+	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
 	/* Don't allow unprivileged users to change mount flags */
 	if (flag & CL_UNPRIVILEGED) {
 		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

From 05189820da23fc87ee2a7d87c20257f298af27f4 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 19 Apr 2018 20:26:00 +0200
Subject: [PATCH 263/288] x86/power/64: Fix page-table setup for temporary text
 mapping

On a system with 4-level page-tables there is no p4d, so the pud in the pgd
should be mapped. The old code before commit fb43d6cb91ef already did that.

The change from above commit causes an invalid page-table which causes
undefined behavior. In one report it caused triple faults.

Fix it by changing the p4d back to pud.

Fixes: fb43d6cb91ef ('x86/mm: Do not auto-massage page protections')
Reported-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michal Kubecek <mkubecek@suse.cz>
Tested-by: Borislav Petkov <bp@suse.de>
Cc: linux-pm@vger.kernel.org
Cc: rjw@rjwysocki.net
Cc: pavel@ucw.cz
Cc: hpa@zytor.com
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/1524162360-26179-1-git-send-email-joro@8bytes.org
---
 arch/x86/power/hibernate_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 48b14b534897..ccf4a49bb065 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -98,7 +98,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 		set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
 	} else {
 		/* No p4d for 4-level paging: point the pgd to the pud page table */
-		pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot));
+		pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot));
 		set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
 	}
 

From d7717587ac6deae00e0b66c0113a046be2c6fb1c Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Fri, 23 Mar 2018 09:11:29 -0400
Subject: [PATCH 264/288] perf/x86/intel/uncore: Revert "Remove SBOX support
 for Broadwell server"

This reverts commit 3b94a891667c ("perf/x86/intel/uncore: Remove
SBOX support for Broadwell server")

Revert because there exists a proper workaround for Broadwell-EP servers
without SBOX now. Note that BDX-DE does not have a SBOX.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kan Liang <kan.liang@intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: ak@linux.intel.com
Cc: osk@google.com
Cc: mark@voidzero.net
Link: https://lkml.kernel.org/r/1521810690-2576-1-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c98b943e58b4..5bbbbee11879 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3028,10 +3028,27 @@ static struct intel_uncore_type bdx_uncore_cbox = {
 	.format_group		= &hswep_uncore_cbox_format_group,
 };
 
+static struct intel_uncore_type bdx_uncore_sbox = {
+	.name			= "sbox",
+	.num_counters		= 4,
+	.num_boxes		= 4,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_S0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_S0_MSR_PMON_CTR0,
+	.event_mask		= HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_S0_MSR_PMON_BOX_CTL,
+	.msr_offset		= HSWEP_SBOX_MSR_OFFSET,
+	.ops			= &hswep_uncore_sbox_msr_ops,
+	.format_group		= &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX	3
+
 static struct intel_uncore_type *bdx_msr_uncores[] = {
 	&bdx_uncore_ubox,
 	&bdx_uncore_cbox,
 	&hswep_uncore_pcu,
+	&bdx_uncore_sbox,
 	NULL,
 };
 
@@ -3047,6 +3064,10 @@ void bdx_uncore_cpu_init(void)
 		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
 	uncore_msr_uncores = bdx_msr_uncores;
 
+	/* BDX-DE doesn't have SBOX */
+	if (boot_cpu_data.x86_model == 86)
+		uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
 	hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
 }
 

From 15a3e845b01ce2342cf187dc123c92c44c3c8170 Mon Sep 17 00:00:00 2001
From: Oskar Senft <osk@google.com>
Date: Fri, 23 Mar 2018 09:11:30 -0400
Subject: [PATCH 265/288] perf/x86/intel/uncore: Fix SBOX support for Broadwell
 CPUs

SBOX on some Broadwell CPUs is broken because it's enabled unconditionally
despite the fact that there are no SBOXes available.

Check the Power Control Unit CAPID4 register to determine the number of
available SBOXes on the particular CPU before trying to enable them. If
there are none, nullify the SBOX descriptor so it isn't tried to be
initialized.

Signed-off-by: Oskar Senft <osk@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Mark van Dijk <mark@voidzero.net>
Reviewed-by: Kan Liang <kan.liang@intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: eranian@google.com
Link: https://lkml.kernel.org/r/1521810690-2576-2-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 5bbbbee11879..77076a102e34 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3060,14 +3060,25 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
 
 void bdx_uncore_cpu_init(void)
 {
+	int pkg = topology_phys_to_logical_pkg(0);
+
 	if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
 		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
 	uncore_msr_uncores = bdx_msr_uncores;
 
 	/* BDX-DE doesn't have SBOX */
-	if (boot_cpu_data.x86_model == 86)
+	if (boot_cpu_data.x86_model == 86) {
 		uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+	/* Detect systems with no SBOXes */
+	} else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+		struct pci_dev *pdev;
+		u32 capid4;
 
+		pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
+		pci_read_config_dword(pdev, 0x94, &capid4);
+		if (((capid4 >> 6) & 0x3) == 0)
+			bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+	}
 	hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
 }
 
@@ -3285,6 +3296,11 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
 		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
 	},
+	{ /* PCU.3 (for Capability registers) */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   HSWEP_PCI_PCU_3),
+	},
 	{ /* end: all zeroes */ }
 };
 

From 660625922b3d9fcb376e5870299bc5c1086e1d32 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 18 Apr 2018 09:38:34 +0100
Subject: [PATCH 266/288] afs: Fix server record deletion

AFS server records get removed from the net->fs_servers tree when
they're deleted, but not from the net->fs_addresses{4,6} lists, which
can lead to an oops in afs_find_server() when a server record has been
removed, for instance during rmmod.

Fix this by deleting the record from the by-address lists before posting
it for RCU destruction.

The reason this hasn't been noticed before is that the fileserver keeps
probing the local cache manager, thereby keeping the service record
alive, so the oops would only happen when a fileserver eventually gets
bored and stops pinging or if the module gets rmmod'd and a call comes
in from the fileserver during the window between the server records
being destroyed and the socket being closed.

The oops looks something like:

  BUG: unable to handle kernel NULL pointer dereference at 000000000000001c
  ...
  Workqueue: kafsd afs_process_async_call [kafs]
  RIP: 0010:afs_find_server+0x271/0x36f [kafs]
  ...
  Call Trace:
   afs_deliver_cb_init_call_back_state3+0x1f2/0x21f [kafs]
   afs_deliver_to_call+0x1ee/0x5e8 [kafs]
   afs_process_async_call+0x5b/0xd0 [kafs]
   process_one_work+0x2c2/0x504
   worker_thread+0x1d4/0x2ac
   kthread+0x11f/0x127
   ret_from_fork+0x24/0x30

Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/server.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/afs/server.c b/fs/afs/server.c
index e23be63998a8..629c74986cff 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -428,8 +428,15 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
 		}
 		write_sequnlock(&net->fs_lock);
 
-		if (deleted)
+		if (deleted) {
+			write_seqlock(&net->fs_addr_lock);
+			if (!hlist_unhashed(&server->addr4_link))
+				hlist_del_rcu(&server->addr4_link);
+			if (!hlist_unhashed(&server->addr6_link))
+				hlist_del_rcu(&server->addr6_link);
+			write_sequnlock(&net->fs_addr_lock);
 			afs_destroy_server(net, server);
+		}
 	}
 }
 

From a9e5b73288cf1595ac2e05cf1acd1924ceea05fa Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Apr 2018 13:35:02 +0100
Subject: [PATCH 267/288] vfs: Undo an overly zealous MS_RDONLY -> SB_RDONLY
 conversion

In do_mount() when the MS_* flags are being converted to MNT_* flags,
MS_RDONLY got accidentally convered to SB_RDONLY.

Undo this change.

Fixes: e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 8634d565b858..5f75969adff1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2815,7 +2815,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 		mnt_flags |= MNT_NODIRATIME;
 	if (flags & MS_STRICTATIME)
 		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
-	if (flags & SB_RDONLY)
+	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
 	/* The default atime for remount is preservation */

From 5e388e95815408c27f3612190d089afc0774b870 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 18 Apr 2018 09:41:54 +0300
Subject: [PATCH 268/288] btrfs: Fix race condition between delayed refs and
 blockgroup removal

When the delayed refs for a head are all run, eventually
cleanup_ref_head is called which (in case of deletion) obtains a
reference for the relevant btrfs_space_info struct by querying the bg
for the range. This is problematic because when the last extent of a
bg is deleted a race window emerges between removal of that bg and the
subsequent invocation of cleanup_ref_head. This can result in cache being null
and either a null pointer dereference or assertion failure.

	task: ffff8d04d31ed080 task.stack: ffff9e5dc10cc000
	RIP: 0010:assfail.constprop.78+0x18/0x1a [btrfs]
	RSP: 0018:ffff9e5dc10cfbe8 EFLAGS: 00010292
	RAX: 0000000000000044 RBX: 0000000000000000 RCX: 0000000000000000
	RDX: ffff8d04ffc1f868 RSI: ffff8d04ffc178c8 RDI: ffff8d04ffc178c8
	RBP: ffff8d04d29e5ea0 R08: 00000000000001f0 R09: 0000000000000001
	R10: ffff9e5dc0507d58 R11: 0000000000000001 R12: ffff8d04d29e5ea0
	R13: ffff8d04d29e5f08 R14: ffff8d04efe29b40 R15: ffff8d04efe203e0
	FS:  00007fbf58ead500(0000) GS:ffff8d04ffc00000(0000) knlGS:0000000000000000
	CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
	CR2: 00007fe6c6975648 CR3: 0000000013b2a000 CR4: 00000000000006f0
	DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
	DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
	Call Trace:
	 __btrfs_run_delayed_refs+0x10e7/0x12c0 [btrfs]
	 btrfs_run_delayed_refs+0x68/0x250 [btrfs]
	 btrfs_should_end_transaction+0x42/0x60 [btrfs]
	 btrfs_truncate_inode_items+0xaac/0xfc0 [btrfs]
	 btrfs_evict_inode+0x4c6/0x5c0 [btrfs]
	 evict+0xc6/0x190
	 do_unlinkat+0x19c/0x300
	 do_syscall_64+0x74/0x140
	 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
	RIP: 0033:0x7fbf589c57a7

To fix this, introduce a new flag "is_system" to head_ref structs,
which is populated at insertion time. This allows to decouple the
querying for the spaceinfo from querying the possibly deleted bg.

Fixes: d7eae3403f46 ("Btrfs: rework delayed ref total_bytes_pinned accounting")
CC: stable@vger.kernel.org # 4.14+
Suggested-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-ref.c | 19 ++++++++++++++-----
 fs/btrfs/delayed-ref.h |  1 +
 fs/btrfs/extent-tree.c | 16 +++++++++++-----
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9e98295de7ce..e1b0651686f7 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -540,8 +540,10 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_qgroup_extent_record *qrecord,
 		     u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
-		     int action, int is_data, int *qrecord_inserted_ret,
+		     int action, int is_data, int is_system,
+		     int *qrecord_inserted_ret,
 		     int *old_ref_mod, int *new_ref_mod)
+
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -585,6 +587,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->ref_mod = count_mod;
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
+	head_ref->is_system = is_system;
 	head_ref->ref_tree = RB_ROOT;
 	INIT_LIST_HEAD(&head_ref->ref_add_list);
 	RB_CLEAR_NODE(&head_ref->href_node);
@@ -772,6 +775,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 	int qrecord_inserted;
+	int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
 
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -800,8 +804,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
 					bytenr, num_bytes, 0, 0, action, 0,
-					&qrecord_inserted, old_ref_mod,
-					new_ref_mod);
+					is_system, &qrecord_inserted,
+					old_ref_mod, new_ref_mod);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action);
@@ -868,7 +872,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
 					bytenr, num_bytes, ref_root, reserved,
-					action, 1, &qrecord_inserted,
+					action, 1, 0, &qrecord_inserted,
 					old_ref_mod, new_ref_mod);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -898,9 +902,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
+	/*
+	 * extent_ops just modify the flags of an extent and they don't result
+	 * in ref count changes, hence it's safe to pass false/0 for is_system
+	 * argument
+	 */
 	add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
 			     num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
-			     extent_op->is_data, NULL, NULL, NULL);
+			     extent_op->is_data, 0, NULL, NULL, NULL);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 741869dbc316..7f00db50bd24 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -127,6 +127,7 @@ struct btrfs_delayed_ref_head {
 	 */
 	unsigned int must_insert_reserved:1;
 	unsigned int is_data:1;
+	unsigned int is_system:1;
 	unsigned int processing:1;
 };
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 055494ddcace..f99102063366 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2601,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	trace_run_delayed_ref_head(fs_info, head, 0);
 
 	if (head->total_ref_mod < 0) {
-		struct btrfs_block_group_cache *cache;
+		struct btrfs_space_info *space_info;
+		u64 flags;
 
-		cache = btrfs_lookup_block_group(fs_info, head->bytenr);
-		ASSERT(cache);
-		percpu_counter_add(&cache->space_info->total_bytes_pinned,
+		if (head->is_data)
+			flags = BTRFS_BLOCK_GROUP_DATA;
+		else if (head->is_system)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+		space_info = __find_space_info(fs_info, flags);
+		ASSERT(space_info);
+		percpu_counter_add(&space_info->total_bytes_pinned,
 				   -head->num_bytes);
-		btrfs_put_block_group(cache);
 
 		if (head->is_data) {
 			spin_lock(&delayed_refs->lock);

From c0872323746e11fc79344e3738b283a8cda86654 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 11 Apr 2018 17:08:12 +0800
Subject: [PATCH 269/288] btrfs: print-tree: debugging output enhancement

This patch enhances the following things:

- tree block header
  * add generation and owner output for node and leaf
- node pointer generation output
- allow btrfs_print_tree() to not follow nodes
  * just like btrfs-progs

Please note that, although function btrfs_print_tree() is not called by
anyone right now, it's still a pretty useful function to debug kernel.
So that function is still kept for later use.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/print-tree.c | 25 +++++++++++++++----------
 fs/btrfs/print-tree.h |  2 +-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 124276bba8cf..21a831d3d087 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -189,9 +189,10 @@ void btrfs_print_leaf(struct extent_buffer *l)
 	fs_info = l->fs_info;
 	nr = btrfs_header_nritems(l);
 
-	btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d",
-		   btrfs_header_bytenr(l), nr,
-		   btrfs_leaf_free_space(fs_info, l));
+	btrfs_info(fs_info,
+		   "leaf %llu gen %llu total ptrs %d free space %d owner %llu",
+		   btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
+		   btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(i);
 		btrfs_item_key_to_cpu(l, &key, i);
@@ -325,7 +326,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
 	}
 }
 
-void btrfs_print_tree(struct extent_buffer *c)
+void btrfs_print_tree(struct extent_buffer *c, bool follow)
 {
 	struct btrfs_fs_info *fs_info;
 	int i; u32 nr;
@@ -342,15 +343,19 @@ void btrfs_print_tree(struct extent_buffer *c)
 		return;
 	}
 	btrfs_info(fs_info,
-		   "node %llu level %d total ptrs %d free spc %u",
-		   btrfs_header_bytenr(c), level, nr,
-		   (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr);
+		   "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu",
+		   btrfs_header_bytenr(c), level, btrfs_header_generation(c),
+		   nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr,
+		   btrfs_header_owner(c));
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
-		pr_info("\tkey %d (%llu %u %llu) block %llu\n",
+		pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
 		       i, key.objectid, key.type, key.offset,
-		       btrfs_node_blockptr(c, i));
+		       btrfs_node_blockptr(c, i),
+		       btrfs_node_ptr_generation(c, i));
 	}
+	if (!follow)
+		return;
 	for (i = 0; i < nr; i++) {
 		struct btrfs_key first_key;
 		struct extent_buffer *next;
@@ -372,7 +377,7 @@ void btrfs_print_tree(struct extent_buffer *c)
 		if (btrfs_header_level(next) !=
 		       level - 1)
 			BUG();
-		btrfs_print_tree(next);
+		btrfs_print_tree(next, follow);
 		free_extent_buffer(next);
 	}
 }
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 4a98481688f4..e6bb38fd75ad 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -7,6 +7,6 @@
 #define BTRFS_PRINT_TREE_H
 
 void btrfs_print_leaf(struct extent_buffer *l);
-void btrfs_print_tree(struct extent_buffer *c);
+void btrfs_print_tree(struct extent_buffer *c, bool follow);
 
 #endif

From ff30b89e0ab71115cbad6ae10a58bd83fe18b41f Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Tue, 17 Apr 2018 12:17:10 -0700
Subject: [PATCH 270/288] cifs: smbd: Dump SMB packet when configured

When sending through SMB Direct, also dump the packet in SMB send path.

Also fixed a typo in debug message.

Signed-off-by: Long Li <longli@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/smbdirect.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index d611ed0537fd..87817ddcc096 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1028,7 +1028,7 @@ static int smbd_post_send(struct smbd_connection *info,
 	for (i = 0; i < request->num_sge; i++) {
 		log_rdma_send(INFO,
 			"rdma_request sge[%d] addr=%llu length=%u\n",
-			i, request->sge[0].addr, request->sge[0].length);
+			i, request->sge[i].addr, request->sge[i].length);
 		ib_dma_sync_single_for_device(
 			info->id->device,
 			request->sge[i].addr,
@@ -2139,6 +2139,10 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
 		goto done;
 	}
 
+	cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
+	for (i = 0; i < rqst->rq_nvec-1; i++)
+		dump_smb(iov[i].iov_base, iov[i].iov_len);
+
 	remaining_data_length = buflen;
 
 	log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "

From 1d0cffa674cfa7d185a302c8c6850fc50b893bed Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 20 Apr 2018 12:19:07 -0500
Subject: [PATCH 271/288] cifs: do not allow creating sockets except with SMB1
 posix exensions

RHBZ: 1453123

Since at least the 3.10 kernel and likely a lot earlier we have
not been able to create unix domain sockets in a cifs share
when mounted using the SFU mount option (except when mounted
with the cifs unix extensions to Samba e.g.)
Trying to create a socket, for example using the af_unix command from
xfstests will cause :
BUG: unable to handle kernel NULL pointer dereference at 00000000
00000040

Since no one uses or depends on being able to create unix domains sockets
on a cifs share the easiest fix to stop this vulnerability is to simply
not allow creation of any other special files than char or block devices
when sfu is used.

Added update to Ronnie's patch to handle a tcon link leak, and
to address a buf leak noticed by Gustavo and Colin.

Acked-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
CC:  Colin Ian King <colin.king@canonical.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Reported-by: Eryu Guan <eguan@redhat.com>
Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Cc: stable@vger.kernel.org
---
 fs/cifs/dir.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 81ba6e0d88d8..925844343038 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -684,6 +684,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 		goto mknod_out;
 	}
 
+	if (!S_ISCHR(mode) && !S_ISBLK(mode))
+		goto mknod_out;
+
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
 		goto mknod_out;
 
@@ -692,10 +695,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
-		kfree(full_path);
 		rc = -ENOMEM;
-		free_xid(xid);
-		return rc;
+		goto mknod_out;
 	}
 
 	if (backup_cred(cifs_sb))
@@ -742,7 +743,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 		pdev->minor = cpu_to_le64(MINOR(device_number));
 		rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
 							&bytes_written, iov, 1);
-	} /* else if (S_ISFIFO) */
+	}
 	tcon->ses->server->ops->close(xid, tcon, &fid);
 	d_drop(direntry);
 

From 596632de0440baecaccc9d4347329c64661c400f Mon Sep 17 00:00:00 2001
From: Aurelien Aptel <aaptel@suse.com>
Date: Thu, 19 Apr 2018 10:44:20 +0200
Subject: [PATCH 272/288] CIFS: fix typo in cifs_dbg

Signed-off-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Reported-by: Long Li <longli@microsoft.com>
---
 fs/cifs/cifs_debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index fe5567655662..0e74690d11bc 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -54,7 +54,7 @@ do {								\
 		pr_debug_ ## ratefunc("%s: "			\
 				fmt, __FILE__, ##__VA_ARGS__);	\
 	} else if ((type) & VFS) {				\
-		pr_err_ ## ratefunc("CuIFS VFS: "		\
+		pr_err_ ## ratefunc("CIFS VFS: "		\
 				 fmt, ##__VA_ARGS__);		\
 	} else if ((type) & NOISY && (NOISY != 0)) {		\
 		pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__);	\

From e01e80634ecdde1dd113ac43b3adad21b47f3957 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 20 Apr 2018 14:55:31 -0700
Subject: [PATCH 273/288] fork: unconditionally clear stack on fork

One of the classes of kernel stack content leaks[1] is exposing the
contents of prior heap or stack contents when a new process stack is
allocated.  Normally, those stacks are not zeroed, and the old contents
remain in place.  In the face of stack content exposure flaws, those
contents can leak to userspace.

Fixing this will make the kernel no longer vulnerable to these flaws, as
the stack will be wiped each time a stack is assigned to a new process.
There's not a meaningful change in runtime performance; it almost looks
like it provides a benefit.

Performing back-to-back kernel builds before:
	Run times: 157.86 157.09 158.90 160.94 160.80
	Mean: 159.12
	Std Dev: 1.54

and after:
	Run times: 159.31 157.34 156.71 158.15 160.81
	Mean: 158.46
	Std Dev: 1.46

Instead of making this a build or runtime config, Andy Lutomirski
recommended this just be enabled by default.

[1] A noisy search for many kinds of stack content leaks can be seen here:
https://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=linux+kernel+stack+leak

I did some more with perf and cycle counts on running 100,000 execs of
/bin/true.

before:
Cycles: 218858861551 218853036130 214727610969 227656844122 224980542841
Mean:  221015379122.60
Std Dev: 4662486552.47

after:
Cycles: 213868945060 213119275204 211820169456 224426673259 225489986348
Mean:  217745009865.40
Std Dev: 5935559279.99

It continues to look like it's faster, though the deviation is rather
wide, but I'm not sure what I could do that would be less noisy.  I'm
open to ideas!

Link: http://lkml.kernel.org/r/20180221021659.GA37073@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/thread_info.h | 6 +-----
 kernel/fork.c               | 3 +--
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 34f053a150a9..cf2862bd134a 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -43,11 +43,7 @@ enum {
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
 
-#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
-#else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT)
-#endif
+#define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 
 /*
  * flag set/clear/test wrappers
diff --git a/kernel/fork.c b/kernel/fork.c
index 242c8c93d285..a5d21c42acfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
-#ifdef CONFIG_DEBUG_KMEMLEAK
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
-#endif
+
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}

From 8f175cf5c99dc0e3add2aac0ea1cd54e0f9ca87d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 Apr 2018 14:55:35 -0700
Subject: [PATCH 274/288] mm: fix do_pages_move status handling

Li Wang has reported that LTP move_pages04 test fails with the current
tree:

LTP move_pages04:
   TFAIL  :  move_pages04.c:143: status[1] is EPERM, expected EFAULT

The test allocates an array of two pages, one is present while the other
is not (resp.  backed by zero page) and it expects EFAULT for the second
page as the man page suggests.  We are reporting EPERM which doesn't make
any sense and this is a result of a bug from cf5f16b23ec9 ("mm: unclutter
THP migration").

do_pages_move tries to handle as many pages in one batch as possible so we
queue all pages with the same node target together and that corresponds to
[start, i] range which is then used to update status array.
add_page_for_migration will correctly notice the zero (resp.  !present)
page and returns with EFAULT which gets written to the status.  But if
this is the last page in the array we do not update start and so the last
store_status after the loop will overwrite the range of the last batch
with NUMA_NO_NODE (which corresponds to EPERM).

Fix this by simply bailing out from the last flush if the pagelist is
empty as there is clearly nothing more to do.

Link: http://lkml.kernel.org/r/20180418121255.334-1-mhocko@kernel.org
Fixes: cf5f16b23ec9 ("mm: unclutter THP migration")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Li Wang <liwang@redhat.com>
Tested-by: Li Wang <liwang@redhat.com>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/migrate.c b/mm/migrate.c
index f65dd69e1fd1..70ef794cccae 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1622,6 +1622,9 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 		current_node = NUMA_NO_NODE;
 	}
 out_flush:
+	if (list_empty(&pagelist))
+		return err;
+
 	/* Make sure we do not overwrite the existing error */
 	err1 = do_move_pages_to_node(mm, &pagelist, current_node);
 	if (!err1)

From 88c28f2469151b031f8cea9b28ed5be1b74a4172 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 20 Apr 2018 14:55:38 -0700
Subject: [PATCH 275/288] mm, pagemap: fix swap offset value for PMD migration
 entry

The swap offset reported by /proc/<pid>/pagemap may be not correct for
PMD migration entries.  If addr passed into pagemap_pmd_range() isn't
aligned with PMD start address, the swap offset reported doesn't
reflect this.  And in the loop to report information of each sub-page,
the swap offset isn't increased accordingly as that for PFN.

This may happen after opening /proc/<pid>/pagemap and seeking to a page
whose address doesn't align with a PMD start address.  I have verified
this with a simple test program.

BTW: migration swap entries have PFN information, do we need to restrict
whether to show them?

[akpm@linux-foundation.org: fix typo, per Huang, Ying]
Link: http://lkml.kernel.org/r/20180408033737.10897-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrei Vagin <avagin@openvz.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Jerome Glisse" <jglisse@redhat.com>
Cc: Daniel Colascione <dancol@google.com>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 65ae54659833..c486ad4b43f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1310,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 		else if (is_swap_pmd(pmd)) {
 			swp_entry_t entry = pmd_to_swp_entry(pmd);
+			unsigned long offset = swp_offset(entry);
 
+			offset += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 			frame = swp_type(entry) |
-				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+				(offset << MAX_SWAPFILES_SHIFT);
 			flags |= PM_SWAP;
 			if (pmd_swp_soft_dirty(pmd))
 				flags |= PM_SOFT_DIRTY;
@@ -1332,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 				break;
 			if (pm->show_pfn && (flags & PM_PRESENT))
 				frame++;
+			else if (flags & PM_SWAP)
+				frame += (1 << MAX_SWAPFILES_SHIFT);
 		}
 		spin_unlock(ptl);
 		return err;

From 2e898e4c0a3897ccd434adac5abb8330194f527b Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 20 Apr 2018 14:55:42 -0700
Subject: [PATCH 276/288] writeback: safer lock nesting

lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if
the page's memcg is undergoing move accounting, which occurs when a
process leaves its memcg for a new one that has
memory.move_charge_at_immigrate set.

unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if
the given inode is switching writeback domains.  Switches occur when
enough writes are issued from a new domain.

This existing pattern is thus suspicious:
    lock_page_memcg(page);
    unlocked_inode_to_wb_begin(inode, &locked);
    ...
    unlocked_inode_to_wb_end(inode, locked);
    unlock_page_memcg(page);

If both inode switch and process memcg migration are both in-flight then
unlocked_inode_to_wb_end() will unconditionally enable interrupts while
still holding the lock_page_memcg() irq spinlock.  This suggests the
possibility of deadlock if an interrupt occurs before unlock_page_memcg().

    truncate
    __cancel_dirty_page
    lock_page_memcg
    unlocked_inode_to_wb_begin
    unlocked_inode_to_wb_end
    <interrupts mistakenly enabled>
                                    <interrupt>
                                    end_page_writeback
                                    test_clear_page_writeback
                                    lock_page_memcg
                                    <deadlock>
    unlock_page_memcg

Due to configuration limitations this deadlock is not currently possible
because we don't mix cgroup writeback (a cgroupv2 feature) and
memory.move_charge_at_immigrate (a cgroupv1 feature).

If the kernel is hacked to always claim inode switching and memcg
moving_account, then this script triggers lockup in less than a minute:

  cd /mnt/cgroup/memory
  mkdir a b
  echo 1 > a/memory.move_charge_at_immigrate
  echo 1 > b/memory.move_charge_at_immigrate
  (
    echo $BASHPID > a/cgroup.procs
    while true; do
      dd if=/dev/zero of=/mnt/big bs=1M count=256
    done
  ) &
  while true; do
    sync
  done &
  sleep 1h &
  SLEEP=$!
  while true; do
    echo $SLEEP > a/cgroup.procs
    echo $SLEEP > b/cgroup.procs
  done

The deadlock does not seem possible, so it's debatable if there's any
reason to modify the kernel.  I suggest we should to prevent future
surprises.  And Wang Long said "this deadlock occurs three times in our
environment", so there's more reason to apply this, even to stable.
Stable 4.4 has minor conflicts applying this patch.  For a clean 4.4 patch
see "[PATCH for-4.4] writeback: safer lock nesting"
https://lkml.org/lkml/2018/4/11/146

Wang Long said "this deadlock occurs three times in our environment"

[gthelen@google.com: v4]
  Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com
[akpm@linux-foundation.org: comment tweaks, struct initialization simplification]
Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613
Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com
Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates")
Signed-off-by: Greg Thelen <gthelen@google.com>
Reported-by: Wang Long <wanglong19@meituan.com>
Acked-by: Wang Long <wanglong19@meituan.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: <stable@vger.kernel.org>	[v4.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                |  7 ++++---
 include/linux/backing-dev-defs.h |  5 +++++
 include/linux/backing-dev.h      | 30 ++++++++++++++++--------------
 mm/page-writeback.c              | 18 +++++++++---------
 4 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b12ba70a895..47d7c151fcba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits)
 	 */
 	if (inode && inode_to_wb_is_valid(inode)) {
 		struct bdi_writeback *wb;
-		bool locked, congested;
+		struct wb_lock_cookie lock_cookie = {};
+		bool congested;
 
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
 		congested = wb_congested(wb, cong_bits);
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &lock_cookie);
 		return congested;
 	}
 
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index bfe86b54f6c1..0bd432a4d7bd 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	set_wb_congested(bdi->wb.congested, sync);
 }
 
+struct wb_lock_cookie {
+	bool locked;
+	unsigned long flags;
+};
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 /**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f6be4b0b6c18..72ca0f3d39f3 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
 /**
  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
  * @inode: target inode
- * @lockedp: temp bool output param, to be passed to the end function
+ * @cookie: output param, to be passed to the end function
  *
  * The caller wants to access the wb associated with @inode but isn't
  * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
@@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
  * association doesn't change until the transaction is finished with
  * unlocked_inode_to_wb_end().
  *
- * The caller must call unlocked_inode_to_wb_end() with *@lockdep
- * afterwards and can't sleep during transaction.  IRQ may or may not be
- * disabled on return.
+ * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
+ * can't sleep during the transaction.  IRQs may or may not be disabled on
+ * return.
  */
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
 	rcu_read_lock();
 
@@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
 	 * Paired with store_release in inode_switch_wb_work_fn() and
 	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
 	 */
-	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
 
-	if (unlikely(*lockedp))
-		xa_lock_irq(&inode->i_mapping->i_pages);
+	if (unlikely(cookie->locked))
+		xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);
 
 	/*
 	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
@@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
 /**
  * unlocked_inode_to_wb_end - end inode wb access transaction
  * @inode: target inode
- * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ * @cookie: @cookie from unlocked_inode_to_wb_begin()
  */
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+					    struct wb_lock_cookie *cookie)
 {
-	if (unlikely(locked))
-		xa_unlock_irq(&inode->i_mapping->i_pages);
+	if (unlikely(cookie->locked))
+		xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);
 
 	rcu_read_unlock();
 }
@@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 }
 
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
 	return inode_to_wb(inode);
 }
 
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+					    struct wb_lock_cookie *cookie)
 {
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5c1a3279e63f..337c6afb3345 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 		current->nr_dirtied--;
 		dec_node_page_state(page, NR_DIRTIED);
 		dec_wb_stat(wb, WB_DIRTIED);
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page)
 	if (mapping_cap_account_dirty(mapping)) {
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
 		lock_page_memcg(page);
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
 		if (TestClearPageDirty(page))
 			account_page_cleaned(page, mapping, wb);
 
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 		unlock_page_memcg(page);
 	} else {
 		ClearPageDirty(page);
@@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
 		/*
 		 * Yes, Virginia, this is indeed insane.
@@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page)
 		 * always locked coming in here, so we get the desired
 		 * exclusion.
 		 */
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 		if (TestClearPageDirty(page)) {
 			dec_lruvec_page_state(page, NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 			dec_wb_stat(wb, WB_RECLAIMABLE);
 			ret = 1;
 		}
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 		return ret;
 	}
 	return TestClearPageDirty(page);

From e71769ae52609ea0044a9901709042e5634c2306 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 20 Apr 2018 14:55:45 -0700
Subject: [PATCH 277/288] mm: enable thp migration for shmem thp

My testing for the latest kernel supporting thp migration showed an
infinite loop in offlining the memory block that is filled with shmem
thps.  We can get out of the loop with a signal, but kernel should return
with failure in this case.

What happens in the loop is that scan_movable_pages() repeats returning
the same pfn without any progress.  That's because page migration always
fails for shmem thps.

In memory offline code, memory blocks containing unmovable pages should be
prevented from being offline targets by has_unmovable_pages() inside
start_isolate_page_range().  So it's possible to change migratability for
non-anonymous thps to avoid the issue, but it introduces more complex and
thp-specific handling in migration code, so it might not good.

So this patch is suggesting to fix the issue by enabling thp migration for
shmem thp.  Both of anon/shmem thp are migratable so we don't need
precheck about the type of thps.

Link: http://lkml.kernel.org/r/20180406030706.GA2434@hori1.linux.bs1.fc.nec.co.jp
Fixes: commit 72b39cfc4d75 ("mm, memory_hotplug: do not fail offlining too early")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Zi Yan <zi.yan@sent.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c |  5 ++++-
 mm/migrate.c     | 19 ++++++++++++++++---
 mm/rmap.c        |  3 ---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 14ed6ee5e02f..a3a1815f8e11 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2925,7 +2925,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 		pmde = maybe_pmd_mkwrite(pmde, vma);
 
 	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
-	page_add_anon_rmap(new, vma, mmun_start, true);
+	if (PageAnon(new))
+		page_add_anon_rmap(new, vma, mmun_start, true);
+	else
+		page_add_file_rmap(new, true);
 	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
 	if (vma->vm_flags & VM_LOCKED)
 		mlock_vma_page(new);
diff --git a/mm/migrate.c b/mm/migrate.c
index 70ef794cccae..568433023831 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->i_pages,
  					page_index(page));
 
-	expected_count += 1 + page_has_private(page);
+	expected_count += hpage_nr_pages(page) + page_has_private(page);
 	if (page_count(page) != expected_count ||
 		radix_tree_deref_slot_protected(pslot,
 					&mapping->i_pages.xa_lock) != page) {
@@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	newpage->index = page->index;
 	newpage->mapping = page->mapping;
-	get_page(newpage);	/* add cache reference */
+	page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
 	if (PageSwapBacked(page)) {
 		__SetPageSwapBacked(newpage);
 		if (PageSwapCache(page)) {
@@ -524,13 +524,26 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	}
 
 	radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+	if (PageTransHuge(page)) {
+		int i;
+		int index = page_index(page);
+
+		for (i = 0; i < HPAGE_PMD_NR; i++) {
+			pslot = radix_tree_lookup_slot(&mapping->i_pages,
+						       index + i);
+			radix_tree_replace_slot(&mapping->i_pages, pslot,
+						newpage + i);
+		}
+	} else {
+		radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+	}
 
 	/*
 	 * Drop cache reference from old page by unfreezing
 	 * to one less reference.
 	 * We know this isn't the last reference.
 	 */
-	page_ref_unfreeze(page, expected_count - 1);
+	page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
 
 	xa_unlock(&mapping->i_pages);
 	/* Leave irq disabled to prevent preemption while updating stats */
diff --git a/mm/rmap.c b/mm/rmap.c
index f0dd4e4565bc..8d5337fed37b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
 			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
 
-			if (!PageAnon(page))
-				continue;
-
 			set_pmd_migration_entry(&pvmw, page);
 			continue;
 		}

From c5157b76869ba98c3a99a1982396437464e131a6 Mon Sep 17 00:00:00 2001
From: Ioan Nicu <ioan.nicu.ext@nokia.com>
Date: Fri, 20 Apr 2018 14:55:49 -0700
Subject: [PATCH 278/288] rapidio: fix rio_dma_transfer error handling

Some of the mport_dma_req structure members were initialized late
inside the do_dma_request() function, just before submitting the
request to the dma engine. But we have some error branches before
that. In case of such an error, the code would return on the error
path and trigger the calling of dma_req_free() with a req structure
which is not completely initialized. This causes a NULL pointer
dereference in dma_req_free().

This patch fixes these error branches by making sure that all
necessary mport_dma_req structure members are initialized in
rio_dma_transfer() immediately after the request structure gets
allocated.

Link: http://lkml.kernel.org/r/20180412150605.GA31409@nokia.com
Fixes: bbd876adb8c72 ("rapidio: use a reference count for struct mport_dma_req")
Signed-off-by: Ioan Nicu <ioan.nicu.ext@nokia.com>
Tested-by: Alexander Sverdlin <alexander.sverdlin@nokia.com>
Acked-by: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Barry Wood <barry.wood@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Frank Kunz <frank.kunz@nokia.com>
Cc: <stable@vger.kernel.org>	[4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/devices/rio_mport_cdev.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 9d27016c899e..0434ab7b6497 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -740,10 +740,7 @@ static int do_dma_request(struct mport_dma_req *req,
 	tx->callback = dma_xfer_callback;
 	tx->callback_param = req;
 
-	req->dmach = chan;
-	req->sync = sync;
 	req->status = DMA_IN_PROGRESS;
-	init_completion(&req->req_comp);
 	kref_get(&req->refcount);
 
 	cookie = dmaengine_submit(tx);
@@ -831,13 +828,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
 	if (!req)
 		return -ENOMEM;
 
-	kref_init(&req->refcount);
-
 	ret = get_dma_channel(priv);
 	if (ret) {
 		kfree(req);
 		return ret;
 	}
+	chan = priv->dmach;
+
+	kref_init(&req->refcount);
+	init_completion(&req->req_comp);
+	req->dir = dir;
+	req->filp = filp;
+	req->priv = priv;
+	req->dmach = chan;
+	req->sync = sync;
 
 	/*
 	 * If parameter loc_addr != NULL, we are transferring data from/to
@@ -925,11 +929,6 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
 				xfer->offset, xfer->length);
 	}
 
-	req->dir = dir;
-	req->filp = filp;
-	req->priv = priv;
-	chan = priv->dmach;
-
 	nents = dma_map_sg(chan->device->dev,
 			   req->sgt.sgl, req->sgt.nents, dir);
 	if (nents == 0) {

From 12c8f25a016dff69ee284aa3338bebfd2cfcba33 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Fri, 20 Apr 2018 14:55:52 -0700
Subject: [PATCH 279/288] kasan: add no_sanitize attribute for clang builds

KASAN uses the __no_sanitize_address macro to disable instrumentation of
particular functions.  Right now it's defined only for GCC build, which
causes false positives when clang is used.

This patch adds a definition for clang.

Note, that clang's revision 329612 or higher is required.

[andreyknvl@google.com: remove redundant #ifdef CONFIG_KASAN check]
  Link: http://lkml.kernel.org/r/c79aa31a2a2790f6131ed607c58b0dd45dd62a6c.1523967959.git.andreyknvl@google.com
Link: http://lkml.kernel.org/r/4ad725cc903f8534f8c8a60f0daade5e3d674f8d.1523554166.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Paul Lawrence <paullawrence@google.com>
Cc: Sandipan Das <sandipan@linux.vnet.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index ceb96ecab96e..7d98e263e048 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -25,6 +25,9 @@
 #define __SANITIZE_ADDRESS__
 #endif
 
+#undef __no_sanitize_address
+#define __no_sanitize_address __attribute__((no_sanitize("address")))
+
 /* Clang doesn't have a way to turn it off per-function, yet. */
 #ifdef __noretpoline
 #undef __noretpoline

From 1551cf740cffc62221a4f13da1892f15b0d36f1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 Apr 2018 14:55:56 -0700
Subject: [PATCH 280/288] MAINTAINERS: add personal addresses for Sascha and
 Uwe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The idea behind using kernel@pengutronix.de (i.e. the mail alias for the
kernel people at Pengutronix) as email address was to have a backup when
a given developer is on vacation or run over by a bus. Make this more
explicit by adding the alias as reviewer and use the personal address
for Sascha and me.

Link: http://lkml.kernel.org/r/20180413083312.11213-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 5ae51d05c09b..b7ff5654b8b5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1373,7 +1373,8 @@ F:	arch/arm/mach-ebsa110/
 F:	drivers/net/ethernet/amd/am79c961a.*
 
 ARM/ENERGY MICRO (SILICON LABS) EFM32 SUPPORT
-M:	Uwe Kleine-König <kernel@pengutronix.de>
+M:	Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+R:	Pengutronix Kernel Team <kernel@pengutronix.de>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 N:	efm32
@@ -1401,7 +1402,8 @@ F:	arch/arm/mach-footbridge/
 
 ARM/FREESCALE IMX / MXC ARM ARCHITECTURE
 M:	Shawn Guo <shawnguo@kernel.org>
-M:	Sascha Hauer <kernel@pengutronix.de>
+M:	Sascha Hauer <s.hauer@pengutronix.de>
+R:	Pengutronix Kernel Team <kernel@pengutronix.de>
 R:	Fabio Estevam <fabio.estevam@nxp.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
@@ -1416,7 +1418,8 @@ F:	include/soc/imx/
 
 ARM/FREESCALE VYBRID ARM ARCHITECTURE
 M:	Shawn Guo <shawnguo@kernel.org>
-M:	Sascha Hauer <kernel@pengutronix.de>
+M:	Sascha Hauer <s.hauer@pengutronix.de>
+R:	Pengutronix Kernel Team <kernel@pengutronix.de>
 R:	Stefan Agner <stefan@agner.ch>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
@@ -5652,7 +5655,8 @@ F:	drivers/net/ethernet/freescale/fec.h
 F:	Documentation/devicetree/bindings/net/fsl-fec.txt
 
 FREESCALE IMX / MXC FRAMEBUFFER DRIVER
-M:	Sascha Hauer <kernel@pengutronix.de>
+M:	Sascha Hauer <s.hauer@pengutronix.de>
+R:	Pengutronix Kernel Team <kernel@pengutronix.de>
 L:	linux-fbdev@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
@@ -12825,7 +12829,8 @@ F:	include/linux/siphash.h
 
 SIOX
 M:	Gavin Schenk <g.schenk@eckelmann.de>
-M:	Uwe Kleine-König <kernel@pengutronix.de>
+M:	Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+R:	Pengutronix Kernel Team <kernel@pengutronix.de>
 S:	Supported
 F:	drivers/siox/*
 F:	include/trace/events/siox.h

From 1e6306652ba18723015d1b4967fe9de55f042499 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 20 Apr 2018 14:55:59 -0700
Subject: [PATCH 281/288] autofs: mount point create should honour passed in
 mode

The autofs file system mkdir inode operation blindly sets the created
directory mode to S_IFDIR | 0555, ingoring the passed in mode, which can
cause selinux dac_override denials.

But the function also checks if the caller is the daemon (as no-one else
should be able to do anything here) so there's no point in not honouring
the passed in mode, allowing the daemon to set appropriate mode when
required.

Link: http://lkml.kernel.org/r/152361593601.8051.14014139124905996173.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/root.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 82e8f6edfb48..b12e37f27530 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir,
 
 	autofs4_del_active(dentry);
 
-	inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
+	inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode);
 	if (!inode)
 		return -ENOMEM;
 	d_add(dentry, inode);

From 2e0ad552f5f8cd0fda02bc45fcd2b89821c62fd1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 20 Apr 2018 14:56:03 -0700
Subject: [PATCH 282/288] proc: revalidate kernel thread inodes to root:root

task_dump_owner() has the following code:

	mm = task->mm;
	if (mm) {
		if (get_dumpable(mm) != SUID_DUMP_USER) {
			uid = ...
		}
	}

Check for ->mm is buggy -- kernel thread might be borrowing mm
and inode will go to some random uid:gid pair.

Link: http://lkml.kernel.org/r/20180412220109.GA20978@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index eafa39a3a88c..1b2ede6abcdf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1693,6 +1693,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	kuid_t uid;
 	kgid_t gid;
 
+	if (unlikely(task->flags & PF_KTHREAD)) {
+		*ruid = GLOBAL_ROOT_UID;
+		*rgid = GLOBAL_ROOT_GID;
+		return;
+	}
+
 	/* Default to the tasks effective ownership */
 	rcu_read_lock();
 	cred = __task_cred(task);

From 9a1015b32faa7cebfe19663c886b0cfe90be1d49 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 20 Apr 2018 14:56:06 -0700
Subject: [PATCH 283/288] proc: fix /proc/loadavg regression

Commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR
API") changed last field of /proc/loadavg (last pid allocated) to be off
by one:

	# unshare -p -f --mount-proc cat /proc/loadavg
	0.00 0.00 0.00 1/60 2	<===

It should be 1 after first fork into pid namespace.

This is formally a regression but given how useless this field is I
don't think anyone is affected.

Bug was found by /proc testsuite!

Link: http://lkml.kernel.org/r/20180413175408.GA27246@avx2
Fixes: 95846ecf9dac508 ("pid: replace pid bitmap implementation with IDR API")
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Gargi Sharma <gs051095@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
 fs/proc/loadavg.c                         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9033c8194eda..ccc421503363 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		idr_get_cursor(&task_active_pid_ns(current)->idr));
+		idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
 	return 0;
 }
 
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index a000d7547479..b572cc865b92 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
-		idr_get_cursor(&task_active_pid_ns(current)->idr));
+		idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
 	return 0;
 }
 

From a841aa83dff0af75c88aa846ba610a8af4c5ee21 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 20 Apr 2018 14:56:10 -0700
Subject: [PATCH 284/288] kexec_file: do not add extra alignment to efi memmap

Chun-Yi reported a kernel warning message below:

  WARNING: CPU: 0 PID: 0 at ../mm/early_ioremap.c:182 early_iounmap+0x4f/0x12c()
  early_iounmap(ffffffffff200180, 00000118) [0] size not consistent 00000120

The problem is x86 kexec_file_load adds extra alignment to the efi
memmap: in bzImage64_load():

        efi_map_sz = efi_get_runtime_map_size();
        efi_map_sz = ALIGN(efi_map_sz, 16);

And __efi_memmap_init maps with the size including the alignment bytes
but efi_memmap_unmap use nr_maps * desc_size which does not include the
extra bytes.

The alignment in kexec code is only needed for the kexec buffer internal
use Actually kexec should pass exact size of the efi memmap to 2nd
kernel.

Link: http://lkml.kernel.org/r/20180417083600.GA1972@dhcp-128-65.nay.redhat.com
Signed-off-by: Dave Young <dyoung@redhat.com>
Reported-by: joeyli <jlee@suse.com>
Tested-by: Randy Wright <rwright@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/kexec-bzimage64.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3182908b7e6c..7326078eaa7a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -398,11 +398,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	 * little bit simple
 	 */
 	efi_map_sz = efi_get_runtime_map_size();
-	efi_map_sz = ALIGN(efi_map_sz, 16);
 	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
 				MAX_ELFCOREHDR_STR_LEN;
 	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
-	kbuf.bufsz = params_cmdline_sz + efi_map_sz +
+	kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
 				sizeof(struct setup_data) +
 				sizeof(struct efi_setup_data);
 
@@ -410,7 +409,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	if (!params)
 		return ERR_PTR(-ENOMEM);
 	efi_map_offset = params_cmdline_sz;
-	efi_setup_data_offset = efi_map_offset + efi_map_sz;
+	efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
 
 	/* Copy setup header onto bootparams. Documentation/x86/boot.txt */
 	setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;

From d23a61ee90af38bb4a65decf76e798e65b401482 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 20 Apr 2018 14:56:13 -0700
Subject: [PATCH 285/288] fs, elf: don't complain MAP_FIXED_NOREPLACE unless
 -EEXIST error

Commit 4ed28639519c ("fs, elf: drop MAP_FIXED usage from elf_map") is
printing spurious messages under memory pressure due to map_addr == -ENOMEM.

 9794 (a.out): Uhuuh, elf segment at 00007f2e34738000(fffffffffffffff4) requested but the memory is mapped already
 14104 (a.out): Uhuuh, elf segment at 00007f34fd76c000(fffffffffffffff4) requested but the memory is mapped already
 16843 (a.out): Uhuuh, elf segment at 00007f930ecc7000(fffffffffffffff4) requested but the memory is mapped already

Complain only if -EEXIST, and use %px for printing the address.

Link: http://lkml.kernel.org/r/201804182307.FAC17665.SFMOFJVFtHOLOQ@I-love.SAKURA.ne.jp
Fixes: 4ed28639519c7bad ("fs, elf: drop MAP_FIXED usage from elf_map") is
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrei Vagin <avagin@openvz.org>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 41e04183e4ce..4ad6f669fe34 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -377,10 +377,10 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	} else
 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
 
-	if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr))
-		pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n",
-				task_pid_nr(current), current->comm,
-				(void *)addr);
+	if ((type & MAP_FIXED_NOREPLACE) &&
+	    PTR_ERR((void *)map_addr) == -EEXIST)
+		pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n",
+			task_pid_nr(current), current->comm, (void *)addr);
 
 	return(map_addr);
 }

From c892fd82cc0632d425ae011a4dd75eb59e9f84ee Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 Apr 2018 14:56:17 -0700
Subject: [PATCH 286/288] mm: memcg: add __GFP_NOWARN in
 __memcg_schedule_kmem_cache_create()

If there is heavy memory pressure, page allocation with __GFP_NOWAIT
fails easily although it's order-0 request.  I got below warning 9 times
for normal boot.

     <snip >: page allocation failure: order:0, mode:0x2200000(GFP_NOWAIT|__GFP_NOTRACK)
     .. snip ..
     Call trace:
       dump_backtrace+0x0/0x4
       dump_stack+0xa4/0xc0
       warn_alloc+0xd4/0x15c
       __alloc_pages_nodemask+0xf88/0x10fc
       alloc_slab_page+0x40/0x18c
       new_slab+0x2b8/0x2e0
       ___slab_alloc+0x25c/0x464
       __kmalloc+0x394/0x498
       memcg_kmem_get_cache+0x114/0x2b8
       kmem_cache_alloc+0x98/0x3e8
       mmap_region+0x3bc/0x8c0
       do_mmap+0x40c/0x43c
       vm_mmap_pgoff+0x15c/0x1e4
       sys_mmap+0xb0/0xc8
       el0_svc_naked+0x24/0x28
     Mem-Info:
     active_anon:17124 inactive_anon:193 isolated_anon:0
      active_file:7898 inactive_file:712955 isolated_file:55
      unevictable:0 dirty:27 writeback:18 unstable:0
      slab_reclaimable:12250 slab_unreclaimable:23334
      mapped:19310 shmem:212 pagetables:816 bounce:0
      free:36561 free_pcp:1205 free_cma:35615
     Node 0 active_anon:68496kB inactive_anon:772kB active_file:31592kB inactive_file:2851820kB unevictable:0kB isolated(anon):0kB isolated(file):220kB mapped:77240kB dirty:108kB writeback:72kB shmem:848kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
     DMA free:142188kB min:3056kB low:3820kB high:4584kB active_anon:10052kB inactive_anon:12kB active_file:312kB inactive_file:1412620kB unevictable:0kB writepending:0kB present:1781412kB managed:1604728kB mlocked:0kB slab_reclaimable:3592kB slab_unreclaimable:876kB kernel_stack:400kB pagetables:52kB bounce:0kB free_pcp:1436kB local_pcp:124kB free_cma:142492kB
     lowmem_reserve[]: 0 1842 1842
     Normal free:4056kB min:4172kB low:5212kB high:6252kB active_anon:58376kB inactive_anon:760kB active_file:31348kB inactive_file:1439040kB unevictable:0kB writepending:180kB present:2000636kB managed:1923688kB mlocked:0kB slab_reclaimable:45408kB slab_unreclaimable:92460kB kernel_stack:9680kB pagetables:3212kB bounce:0kB free_pcp:3392kB local_pcp:688kB free_cma:0kB
     lowmem_reserve[]: 0 0 0
     DMA: 0*4kB 0*8kB 1*16kB (C) 0*32kB 0*64kB 0*128kB 1*256kB (C) 1*512kB (C) 0*1024kB 1*2048kB (C) 34*4096kB (C) = 142096kB
     Normal: 228*4kB (UMEH) 172*8kB (UMH) 23*16kB (UH) 24*32kB (H) 5*64kB (H) 1*128kB (H) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3872kB
     721350 total pagecache pages
     0 pages in swap cache
     Swap cache stats: add 0, delete 0, find 0/0
     Free swap  = 0kB
     Total swap = 0kB
     945512 pages RAM
     0 pages HighMem/MovableOnly
     63408 pages reserved
     51200 pages cma reserved

__memcg_schedule_kmem_cache_create() tries to create a shadow slab cache
and the worker allocation failure is not really critical because we will
retry on the next kmem charge.  We might miss some charges but that
shouldn't be critical.  The excessive allocation failure report is not
very helpful.

[mhocko@kernel.org: changelog update]
Link: http://lkml.kernel.org/r/20180418022912.248417-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e074f7c637aa..2bd3df3d101a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2192,7 +2192,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
-	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 

From abc1be13fd113ddef5e2d807a466286b864caed3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Fri, 20 Apr 2018 14:56:20 -0700
Subject: [PATCH 287/288] mm/filemap.c: fix NULL pointer in
 page_cache_tree_insert()

f2fs specifies the __GFP_ZERO flag for allocating some of its pages.
Unfortunately, the page cache also uses the mapping's GFP flags for
allocating radix tree nodes.  It always masked off the __GFP_HIGHMEM
flag, and masks off __GFP_ZERO in some paths, but not all.  That causes
radix tree nodes to be allocated with a NULL list_head, which causes
backtraces like:

  __list_del_entry+0x30/0xd0
  list_lru_del+0xac/0x1ac
  page_cache_tree_insert+0xd8/0x110

The __GFP_DMA and __GFP_DMA32 flags would also be able to sneak through
if they are ever used.  Fix them all by using GFP_RECLAIM_MASK at the
innermost location, and remove it from earlier in the callchain.

Link: http://lkml.kernel.org/r/20180411060320.14458-2-willy@infradead.org
Fixes: 449dd6984d0e ("mm: keep page cache radix tree nodes in check")
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reported-by: Chris Fries <cfries@google.com>
Debugged-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 9276bdb2343c..0604cb02e6f3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -786,7 +786,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	VM_BUG_ON_PAGE(!PageLocked(new), new);
 	VM_BUG_ON_PAGE(new->mapping, new);
 
-	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
@@ -842,7 +842,7 @@ static int __add_to_page_cache_locked(struct page *page,
 			return error;
 	}
 
-	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
+	error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
 	if (error) {
 		if (!huge)
 			mem_cgroup_cancel_charge(page, memcg, false);
@@ -1585,8 +1585,7 @@ no_page:
 		if (fgp_flags & FGP_ACCESSED)
 			__SetPageReferenced(page);
 
-		err = add_to_page_cache_lru(page, mapping, offset,
-				gfp_mask & GFP_RECLAIM_MASK);
+		err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
 		if (unlikely(err)) {
 			put_page(page);
 			page = NULL;
@@ -2387,7 +2386,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 		if (!page)
 			return -ENOMEM;
 
-		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
+		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
 		if (ret == 0)
 			ret = mapping->a_ops->readpage(file, page);
 		else if (ret == -EEXIST)

From 6d08b06e67cd117f6992c46611dfb4ce267cd71e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 22 Apr 2018 19:20:09 -0700
Subject: [PATCH 288/288] Linux 4.17-rc2

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e811e0c509c5..83b6c541565a 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 17
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc2
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*