From 916cda1aa1b412d7cf2991c3af7479544942d121 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 26 Jan 2016 14:10:34 +0100
Subject: [PATCH 01/74] s390: add a system call for guarded storage

This adds a new system call to enable the use of guarded storage for
user space processes. The system call takes two arguments, a command
and pointer to a guarded storage control block:

    s390_guarded_storage(int command, struct gs_cb *gs_cb);

The second argument is relevant only for the GS_SET_BC_CB command.

The commands in detail:

0 - GS_ENABLE
    Enable the guarded storage facility for the current task. The
    initial content of the guarded storage control block will be
    all zeros. After the enablement the user space code can use
    load-guarded-storage-controls instruction (LGSC) to load an
    arbitrary control block. While a task is enabled the kernel
    will save and restore the current content of the guarded
    storage registers on context switch.
1 - GS_DISABLE
    Disables the use of the guarded storage facility for the current
    task. The kernel will cease to save and restore the content of
    the guarded storage registers, the task specific content of
    these registers is lost.
2 - GS_SET_BC_CB
    Set a broadcast guarded storage control block. This is called
    per thread and stores a specific guarded storage control block
    in the task struct of the current task. This control block will
    be used for the broadcast event GS_BROADCAST.
3 - GS_CLEAR_BC_CB
    Clears the broadcast guarded storage control block. The guarded-
    storage control block is removed from the task struct that was
    established by GS_SET_BC_CB.
4 - GS_BROADCAST
    Sends a broadcast to all thread siblings of the current task.
    Every sibling that has established a broadcast guarded storage
    control block will load this control block and will be enabled
    for guarded storage. The broadcast guarded storage control block
    is used up, a second broadcast without a refresh of the stored
    control block with GS_SET_BC_CB will not have any effect.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/elf.h                  |   1 +
 arch/s390/include/asm/lowcore.h              |   9 +-
 arch/s390/include/asm/nmi.h                  |  12 +-
 arch/s390/include/asm/processor.h            |   5 +
 arch/s390/include/asm/setup.h                |   2 +
 arch/s390/include/asm/switch_to.h            |   3 +
 arch/s390/include/asm/thread_info.h          |  12 +-
 arch/s390/include/uapi/asm/Kbuild            |   1 +
 arch/s390/include/uapi/asm/guarded_storage.h |  77 +++++++++++
 arch/s390/include/uapi/asm/unistd.h          |   2 +-
 arch/s390/kernel/Makefile                    |   2 +-
 arch/s390/kernel/asm-offsets.c               |   2 +-
 arch/s390/kernel/compat_wrapper.c            |   1 +
 arch/s390/kernel/early.c                     |   2 +
 arch/s390/kernel/entry.S                     |  26 +++-
 arch/s390/kernel/entry.h                     |   2 +
 arch/s390/kernel/guarded_storage.c           | 128 +++++++++++++++++++
 arch/s390/kernel/machine_kexec.c             |  13 +-
 arch/s390/kernel/nmi.c                       |  19 ++-
 arch/s390/kernel/process.c                   |   7 +-
 arch/s390/kernel/processor.c                 |   2 +-
 arch/s390/kernel/ptrace.c                    |  84 ++++++++++--
 arch/s390/kernel/setup.c                     |  18 ++-
 arch/s390/kernel/smp.c                       |  43 ++++++-
 arch/s390/kernel/syscalls.S                  |   2 +-
 arch/s390/kvm/interrupt.c                    |   4 +-
 include/uapi/linux/elf.h                     |   1 +
 27 files changed, 435 insertions(+), 45 deletions(-)
 create mode 100644 arch/s390/include/uapi/asm/guarded_storage.h
 create mode 100644 arch/s390/kernel/guarded_storage.c

diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 1d48880b3cc1..e8f623041769 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -105,6 +105,7 @@
 #define HWCAP_S390_VXRS		2048
 #define HWCAP_S390_VXRS_BCD	4096
 #define HWCAP_S390_VXRS_EXT	8192
+#define HWCAP_S390_GS		16384
 
 /* Internal bits, not exposed via elf */
 #define HWCAP_INT_SIE		1UL
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 61261e0e95c0..8a5b082797f8 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -157,8 +157,8 @@ struct lowcore {
 	__u64	stfle_fac_list[32];		/* 0x0f00 */
 	__u8	pad_0x1000[0x11b0-0x1000];	/* 0x1000 */
 
-	/* Pointer to vector register save area */
-	__u64	vector_save_area_addr;		/* 0x11b0 */
+	/* Pointer to the machine check extended save area */
+	__u64	mcesad;				/* 0x11b0 */
 
 	/* 64 bit extparam used for pfault/diag 250: defined by architecture */
 	__u64	ext_params2;			/* 0x11B8 */
@@ -182,10 +182,7 @@ struct lowcore {
 
 	/* Transaction abort diagnostic block */
 	__u8	pgm_tdb[256];			/* 0x1800 */
-	__u8	pad_0x1900[0x1c00-0x1900];	/* 0x1900 */
-
-	/* Software defined save area for vector registers */
-	__u8	vector_save_area[1024];		/* 0x1c00 */
+	__u8	pad_0x1900[0x2000-0x1900];	/* 0x1900 */
 } __packed;
 
 #define S390_lowcore (*((struct lowcore *) 0))
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index b75fd910386a..e3e8895f5d3e 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -58,7 +58,9 @@ union mci {
 		u64 ie :  1; /* 32 indirect storage error */
 		u64 ar :  1; /* 33 access register validity */
 		u64 da :  1; /* 34 delayed access exception */
-		u64    :  7; /* 35-41 */
+		u64    :  1; /* 35 */
+		u64 gs :  1; /* 36 guarded storage registers */
+		u64    :  5; /* 37-41 */
 		u64 pr :  1; /* 42 tod programmable register validity */
 		u64 fc :  1; /* 43 fp control register validity */
 		u64 ap :  1; /* 44 ancillary report */
@@ -69,6 +71,14 @@ union mci {
 	};
 };
 
+#define MCESA_ORIGIN_MASK	(~0x3ffUL)
+#define MCESA_LC_MASK		(0xfUL)
+
+struct mcesa {
+	u8 vector_save_area[1024];
+	u8 guarded_storage_save_area[32];
+};
+
 struct pt_regs;
 
 extern void s390_handle_mcck(void);
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index e4988710aa86..cc101f9371cb 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -135,6 +135,8 @@ struct thread_struct {
 	struct list_head list;
 	/* cpu runtime instrumentation */
 	struct runtime_instr_cb *ri_cb;
+	struct gs_cb *gs_cb;		/* Current guarded storage cb */
+	struct gs_cb *gs_bc_cb;		/* Broadcast guarded storage cb */
 	unsigned char trap_tdb[256];	/* Transaction abort diagnose block */
 	/*
 	 * Warning: 'fpu' is dynamically-sized. It *MUST* be at
@@ -215,6 +217,9 @@ void show_cacheinfo(struct seq_file *m);
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
+/* Free guarded storage control block for current */
+void exit_thread_gs(void);
+
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 30bdb5a027f3..383bd8358a8c 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -31,6 +31,7 @@
 #define MACHINE_FLAG_VX		_BITUL(13)
 #define MACHINE_FLAG_CAD	_BITUL(14)
 #define MACHINE_FLAG_NX		_BITUL(15)
+#define MACHINE_FLAG_GS		_BITUL(16)
 
 #define LPP_MAGIC		_BITUL(31)
 #define LPP_PFAULT_PID_MASK	_AC(0xffffffff, UL)
@@ -70,6 +71,7 @@ extern void detect_memory_memblock(void);
 #define MACHINE_HAS_VX		(S390_lowcore.machine_flags & MACHINE_FLAG_VX)
 #define MACHINE_HAS_CAD		(S390_lowcore.machine_flags & MACHINE_FLAG_CAD)
 #define MACHINE_HAS_NX		(S390_lowcore.machine_flags & MACHINE_FLAG_NX)
+#define MACHINE_HAS_GS		(S390_lowcore.machine_flags & MACHINE_FLAG_GS)
 
 /*
  * Console mode. Override with conmode=
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index 12d45f0cfdd9..f6c2b5814ab0 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -10,6 +10,7 @@
 #include <linux/thread_info.h>
 #include <asm/fpu/api.h>
 #include <asm/ptrace.h>
+#include <asm/guarded_storage.h>
 
 extern struct task_struct *__switch_to(void *, void *);
 extern void update_cr_regs(struct task_struct *task);
@@ -33,12 +34,14 @@ static inline void restore_access_regs(unsigned int *acrs)
 		save_fpu_regs();					\
 		save_access_regs(&prev->thread.acrs[0]);		\
 		save_ri_cb(prev->thread.ri_cb);				\
+		save_gs_cb(prev->thread.gs_cb);				\
 	}								\
 	if (next->mm) {							\
 		update_cr_regs(next);					\
 		set_cpu_flag(CIF_FPU);					\
 		restore_access_regs(&next->thread.acrs[0]);		\
 		restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);	\
+		restore_gs_cb(next->thread.gs_cb);			\
 	}								\
 	prev = __switch_to(prev,next);					\
 } while (0)
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index a5b54a445eb8..f36e6e2b73f0 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -54,11 +54,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define TIF_NOTIFY_RESUME	0	/* callback before returning to user */
 #define TIF_SIGPENDING		1	/* signal pending */
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_SYSCALL_TRACE	3	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
-#define TIF_SECCOMP		5	/* secure computing */
-#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
-#define TIF_UPROBE		7	/* breakpointed or single-stepping */
+#define TIF_UPROBE		3	/* breakpointed or single-stepping */
+#define TIF_GUARDED_STORAGE	4	/* load guarded storage control block */
+#define TIF_SYSCALL_TRACE	8	/* syscall trace active */
+#define TIF_SYSCALL_AUDIT	9	/* syscall auditing active */
+#define TIF_SECCOMP		10	/* secure computing */
+#define TIF_SYSCALL_TRACEPOINT	11	/* syscall tracepoint instrumentation */
 #define TIF_31BIT		16	/* 32bit process */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	18	/* restore signal mask in do_signal() */
@@ -76,5 +77,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define _TIF_UPROBE		_BITUL(TIF_UPROBE)
 #define _TIF_31BIT		_BITUL(TIF_31BIT)
 #define _TIF_SINGLE_STEP	_BITUL(TIF_SINGLE_STEP)
+#define _TIF_GUARDED_STORAGE	_BITUL(TIF_GUARDED_STORAGE)
 
 #endif /* _ASM_THREAD_INFO_H */
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index 6848ba5c1454..86b761e583e3 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -12,6 +12,7 @@ header-y += dasd.h
 header-y += debug.h
 header-y += errno.h
 header-y += fcntl.h
+header-y += guarded_storage.h
 header-y += hypfs.h
 header-y += ioctl.h
 header-y += ioctls.h
diff --git a/arch/s390/include/uapi/asm/guarded_storage.h b/arch/s390/include/uapi/asm/guarded_storage.h
new file mode 100644
index 000000000000..852850e8e17e
--- /dev/null
+++ b/arch/s390/include/uapi/asm/guarded_storage.h
@@ -0,0 +1,77 @@
+#ifndef _GUARDED_STORAGE_H
+#define _GUARDED_STORAGE_H
+
+#include <linux/types.h>
+
+struct gs_cb {
+	__u64 reserved;
+	__u64 gsd;
+	__u64 gssm;
+	__u64 gs_epl_a;
+};
+
+struct gs_epl {
+	__u8 pad1;
+	union {
+		__u8 gs_eam;
+		struct {
+			__u8	: 6;
+			__u8 e	: 1;
+			__u8 b	: 1;
+		};
+	};
+	union {
+		__u8 gs_eci;
+		struct {
+			__u8 tx	: 1;
+			__u8 cx	: 1;
+			__u8	: 5;
+			__u8 in	: 1;
+		};
+	};
+	union {
+		__u8 gs_eai;
+		struct {
+			__u8	: 1;
+			__u8 t	: 1;
+			__u8 as	: 2;
+			__u8 ar	: 4;
+		};
+	};
+	__u32 pad2;
+	__u64 gs_eha;
+	__u64 gs_eia;
+	__u64 gs_eoa;
+	__u64 gs_eir;
+	__u64 gs_era;
+};
+
+#define GS_ENABLE	0
+#define	GS_DISABLE	1
+#define GS_SET_BC_CB	2
+#define GS_CLEAR_BC_CB	3
+#define GS_BROADCAST	4
+
+static inline void load_gs_cb(struct gs_cb *gs_cb)
+{
+	asm volatile(".insn rxy,0xe3000000004d,0,%0" : : "Q" (*gs_cb));
+}
+
+static inline void store_gs_cb(struct gs_cb *gs_cb)
+{
+	asm volatile(".insn rxy,0xe30000000049,0,%0" : : "Q" (*gs_cb));
+}
+
+static inline void save_gs_cb(struct gs_cb *gs_cb)
+{
+	if (gs_cb)
+		store_gs_cb(gs_cb);
+}
+
+static inline void restore_gs_cb(struct gs_cb *gs_cb)
+{
+	if (gs_cb)
+		load_gs_cb(gs_cb);
+}
+
+#endif /* _GUARDED_STORAGE_H */
diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h
index 152de9b796e1..ea42290e7d51 100644
--- a/arch/s390/include/uapi/asm/unistd.h
+++ b/arch/s390/include/uapi/asm/unistd.h
@@ -313,7 +313,7 @@
 #define __NR_copy_file_range	375
 #define __NR_preadv2		376
 #define __NR_pwritev2		377
-/* Number 378 is reserved for guarded storage */
+#define __NR_s390_guarded_storage	378
 #define __NR_statx		379
 #define NR_syscalls 380
 
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 060ce548fe8b..aa5adbdaf200 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -57,7 +57,7 @@ obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o als.o
 obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
-obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o
+obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o
 obj-y	+= entry.o reipl.o relocate_kernel.o
 
 extra-y				+= head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index c4b3570ded5b..6bb29633e1f1 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -175,7 +175,7 @@ int main(void)
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
 	OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
 	/* hardware defined lowcore locations 0x1000 - 0x18ff */
-	OFFSET(__LC_VX_SAVE_AREA_ADDR, lowcore, vector_save_area_addr);
+	OFFSET(__LC_MCESAD, lowcore, mcesad);
 	OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2);
 	OFFSET(__LC_FPREGS_SAVE_AREA, lowcore, floating_pt_save_area);
 	OFFSET(__LC_GPREGS_SAVE_AREA, lowcore, gpregs_save_area);
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index e89cc2e71db1..986642a3543b 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -178,4 +178,5 @@ COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
 COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len);
 COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags);
 COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags);
+COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb);
 COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer);
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 4e65c79cc5f2..95298a41076f 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -358,6 +358,8 @@ static __init void detect_machine_facilities(void)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_NX;
 		__ctl_set_bit(0, 20);
 	}
+	if (test_facility(133))
+		S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
 }
 
 static inline void save_vector_registers(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 6a7d737d514c..fa8b8f28e08b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -47,7 +47,7 @@ STACK_SIZE  = 1 << STACK_SHIFT
 STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
 
 _TIF_WORK	= (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
-		   _TIF_UPROBE)
+		   _TIF_UPROBE | _TIF_GUARDED_STORAGE)
 _TIF_TRACE	= (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
 		   _TIF_SYSCALL_TRACEPOINT)
 _CIF_WORK	= (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
@@ -332,6 +332,8 @@ ENTRY(system_call)
 	TSTMSK	__TI_flags(%r12),_TIF_UPROBE
 	jo	.Lsysc_uprobe_notify
 #endif
+	TSTMSK	__TI_flags(%r12),_TIF_GUARDED_STORAGE
+	jo	.Lsysc_guarded_storage
 	TSTMSK	__PT_FLAGS(%r11),_PIF_PER_TRAP
 	jo	.Lsysc_singlestep
 	TSTMSK	__TI_flags(%r12),_TIF_SIGPENDING
@@ -408,6 +410,14 @@ ENTRY(system_call)
 	jg	uprobe_notify_resume
 #endif
 
+#
+# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
+#
+.Lsysc_guarded_storage:
+	lgr	%r2,%r11		# pass pointer to pt_regs
+	larl	%r14,.Lsysc_return
+	jg	gs_load_bc_cb
+
 #
 # _PIF_PER_TRAP is set, call do_per_trap
 #
@@ -663,6 +673,8 @@ ENTRY(io_int_handler)
 	jo	.Lio_sigpending
 	TSTMSK	__TI_flags(%r12),_TIF_NOTIFY_RESUME
 	jo	.Lio_notify_resume
+	TSTMSK	__TI_flags(%r12),_TIF_GUARDED_STORAGE
+	jo	.Lio_guarded_storage
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lio_vxrs
 	TSTMSK	__LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
@@ -696,6 +708,18 @@ ENTRY(io_int_handler)
 	larl	%r14,.Lio_return
 	jg	load_fpu_regs
 
+#
+# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
+#
+.Lio_guarded_storage:
+	# TRACE_IRQS_ON already done at .Lio_return
+	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
+	lgr	%r2,%r11		# pass pointer to pt_regs
+	brasl	%r14,gs_load_bc_cb
+	ssm	__LC_PGM_NEW_PSW	# disable I/O and ext. interrupts
+	TRACE_IRQS_OFF
+	j	.Lio_return
+
 #
 # _TIF_NEED_RESCHED is set, call schedule
 #
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 33f901865326..dbf5f7e18246 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -74,12 +74,14 @@ long sys_sigreturn(void);
 
 long sys_s390_personality(unsigned int personality);
 long sys_s390_runtime_instr(int command, int signum);
+long sys_s390_guarded_storage(int command, struct gs_cb __user *);
 long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t);
 long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
 
 DECLARE_PER_CPU(u64, mt_cycles[8]);
 
 void verify_facilities(void);
+void gs_load_bc_cb(struct pt_regs *regs);
 void set_fs_fixup(void);
 
 #endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c
new file mode 100644
index 000000000000..6f064745c3b1
--- /dev/null
+++ b/arch/s390/kernel/guarded_storage.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/guarded_storage.h>
+#include "entry.h"
+
+void exit_thread_gs(void)
+{
+	kfree(current->thread.gs_cb);
+	kfree(current->thread.gs_bc_cb);
+	current->thread.gs_cb = current->thread.gs_bc_cb = NULL;
+}
+
+static int gs_enable(void)
+{
+	struct gs_cb *gs_cb;
+
+	if (!current->thread.gs_cb) {
+		gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL);
+		if (!gs_cb)
+			return -ENOMEM;
+		gs_cb->gsd = 25;
+		preempt_disable();
+		__ctl_set_bit(2, 4);
+		load_gs_cb(gs_cb);
+		current->thread.gs_cb = gs_cb;
+		preempt_enable();
+	}
+	return 0;
+}
+
+static int gs_disable(void)
+{
+	if (current->thread.gs_cb) {
+		preempt_disable();
+		kfree(current->thread.gs_cb);
+		current->thread.gs_cb = NULL;
+		__ctl_clear_bit(2, 4);
+		preempt_enable();
+	}
+	return 0;
+}
+
+static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb)
+{
+	struct gs_cb *gs_cb;
+
+	gs_cb = current->thread.gs_bc_cb;
+	if (!gs_cb) {
+		gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL);
+		if (!gs_cb)
+			return -ENOMEM;
+		current->thread.gs_bc_cb = gs_cb;
+	}
+	if (copy_from_user(gs_cb, u_gs_cb, sizeof(*gs_cb)))
+		return -EFAULT;
+	return 0;
+}
+
+static int gs_clear_bc_cb(void)
+{
+	struct gs_cb *gs_cb;
+
+	gs_cb = current->thread.gs_bc_cb;
+	current->thread.gs_bc_cb = NULL;
+	kfree(gs_cb);
+	return 0;
+}
+
+void gs_load_bc_cb(struct pt_regs *regs)
+{
+	struct gs_cb *gs_cb;
+
+	preempt_disable();
+	clear_thread_flag(TIF_GUARDED_STORAGE);
+	gs_cb = current->thread.gs_bc_cb;
+	if (gs_cb) {
+		kfree(current->thread.gs_cb);
+		current->thread.gs_bc_cb = NULL;
+		__ctl_set_bit(2, 4);
+		load_gs_cb(gs_cb);
+		current->thread.gs_cb = gs_cb;
+	}
+	preempt_enable();
+}
+
+static int gs_broadcast(void)
+{
+	struct task_struct *sibling;
+
+	read_lock(&tasklist_lock);
+	for_each_thread(current, sibling) {
+		if (!sibling->thread.gs_bc_cb)
+			continue;
+		if (test_and_set_tsk_thread_flag(sibling, TIF_GUARDED_STORAGE))
+			kick_process(sibling);
+	}
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+SYSCALL_DEFINE2(s390_guarded_storage, int, command,
+		struct gs_cb __user *, gs_cb)
+{
+	if (!MACHINE_HAS_GS)
+		return -EOPNOTSUPP;
+	switch (command) {
+	case GS_ENABLE:
+		return gs_enable();
+	case GS_DISABLE:
+		return gs_disable();
+	case GS_SET_BC_CB:
+		return gs_set_bc_cb(gs_cb);
+	case GS_CLEAR_BC_CB:
+		return gs_clear_bc_cb();
+	case GS_BROADCAST:
+		return gs_broadcast();
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 3074c1d83829..db5658daf994 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -27,6 +27,7 @@
 #include <asm/cacheflush.h>
 #include <asm/os_info.h>
 #include <asm/switch_to.h>
+#include <asm/nmi.h>
 
 typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long);
 
@@ -102,6 +103,8 @@ static void __do_machine_kdump(void *image)
  */
 static noinline void __machine_kdump(void *image)
 {
+	struct mcesa *mcesa;
+	unsigned long cr2_old, cr2_new;
 	int this_cpu, cpu;
 
 	lgr_info_log();
@@ -114,8 +117,16 @@ static noinline void __machine_kdump(void *image)
 			continue;
 	}
 	/* Store status of the boot CPU */
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
 	if (MACHINE_HAS_VX)
-		save_vx_regs((void *) &S390_lowcore.vector_save_area);
+		save_vx_regs((__vector128 *) mcesa->vector_save_area);
+	if (MACHINE_HAS_GS) {
+		__ctl_store(cr2_old, 2, 2);
+		cr2_new = cr2_old | (1UL << 4);
+		__ctl_load(cr2_new, 2, 2);
+		save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area);
+		__ctl_load(cr2_old, 2, 2);
+	}
 	/*
 	 * To create a good backchain for this CPU in the dump store_status
 	 * is passed the address of a function. The address is saved into
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 9bf8327154ee..985589523970 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -106,6 +106,7 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 	int kill_task;
 	u64 zero;
 	void *fpt_save_area;
+	struct mcesa *mcesa;
 
 	kill_task = 0;
 	zero = 0;
@@ -165,6 +166,7 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 			     : : "Q" (S390_lowcore.fpt_creg_save_area));
 	}
 
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
 	if (!MACHINE_HAS_VX) {
 		/* Validate floating point registers */
 		asm volatile(
@@ -209,8 +211,8 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 			"	la	1,%0\n"
 			"	.word	0xe70f,0x1000,0x0036\n"	/* vlm 0,15,0(1) */
 			"	.word	0xe70f,0x1100,0x0c36\n"	/* vlm 16,31,256(1) */
-			: : "Q" (*(struct vx_array *)
-				 &S390_lowcore.vector_save_area) : "1");
+			: : "Q" (*(struct vx_array *) mcesa->vector_save_area)
+			: "1");
 		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 	}
 	/* Validate access registers */
@@ -224,6 +226,19 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 		 */
 		kill_task = 1;
 	}
+	/* Validate guarded storage registers */
+	if (MACHINE_HAS_GS && (S390_lowcore.cregs_save_area[2] & (1UL << 4))) {
+		if (!mci.gs)
+			/*
+			 * Guarded storage register can't be restored and
+			 * the current processes uses guarded storage.
+			 * It has to be terminated.
+			 */
+			kill_task = 1;
+		else
+			load_gs_cb((struct gs_cb *)
+				   mcesa->guarded_storage_save_area);
+	}
 	/*
 	 * We don't even try to validate the TOD register, since we simply
 	 * can't write something sensible into that register.
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index f29e41c5e2ec..999d7154bbdc 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -73,8 +73,10 @@ extern void kernel_thread_starter(void);
  */
 void exit_thread(struct task_struct *tsk)
 {
-	if (tsk == current)
+	if (tsk == current) {
 		exit_thread_runtime_instr();
+		exit_thread_gs();
+	}
 }
 
 void flush_thread(void)
@@ -159,6 +161,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
 	/* Don't copy runtime instrumentation info */
 	p->thread.ri_cb = NULL;
 	frame->childregs.psw.mask &= ~PSW_MASK_RI;
+	/* Don't copy guarded storage control block */
+	p->thread.gs_cb = NULL;
+	p->thread.gs_bc_cb = NULL;
 
 	/* Set a new TLS ?  */
 	if (clone_flags & CLONE_SETTLS) {
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 928b929a6261..c73709869447 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -95,7 +95,7 @@ static void show_cpu_summary(struct seq_file *m, void *v)
 {
 	static const char *hwcap_str[] = {
 		"esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
-		"edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe"
+		"edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs"
 	};
 	static const char * const int_hwcap_str[] = {
 		"sie"
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index c14df0a1ec3c..c933e255b5d5 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -44,30 +44,42 @@ void update_cr_regs(struct task_struct *task)
 	struct pt_regs *regs = task_pt_regs(task);
 	struct thread_struct *thread = &task->thread;
 	struct per_regs old, new;
+	unsigned long cr0_old, cr0_new;
+	unsigned long cr2_old, cr2_new;
+	int cr0_changed, cr2_changed;
 
+	__ctl_store(cr0_old, 0, 0);
+	__ctl_store(cr2_old, 2, 2);
+	cr0_new = cr0_old;
+	cr2_new = cr2_old;
 	/* Take care of the enable/disable of transactional execution. */
 	if (MACHINE_HAS_TE) {
-		unsigned long cr, cr_new;
-
-		__ctl_store(cr, 0, 0);
 		/* Set or clear transaction execution TXC bit 8. */
-		cr_new = cr | (1UL << 55);
+		cr0_new |= (1UL << 55);
 		if (task->thread.per_flags & PER_FLAG_NO_TE)
-			cr_new &= ~(1UL << 55);
-		if (cr_new != cr)
-			__ctl_load(cr_new, 0, 0);
+			cr0_new &= ~(1UL << 55);
 		/* Set or clear transaction execution TDC bits 62 and 63. */
-		__ctl_store(cr, 2, 2);
-		cr_new = cr & ~3UL;
+		cr2_new &= ~3UL;
 		if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
 			if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND)
-				cr_new |= 1UL;
+				cr2_new |= 1UL;
 			else
-				cr_new |= 2UL;
+				cr2_new |= 2UL;
 		}
-		if (cr_new != cr)
-			__ctl_load(cr_new, 2, 2);
 	}
+	/* Take care of enable/disable of guarded storage. */
+	if (MACHINE_HAS_GS) {
+		cr2_new &= ~(1UL << 4);
+		if (task->thread.gs_cb)
+			cr2_new |= (1UL << 4);
+	}
+	/* Load control register 0/2 iff changed */
+	cr0_changed = cr0_new != cr0_old;
+	cr2_changed = cr2_new != cr2_old;
+	if (cr0_changed)
+		__ctl_load(cr0_new, 0, 0);
+	if (cr2_changed)
+		__ctl_load(cr2_new, 2, 2);
 	/* Copy user specified PER registers */
 	new.control = thread->per_user.control;
 	new.start = thread->per_user.start;
@@ -1137,6 +1149,36 @@ static int s390_system_call_set(struct task_struct *target,
 				  data, 0, sizeof(unsigned int));
 }
 
+static int s390_gs_cb_get(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_cb;
+
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data)
+		return -ENODATA;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   data, 0, sizeof(struct gs_cb));
+}
+
+static int s390_gs_cb_set(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  const void *kbuf, const void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_cb;
+
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data)
+		return -ENODATA;
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  data, 0, sizeof(struct gs_cb));
+}
+
 static const struct user_regset s390_regsets[] = {
 	{
 		.core_note_type = NT_PRSTATUS,
@@ -1194,6 +1236,14 @@ static const struct user_regset s390_regsets[] = {
 		.get = s390_vxrs_high_get,
 		.set = s390_vxrs_high_set,
 	},
+	{
+		.core_note_type = NT_S390_GS_CB,
+		.n = sizeof(struct gs_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_gs_cb_get,
+		.set = s390_gs_cb_set,
+	},
 };
 
 static const struct user_regset_view user_s390_view = {
@@ -1422,6 +1472,14 @@ static const struct user_regset s390_compat_regsets[] = {
 		.get = s390_compat_regs_high_get,
 		.set = s390_compat_regs_high_set,
 	},
+	{
+		.core_note_type = NT_S390_GS_CB,
+		.n = sizeof(struct gs_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_gs_cb_get,
+		.set = s390_gs_cb_set,
+	},
 };
 
 static const struct user_regset_view user_s390_compat_view = {
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 911dc0b49be0..3ae756c0db3d 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -339,9 +339,15 @@ static void __init setup_lowcore(void)
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
 	       MAX_FACILITY_BIT/8);
-	if (MACHINE_HAS_VX)
-		lc->vector_save_area_addr =
-			(unsigned long) &lc->vector_save_area;
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		unsigned long bits, size;
+
+		bits = MACHINE_HAS_GS ? 11 : 10;
+		size = 1UL << bits;
+		lc->mcesad = (__u64) memblock_virt_alloc(size, size);
+		if (MACHINE_HAS_GS)
+			lc->mcesad |= bits;
+	}
 	lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0];
 	lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
 	lc->async_enter_timer = S390_lowcore.async_enter_timer;
@@ -779,6 +785,12 @@ static int __init setup_hwcaps(void)
 			elf_hwcap |= HWCAP_S390_VXRS_BCD;
 	}
 
+	/*
+	 * Guarded storage support HWCAP_S390_GS is bit 12.
+	 */
+	if (MACHINE_HAS_GS)
+		elf_hwcap |= HWCAP_S390_GS;
+
 	get_cpu_id(&cpu_id);
 	add_device_randomness(&cpu_id, sizeof(cpu_id));
 	switch (cpu_id.machine) {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 47a973b5b4f1..286bcee800f4 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -51,6 +51,7 @@
 #include <asm/os_info.h>
 #include <asm/sigp.h>
 #include <asm/idle.h>
+#include <asm/nmi.h>
 #include "entry.h"
 
 enum {
@@ -78,6 +79,8 @@ struct pcpu {
 static u8 boot_core_type;
 static struct pcpu pcpu_devices[NR_CPUS];
 
+static struct kmem_cache *pcpu_mcesa_cache;
+
 unsigned int smp_cpu_mt_shift;
 EXPORT_SYMBOL(smp_cpu_mt_shift);
 
@@ -188,8 +191,10 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
 static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 {
 	unsigned long async_stack, panic_stack;
+	unsigned long mcesa_origin, mcesa_bits;
 	struct lowcore *lc;
 
+	mcesa_origin = mcesa_bits = 0;
 	if (pcpu != &pcpu_devices[0]) {
 		pcpu->lowcore =	(struct lowcore *)
 			__get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
@@ -197,20 +202,27 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 		panic_stack = __get_free_page(GFP_KERNEL);
 		if (!pcpu->lowcore || !panic_stack || !async_stack)
 			goto out;
+		if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+			mcesa_origin = (unsigned long)
+				kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL);
+			if (!mcesa_origin)
+				goto out;
+			mcesa_bits = MACHINE_HAS_GS ? 11 : 0;
+		}
 	} else {
 		async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET;
 		panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET;
+		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
+		mcesa_bits = pcpu->lowcore->mcesad & MCESA_LC_MASK;
 	}
 	lc = pcpu->lowcore;
 	memcpy(lc, &S390_lowcore, 512);
 	memset((char *) lc + 512, 0, sizeof(*lc) - 512);
 	lc->async_stack = async_stack + ASYNC_FRAME_OFFSET;
 	lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET;
+	lc->mcesad = mcesa_origin | mcesa_bits;
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
-	if (MACHINE_HAS_VX)
-		lc->vector_save_area_addr =
-			(unsigned long) &lc->vector_save_area;
 	if (vdso_alloc_per_cpu(lc))
 		goto out;
 	lowcore_ptr[cpu] = lc;
@@ -218,6 +230,9 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	return 0;
 out:
 	if (pcpu != &pcpu_devices[0]) {
+		if (mcesa_origin)
+			kmem_cache_free(pcpu_mcesa_cache,
+					(void *) mcesa_origin);
 		free_page(panic_stack);
 		free_pages(async_stack, ASYNC_ORDER);
 		free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -229,11 +244,17 @@ out:
 
 static void pcpu_free_lowcore(struct pcpu *pcpu)
 {
+	unsigned long mcesa_origin;
+
 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
 	lowcore_ptr[pcpu - pcpu_devices] = NULL;
 	vdso_free_per_cpu(pcpu->lowcore);
 	if (pcpu == &pcpu_devices[0])
 		return;
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
+		kmem_cache_free(pcpu_mcesa_cache, (void *) mcesa_origin);
+	}
 	free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET);
 	free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER);
 	free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -550,9 +571,11 @@ int smp_store_status(int cpu)
 	if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
 			      pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
 		return -EIO;
-	if (!MACHINE_HAS_VX)
+	if (!MACHINE_HAS_VX && !MACHINE_HAS_GS)
 		return 0;
-	pa = __pa(pcpu->lowcore->vector_save_area_addr);
+	pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK);
+	if (MACHINE_HAS_GS)
+		pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK;
 	if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS,
 			      pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
 		return -EIO;
@@ -897,12 +920,22 @@ void __init smp_fill_possible_mask(void)
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	unsigned long size;
+
 	/* request the 0x1201 emergency signal external interrupt */
 	if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1201");
 	/* request the 0x1202 external call external interrupt */
 	if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1202");
+	/* create slab cache for the machine-check-extended-save-areas */
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		size = 1UL << (MACHINE_HAS_GS ? 11 : 10);
+		pcpu_mcesa_cache = kmem_cache_create("nmi_save_areas",
+						     size, size, 0, NULL);
+		if (!pcpu_mcesa_cache)
+			panic("Couldn't create nmi save area cache");
+	}
 }
 
 void __init smp_prepare_boot_cpu(void)
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 2659b5cfeddb..54fce7b065de 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -386,5 +386,5 @@ SYSCALL(sys_mlock2,compat_sys_mlock2)
 SYSCALL(sys_copy_file_range,compat_sys_copy_file_range) /* 375 */
 SYSCALL(sys_preadv2,compat_sys_preadv2)
 SYSCALL(sys_pwritev2,compat_sys_pwritev2)
-NI_SYSCALL
+SYSCALL(sys_s390_guarded_storage,compat_sys_s390_guarded_storage) /* 378 */
 SYSCALL(sys_statx,compat_sys_statx)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 0f8f14199734..169558dc7daf 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -420,8 +420,8 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
 	save_access_regs(vcpu->run->s.regs.acrs);
 
 	/* Extended save area */
-	rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
-			    sizeof(unsigned long));
+	rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr,
+			   sizeof(unsigned long));
 	/* Only bits 0-53 are used for address formation */
 	ext_sa_addr &= ~0x3ffUL;
 	if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b59ee077a596..8c6d3bdb9a00 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -409,6 +409,7 @@ typedef struct elf64_shdr {
 #define NT_S390_TDB	0x308		/* s390 transaction diagnostic block */
 #define NT_S390_VXRS_LOW	0x309	/* s390 vector registers 0-15 upper half */
 #define NT_S390_VXRS_HIGH	0x30a	/* s390 vector registers 16-31 */
+#define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
 #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
 #define NT_ARM_TLS	0x401		/* ARM TLS register */
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */

From 09214545c4a40943ecb6cedc511cd4bd709c85a6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 25 Feb 2017 12:16:31 +0100
Subject: [PATCH 02/74] s390/bitops: add for_each_set_bit_inv helper

Same helper function like for_each_set_bit in generic code.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/bitops.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index d92047da5ccb..e7a185407182 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -253,6 +253,11 @@ unsigned long find_first_bit_inv(const unsigned long *addr, unsigned long size);
 unsigned long find_next_bit_inv(const unsigned long *addr, unsigned long size,
 				unsigned long offset);
 
+#define for_each_set_bit_inv(bit, addr, size)				\
+	for ((bit) = find_first_bit_inv((addr), (size));		\
+	     (bit) < (size);						\
+	     (bit) = find_next_bit_inv((addr), (size), (bit) + 1))
+
 static inline void set_bit_inv(unsigned long nr, volatile unsigned long *ptr)
 {
 	return set_bit(nr ^ (BITS_PER_LONG - 1), ptr);

From 157467ba9fb7e379f0540707dd89111de441e45e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 25 Feb 2017 12:16:48 +0100
Subject: [PATCH 03/74] s390/cpuinfo: show facilities as reported by stfle

Add a new line to /proc/cpuinfo which shows all available facilities
as reported by the stfle instruction:

> cat /proc/cpuinfo
...
facilities      : 0 1 2 3 4 6 7 ...
...

Reviewed-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/processor.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index c73709869447..778cd6536175 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -7,6 +7,7 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include <linux/cpufeature.h>
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/sched/mm.h>
 #include <linux/init.h>
@@ -91,6 +92,18 @@ int cpu_have_feature(unsigned int num)
 }
 EXPORT_SYMBOL(cpu_have_feature);
 
+static void show_facilities(struct seq_file *m)
+{
+	unsigned int bit;
+	long *facilities;
+
+	facilities = (long *)&S390_lowcore.stfle_fac_list;
+	seq_puts(m, "facilities      :");
+	for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT)
+		seq_printf(m, " %d", bit);
+	seq_putc(m, '\n');
+}
+
 static void show_cpu_summary(struct seq_file *m, void *v)
 {
 	static const char *hwcap_str[] = {
@@ -116,6 +129,7 @@ static void show_cpu_summary(struct seq_file *m, void *v)
 		if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
 			seq_printf(m, "%s ", int_hwcap_str[i]);
 	seq_puts(m, "\n");
+	show_facilities(m);
 	show_cacheinfo(m);
 	for_each_online_cpu(cpu) {
 		struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu);

From faee35a57bc1758864fb325740c3f0d5ca0b3771 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 7 Mar 2017 16:48:40 +0100
Subject: [PATCH 04/74] s390: get rid of superfluous __GFP_REPEAT

__GFP_REPEAT has a rather weak semantic but since it has been introduced
around 2.6.12 it has been ignored for low order allocations.

page_table_alloc then uses the flag for a single page allocation. This
means that this flag has never been actually useful here because it has
always been used only for PAGE_ALLOC_COSTLY requests.

An earlier attempt to remove the flag 10d58bf297e2 ("s390: get rid of
superfluous __GFP_REPEAT") has missed this one but the situation is very
same here.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/pgalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 995f78532cc2..2776bad61094 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -144,7 +144,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm)
 	struct page *page;
 	unsigned long *table;
 
-	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+	page = alloc_page(GFP_KERNEL);
 	if (page) {
 		table = (unsigned long *) page_to_phys(page);
 		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);

From 9b62330fa06346da3399a3799f67822fa081a8fe Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 9 Mar 2017 09:51:22 +0100
Subject: [PATCH 05/74] s390/bitops: remove outdated comment

The s390 specific little endian bitop macros are gone since a long time.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/bitops.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index e7a185407182..99902b7b9f0c 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -15,14 +15,6 @@
  * end up numbered:
  *   |63..............0|127............64|191...........128|255...........192|
  *
- * There are a few little-endian macros used mostly for filesystem
- * bitmaps, these work on similar bit array layouts, but byte-oriented:
- *   |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56|
- *
- * The main difference is that bit 3-5 in the bit number field needs to be
- * reversed compared to the big-endian bit fields. This can be achieved by
- * XOR with 0x38.
- *
  * We also have special functions which work with an MSB0 encoding.
  * The bits are numbered:
  *   |0..............63|64............127|128...........191|192...........255|

From 251ea0ca408b827e888d622eac5e89e4b9502ea2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 9 Mar 2017 10:02:28 +0100
Subject: [PATCH 06/74] s390/topology: get rid of core mask array

Use a single long value instead of a single element array to represent
the core mask. The array is a leftover from 32/31 bit code so we were
able to use bitops helper functions.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/sysinfo.h | 3 +--
 arch/s390/kernel/topology.c     | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 229326c942c7..27394989073a 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -142,7 +142,6 @@ struct sysinfo_3_2_2 {
 
 extern int topology_max_mnest;
 
-#define TOPOLOGY_CORE_BITS	64
 #define TOPOLOGY_NR_MAG		6
 
 struct topology_core {
@@ -152,7 +151,7 @@ struct topology_core {
 	unsigned char pp:2;
 	unsigned char reserved1;
 	unsigned short origin;
-	unsigned long mask[TOPOLOGY_CORE_BITS / BITS_PER_LONG];
+	unsigned long mask;
 };
 
 struct topology_container {
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 17660e800e74..0537130fb915 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -83,6 +83,8 @@ static cpumask_t cpu_thread_map(unsigned int cpu)
 	return mask;
 }
 
+#define TOPOLOGY_CORE_BITS	64
+
 static void add_cpus_to_mask(struct topology_core *tl_core,
 			     struct mask_info *drawer,
 			     struct mask_info *book,
@@ -91,7 +93,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 	struct cpu_topology_s390 *topo;
 	unsigned int core;
 
-	for_each_set_bit(core, &tl_core->mask[0], TOPOLOGY_CORE_BITS) {
+	for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) {
 		unsigned int rcore;
 		int lcpu, i;
 

From 4fd4dd8bffb112d1e6549e0ff09e9fa3c8cc2b96 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Mar 2017 13:36:10 +0100
Subject: [PATCH 07/74] s390/topology: fix typo in early topology code

Use MACHINE_FLAG_TOPOLOGY instead of MACHINE_HAS_TOPOLOGY when
clearing the bit that indicates if the machine provides topology
information (and if it should be used). Currently works anyway.

Fixes: 68cc795d1933 ("s390/topology: make "topology=off" parameter work")
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/early.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 95298a41076f..01cd7fe08d18 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -377,7 +377,7 @@ static int __init topology_setup(char *str)
 
 	rc = kstrtobool(str, &enabled);
 	if (!rc && !enabled)
-		S390_lowcore.machine_flags &= ~MACHINE_HAS_TOPOLOGY;
+		S390_lowcore.machine_flags &= ~MACHINE_FLAG_TOPOLOGY;
 	return rc;
 }
 early_param("topology", topology_setup);

From 59808fc81942834964e3b4e65f08e5855a3914a2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Mar 2017 15:45:54 +0100
Subject: [PATCH 08/74] s390/sysinfo: allow compiler warnings again

Allow compiler warnings again for the sysinfo file. Compiler warnings
were disabled when the bogomips calculation with math-emu code was
introduced ("[S390] Calibrate delay and bogomips.").
Since that code is gone, we can enable warnings again.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index aa5adbdaf200..08e7924ea2f9 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -51,8 +51,6 @@ CFLAGS_dumpstack.o	+= -fno-optimize-sibling-calls
 #
 CFLAGS_ptrace.o		+= -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
-CFLAGS_sysinfo.o	+= -w
-
 obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o als.o

From d2f039742537c4aec6488fa4de0c91a641210fd9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 11 Mar 2017 11:58:27 +0100
Subject: [PATCH 09/74] s390/dump_stack: remove whitespace from arch
 description

The arch description provided for the "Hardware name:" contains lots
of extra whitespace due to the way the SYSIB contents are defined
(strings aren't zero terminated).
This looks a bit odd and therefore remove the extra whitespace
characters. This also gives the opportunity to add more information,
if required, without hitting the magic 80 characters per line limit.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/early.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 01cd7fe08d18..914f273b1983 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -231,9 +231,28 @@ static noinline __init void detect_machine_type(void)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_VM;
 }
 
+/* Remove leading, trailing and double whitespace. */
+static inline void strim_all(char *str)
+{
+	char *s;
+
+	s = strim(str);
+	if (s != str)
+		memmove(str, s, strlen(s));
+	while (*str) {
+		if (!isspace(*str++))
+			continue;
+		if (isspace(*str)) {
+			s = skip_spaces(str);
+			memmove(str, s, strlen(s) + 1);
+		}
+	}
+}
+
 static noinline __init void setup_arch_string(void)
 {
 	struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page;
+	char mstr[80];
 
 	if (stsi(mach, 1, 1, 1))
 		return;
@@ -241,11 +260,11 @@ static noinline __init void setup_arch_string(void)
 	EBCASC(mach->type, sizeof(mach->type));
 	EBCASC(mach->model, sizeof(mach->model));
 	EBCASC(mach->model_capacity, sizeof(mach->model_capacity));
-	dump_stack_set_arch_desc("%-16.16s %-4.4s %-16.16s %-16.16s (%s)",
-				 mach->manufacturer,
-				 mach->type,
-				 mach->model,
-				 mach->model_capacity,
+	sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s",
+		mach->manufacturer, mach->type,
+		mach->model, mach->model_capacity);
+	strim_all(mstr);
+	dump_stack_set_arch_desc("%s (%s)", mstr,
 				 MACHINE_IS_LPAR ? "LPAR" :
 				 MACHINE_IS_VM ? "z/VM" :
 				 MACHINE_IS_KVM ? "KVM" : "unknown");

From 2f8876f98447ccbddb5bccc8f74ee20a69e83a3e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 11 Mar 2017 12:22:11 +0100
Subject: [PATCH 10/74] s390/dump_stack: use control program identification
 string

If running within a level 3 hypervisor, the hypervisor provides a
SYSIB block which contains a control program indentifier string. Use
this string instead of the simple KVM and z/VM strings only. In case
of z/VM this provides addtional information: the z/VM version.

The new string looks similar to this:

Hardware name: IBM 2964 N96 702 (z/VM 6.4.0)

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/early.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 914f273b1983..251391e3f8bc 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -252,7 +252,8 @@ static inline void strim_all(char *str)
 static noinline __init void setup_arch_string(void)
 {
 	struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page;
-	char mstr[80];
+	struct sysinfo_3_2_2 *vm = (struct sysinfo_3_2_2 *)&sysinfo_page;
+	char mstr[80], hvstr[17];
 
 	if (stsi(mach, 1, 1, 1))
 		return;
@@ -264,10 +265,17 @@ static noinline __init void setup_arch_string(void)
 		mach->manufacturer, mach->type,
 		mach->model, mach->model_capacity);
 	strim_all(mstr);
-	dump_stack_set_arch_desc("%s (%s)", mstr,
-				 MACHINE_IS_LPAR ? "LPAR" :
-				 MACHINE_IS_VM ? "z/VM" :
-				 MACHINE_IS_KVM ? "KVM" : "unknown");
+	if (stsi(vm, 3, 2, 2) == 0 && vm->count) {
+		EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi));
+		sprintf(hvstr, "%-16.16s", vm->vm[0].cpi);
+		strim_all(hvstr);
+	} else {
+		sprintf(hvstr, "%s",
+			MACHINE_IS_LPAR ? "LPAR" :
+			MACHINE_IS_VM ? "z/VM" :
+			MACHINE_IS_KVM ? "KVM" : "unknown");
+	}
+	dump_stack_set_arch_desc("%s (%s)", mstr, hvstr);
 }
 
 static __init void setup_topology(void)

From 15d3387c3c7cfe8c8c790bfbe829364782b70d43 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Thu, 16 Mar 2017 13:33:18 +0100
Subject: [PATCH 11/74] s390/configs//zfcpdump_defconfig: Remove
 CONFIG_SCSI_SRP_ATTRS

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/configs/zfcpdump_defconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 4366a3e3e754..e23d97c13735 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -35,7 +35,6 @@ CONFIG_SCSI_ENCLOSURE=y
 CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_LOGGING=y
 CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SRP_ATTRS=y
 CONFIG_ZFCP=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 # CONFIG_INPUT_KEYBOARD is not set

From e61a6134e7a547939a0b7056bcf6b12d75b6d355 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Wed, 15 Mar 2017 11:08:27 +0100
Subject: [PATCH 12/74] s390/pkey: Introduce new API for secure key
 verification

User space needs some information about the secure key(s)
before actually invoking the pkey and/or paes funcionality.
This patch introduces a new ioctl API and in kernel API to
verify the the secure key blob and give back some
information about the key (type, bitsize, old MKVP).
Both APIs are described in detail in the header files
arch/s390/include/asm/pkey.h and arch/s390/include/uapi/asm/pkey.h.

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pkey.h      | 21 ++++++++++
 arch/s390/include/uapi/asm/pkey.h | 19 +++++++++
 drivers/s390/crypto/pkey_api.c    | 64 ++++++++++++++++++++++++++++++-
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
index b48aef4188f6..4c484590d858 100644
--- a/arch/s390/include/asm/pkey.h
+++ b/arch/s390/include/asm/pkey.h
@@ -87,4 +87,25 @@ int pkey_findcard(const struct pkey_seckey *seckey,
 int pkey_skey2pkey(const struct pkey_seckey *seckey,
 		   struct pkey_protkey *protkey);
 
+/*
+ * Verify the given secure key for being able to be useable with
+ * the pkey module. Check for correct key type and check for having at
+ * least one crypto card being able to handle this key (master key
+ * or old master key verification pattern matches).
+ * Return some info about the key: keysize in bits, keytype (currently
+ * only AES), flag if key is wrapped with an old MKVP.
+ * @param seckey pointer to buffer with the input secure key
+ * @param pcardnr pointer to cardnr, receives the card number on success
+ * @param pdomain pointer to domain, receives the domain number on success
+ * @param pkeysize pointer to keysize, receives the bitsize of the key
+ * @param pattributes pointer to attributes, receives additional info
+ *	  PKEY_VERIFY_ATTR_AES if the key is an AES key
+ *	  PKEY_VERIFY_ATTR_OLD_MKVP if key has old mkvp stored in
+ * @return 0 on success, negative errno value on failure. If no card could
+ *	   be found which is able to handle this key, -ENODEV is returned.
+ */
+int pkey_verifykey(const struct pkey_seckey *seckey,
+		   u16 *pcardnr, u16 *pdomain,
+		   u16 *pkeysize, u32 *pattributes);
+
 #endif /* _KAPI_PKEY_H */
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
index ed7f19c27ce5..e6c04faf8a6c 100644
--- a/arch/s390/include/uapi/asm/pkey.h
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -109,4 +109,23 @@ struct pkey_skey2pkey {
 };
 #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
 
+/*
+ * Verify the given secure key for being able to be useable with
+ * the pkey module. Check for correct key type and check for having at
+ * least one crypto card being able to handle this key (master key
+ * or old master key verification pattern matches).
+ * Return some info about the key: keysize in bits, keytype (currently
+ * only AES), flag if key is wrapped with an old MKVP.
+ */
+struct pkey_verifykey {
+	struct pkey_seckey seckey;	       /* in: the secure key blob */
+	__u16  cardnr;			       /* out: card number	  */
+	__u16  domain;			       /* out: domain number	  */
+	__u16  keysize;			       /* out: key size in bits   */
+	__u32  attributes;		       /* out: attribute bits	  */
+};
+#define PKEY_VERIFYKEY _IOWR(PKEY_IOCTL_MAGIC, 0x07, struct pkey_verifykey)
+#define PKEY_VERIFY_ATTR_AES	   0x00000001  /* key is an AES key */
+#define PKEY_VERIFY_ATTR_OLD_MKVP  0x00000100  /* key has old MKVP value */
+
 #endif /* _UAPI_PKEY_H */
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
index 058db724b5a2..ea86da8c75f9 100644
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -80,7 +80,7 @@ struct secaeskeytoken {
  * token. If keybitsize is given, the bitsize of the key is
  * also checked. Returns 0 on success or errno value on failure.
  */
-static int check_secaeskeytoken(u8 *token, int keybitsize)
+static int check_secaeskeytoken(const u8 *token, int keybitsize)
 {
 	struct secaeskeytoken *t = (struct secaeskeytoken *) token;
 
@@ -1003,6 +1003,53 @@ int pkey_skey2pkey(const struct pkey_seckey *seckey,
 }
 EXPORT_SYMBOL(pkey_skey2pkey);
 
+/*
+ * Verify key and give back some info about the key.
+ */
+int pkey_verifykey(const struct pkey_seckey *seckey,
+		   u16 *pcardnr, u16 *pdomain,
+		   u16 *pkeysize, u32 *pattributes)
+{
+	struct secaeskeytoken *t = (struct secaeskeytoken *) seckey;
+	u16 cardnr, domain;
+	u64 mkvp[2];
+	int rc;
+
+	/* check the secure key for valid AES secure key */
+	rc = check_secaeskeytoken((u8 *) seckey, 0);
+	if (rc)
+		goto out;
+	if (pattributes)
+		*pattributes = PKEY_VERIFY_ATTR_AES;
+	if (pkeysize)
+		*pkeysize = t->bitsize;
+
+	/* try to find a card which can handle this key */
+	rc = pkey_findcard(seckey, &cardnr, &domain, 1);
+	if (rc)
+		goto out;
+
+	/* check mkvp for old mkvp match */
+	rc = mkvp_cache_fetch(cardnr, domain, mkvp);
+	if (rc)
+		goto out;
+	if (t->mkvp == mkvp[1]) {
+		DEBUG_DBG("pkey_verifykey secure key has old mkvp\n");
+		if (pattributes)
+			*pattributes |= PKEY_VERIFY_ATTR_OLD_MKVP;
+	}
+
+	if (pcardnr)
+		*pcardnr = cardnr;
+	if (pdomain)
+		*pdomain = domain;
+
+out:
+	DEBUG_DBG("pkey_verifykey rc=%d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(pkey_verifykey);
+
 /*
  * File io functions
  */
@@ -1104,6 +1151,21 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
 			return -EFAULT;
 		break;
 	}
+	case PKEY_VERIFYKEY: {
+		struct pkey_verifykey __user *uvk = (void __user *) arg;
+		struct pkey_verifykey kvk;
+
+		if (copy_from_user(&kvk, uvk, sizeof(kvk)))
+			return -EFAULT;
+		rc = pkey_verifykey(&kvk.seckey, &kvk.cardnr, &kvk.domain,
+				    &kvk.keysize, &kvk.attributes);
+		DEBUG_DBG("pkey_ioctl pkey_verifykey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(uvk, &kvk, sizeof(kvk)))
+			return -EFAULT;
+		break;
+	}
 	default:
 		/* unknown/unsupported ioctl cmd */
 		return -ENOTTY;

From 050f99b1b85876a8456b76ebfee7d43746d37f6c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Mar 2017 15:58:35 +0100
Subject: [PATCH 13/74] s390/debugfs: introduce top-level 's390' directory

Introduce a top-level 's390' directory which should be used when
adding new s390 specific debug feature files and/or directories.

This makes hopefully sure that the contents of the s390 directory will
be a bit more structured. Right now we have a couple of top-level
files where it is not easy to tell to which subsystem they belong to.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/Makefile   |  2 +-
 arch/s390/kernel/kdebugfs.c | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/kernel/kdebugfs.c

diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 08e7924ea2f9..adb3fe2e3d42 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -56,7 +56,7 @@ obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o als.o
 obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o
-obj-y	+= entry.o reipl.o relocate_kernel.o
+obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o
 
 extra-y				+= head.o head64.o vmlinux.lds
 
diff --git a/arch/s390/kernel/kdebugfs.c b/arch/s390/kernel/kdebugfs.c
new file mode 100644
index 000000000000..ee85e17dd79d
--- /dev/null
+++ b/arch/s390/kernel/kdebugfs.c
@@ -0,0 +1,15 @@
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/init.h>
+
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
+static int __init arch_kdebugfs_init(void)
+{
+	arch_debugfs_dir = debugfs_create_dir("s390", NULL);
+	if (IS_ERR(arch_debugfs_dir))
+		arch_debugfs_dir = NULL;
+	return 0;
+}
+postcore_initcall(arch_kdebugfs_init);

From ae5ca67acaf070ced4e3ba324160ad6a99637e71 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Mar 2017 15:58:59 +0100
Subject: [PATCH 14/74] s390/sysinfo,topology: provide raw stsi 15,1,x data via
 debugfs

Provide the raw stsi 15,1,x data contents via debugfs. This makes it
much easier to debug unexpected scheduling domains on machines that
provide cpu topology information.

Therefore this file adds a new 's390/stsi' debugfs directory with a
file for each possible topology nesting level that is allowed by the
architecture. The files will be created regardless if the machine
supports all, or any, level.  If a level is not supported, or no data
is available, user space can recognize this with a -EINVAL error code
when trying to read such data.
In addition a 'topology' symlink is created that points to the file
that contains the data that is used to create the scheduling domains.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/sysinfo.h |  9 ++++
 arch/s390/kernel/sysinfo.c      | 80 +++++++++++++++++++++++++++++++++
 arch/s390/kernel/topology.c     |  2 +-
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 27394989073a..73bff45ced55 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -142,6 +142,15 @@ struct sysinfo_3_2_2 {
 
 extern int topology_max_mnest;
 
+/*
+ * Returns the maximum nesting level supported by the cpu topology code.
+ * The current maximum level is 4 which is the drawer level.
+ */
+static inline int topology_mnest_limit(void)
+{
+	return min(topology_max_mnest, 4);
+}
+
 #define TOPOLOGY_NR_MAG		6
 
 struct topology_core {
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 12b6b138e354..1d4680e38378 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -4,6 +4,7 @@
  *	       Martin Schwidefsky <schwidefsky@de.ibm.com>,
  */
 
+#include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
@@ -13,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <asm/ebcdic.h>
+#include <asm/debug.h>
 #include <asm/sysinfo.h>
 #include <asm/cpcmd.h>
 #include <asm/topology.h>
@@ -485,3 +487,81 @@ void calibrate_delay(void)
 	       "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ),
 	       (loops_per_jiffy/(5000/HZ)) % 100);
 }
+
+#ifdef CONFIG_DEBUG_FS
+
+#define STSI_FILE(fc, s1, s2)						       \
+static int stsi_open_##fc##_##s1##_##s2(struct inode *inode, struct file *file)\
+{									       \
+	file->private_data = (void *) get_zeroed_page(GFP_KERNEL);	       \
+	if (!file->private_data)					       \
+		return -ENOMEM;						       \
+	if (stsi(file->private_data, fc, s1, s2)) {			       \
+		free_page((unsigned long)file->private_data);		       \
+		file->private_data = NULL;				       \
+		return -EACCES;						       \
+	}								       \
+	return nonseekable_open(inode, file);				       \
+}									       \
+									       \
+static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = {       \
+	.open		= stsi_open_##fc##_##s1##_##s2,			       \
+	.release	= stsi_release,					       \
+	.read		= stsi_read,					       \
+	.llseek		= no_llseek,					       \
+};
+
+static int stsi_release(struct inode *inode, struct file *file)
+{
+	free_page((unsigned long)file->private_data);
+	return 0;
+}
+
+static ssize_t stsi_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, size, ppos, file->private_data, PAGE_SIZE);
+}
+
+STSI_FILE(15, 1, 2);
+STSI_FILE(15, 1, 3);
+STSI_FILE(15, 1, 4);
+STSI_FILE(15, 1, 5);
+STSI_FILE(15, 1, 6);
+
+struct stsi_file {
+	const struct file_operations *fops;
+	char *name;
+};
+
+static struct stsi_file stsi_file[] __initdata = {
+	{.fops = &stsi_15_1_2_fs_ops, .name = "15_1_2"},
+	{.fops = &stsi_15_1_3_fs_ops, .name = "15_1_3"},
+	{.fops = &stsi_15_1_4_fs_ops, .name = "15_1_4"},
+	{.fops = &stsi_15_1_5_fs_ops, .name = "15_1_5"},
+	{.fops = &stsi_15_1_6_fs_ops, .name = "15_1_6"},
+};
+
+static __init int stsi_init_debugfs(void)
+{
+	struct dentry *stsi_root;
+	struct stsi_file *sf;
+	int i;
+
+	stsi_root = debugfs_create_dir("stsi", arch_debugfs_dir);
+	if (IS_ERR_OR_NULL(stsi_root))
+		return 0;
+	for (i = 0; i < ARRAY_SIZE(stsi_file); i++) {
+		sf = &stsi_file[i];
+		debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops);
+	}
+	if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) {
+		char link_to[10];
+
+		sprintf(link_to, "15_1_%d", topology_mnest_limit());
+		debugfs_create_symlink("topology", stsi_root, link_to);
+	}
+	return 0;
+}
+device_initcall(stsi_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 0537130fb915..bb47c92476f0 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -246,7 +246,7 @@ static void update_cpu_masks(void)
 
 void store_topology(struct sysinfo_15_1_x *info)
 {
-	stsi(info, 15, 1, min(topology_max_mnest, 4));
+	stsi(info, 15, 1, topology_mnest_limit());
 }
 
 static int __arch_update_cpu_topology(void)

From cdd3bd9d618d7a4a3920a193e20ca40736b878e9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Mar 2017 16:15:06 +0100
Subject: [PATCH 15/74] s390/sysinfo: provide remaining stsi information via
 debugfs

Provide the remaining stsi information via debugfs files. This also
might be useful for debugging purposes.

Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/sysinfo.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 1d4680e38378..eefcb54872a5 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -522,6 +522,12 @@ static ssize_t stsi_read(struct file *file, char __user *buf, size_t size, loff_
 	return simple_read_from_buffer(buf, size, ppos, file->private_data, PAGE_SIZE);
 }
 
+STSI_FILE( 1, 1, 1);
+STSI_FILE( 1, 2, 1);
+STSI_FILE( 1, 2, 2);
+STSI_FILE( 2, 2, 1);
+STSI_FILE( 2, 2, 2);
+STSI_FILE( 3, 2, 2);
 STSI_FILE(15, 1, 2);
 STSI_FILE(15, 1, 3);
 STSI_FILE(15, 1, 4);
@@ -534,6 +540,12 @@ struct stsi_file {
 };
 
 static struct stsi_file stsi_file[] __initdata = {
+	{.fops = &stsi_1_1_1_fs_ops,  .name =  "1_1_1"},
+	{.fops = &stsi_1_2_1_fs_ops,  .name =  "1_2_1"},
+	{.fops = &stsi_1_2_2_fs_ops,  .name =  "1_2_2"},
+	{.fops = &stsi_2_2_1_fs_ops,  .name =  "2_2_1"},
+	{.fops = &stsi_2_2_2_fs_ops,  .name =  "2_2_2"},
+	{.fops = &stsi_3_2_2_fs_ops,  .name =  "3_2_2"},
 	{.fops = &stsi_15_1_2_fs_ops, .name = "15_1_2"},
 	{.fops = &stsi_15_1_3_fs_ops, .name = "15_1_3"},
 	{.fops = &stsi_15_1_4_fs_ops, .name = "15_1_4"},
@@ -541,15 +553,21 @@ static struct stsi_file stsi_file[] __initdata = {
 	{.fops = &stsi_15_1_6_fs_ops, .name = "15_1_6"},
 };
 
+static u8 stsi_0_0_0;
+
 static __init int stsi_init_debugfs(void)
 {
 	struct dentry *stsi_root;
 	struct stsi_file *sf;
-	int i;
+	int lvl, i;
 
 	stsi_root = debugfs_create_dir("stsi", arch_debugfs_dir);
 	if (IS_ERR_OR_NULL(stsi_root))
 		return 0;
+	lvl = stsi(NULL, 0, 0, 0);
+	if (lvl > 0)
+		stsi_0_0_0 = lvl;
+	debugfs_create_u8("0_0_0", 0400, stsi_root, &stsi_0_0_0);
 	for (i = 0; i < ARRAY_SIZE(stsi_file); i++) {
 		sf = &stsi_file[i];
 		debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops);

From 6f5165e864d240d15675cc2fb5a369d57e1f60d0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 20 Mar 2017 14:29:50 +0100
Subject: [PATCH 16/74] s390/facilites: use stfle_fac_list array size for
 MAX_FACILITY_BIT

Use the actual size of the facility list array within the lowcore
structure for the MAX_FACILITY_BIT define instead of a comment which
states what this is good for. This makes it a bit harder to break
things.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/facility.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index 09b406db7529..aead78b5da7e 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -15,7 +15,7 @@
 #include <linux/preempt.h>
 #include <asm/lowcore.h>
 
-#define MAX_FACILITY_BIT (256*8)	/* stfle_fac_list has 256 bytes */
+#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8)
 
 static inline int __test_facility(unsigned long nr, void *facilities)
 {

From 0b7bb6af1d734b15dbebec942767708e8ca40ca3 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 20 Mar 2017 14:39:28 +0100
Subject: [PATCH 17/74] s390/facilities: get rid of __ASSEMBLY__ in facility
 header file

There is no need for the __ASSEMBLY__ ifdefery anymore since the
architecture level set code that deals with facility bits was
converted to C in the meantime.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/facility.h | 4 ----
 arch/s390/kernel/head.S          | 1 -
 2 files changed, 5 deletions(-)

diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index aead78b5da7e..cb60d5c5755d 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -8,9 +8,6 @@
 #define __ASM_FACILITY_H
 
 #include <generated/facilities.h>
-
-#ifndef __ASSEMBLY__
-
 #include <linux/string.h>
 #include <linux/preempt.h>
 #include <asm/lowcore.h>
@@ -72,5 +69,4 @@ static inline void stfle(u64 *stfle_fac_list, int size)
 	preempt_enable();
 }
 
-#endif /* __ASSEMBLY__ */
 #endif /* __ASM_FACILITY_H */
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index 0b5ebf8a3d30..eff5b31671d4 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -25,7 +25,6 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/facility.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
 

From 2fa5ed7d87074268b5d29cdff6c4f81f51666885 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 08:59:56 +0100
Subject: [PATCH 18/74] s390/mm: Remove double gaddr calculation when notifying

ptep_notify and gmap_shadow_notify both need a guest address and
therefore retrieve them from the available virtual host address.

As they operate on the same guest address, we can calculate it once
and then pass it on. As a gmap normally has more than one shadow gmap,
we also do not recalculate for each of them any more.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/gmap.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index a07b1ec1391d..ffb55c935eda 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2004,20 +2004,12 @@ EXPORT_SYMBOL_GPL(gmap_shadow_page);
  * Called with sg->parent->shadow_lock.
  */
 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
-			       unsigned long offset, pte_t *pte)
+			       unsigned long gaddr, pte_t *pte)
 {
 	struct gmap_rmap *rmap, *rnext, *head;
-	unsigned long gaddr, start, end, bits, raddr;
-	unsigned long *table;
+	unsigned long start, end, bits, raddr;
 
 	BUG_ON(!gmap_is_shadow(sg));
-	spin_lock(&sg->parent->guest_table_lock);
-	table = radix_tree_lookup(&sg->parent->host_to_guest,
-				  vmaddr >> PMD_SHIFT);
-	gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
-	spin_unlock(&sg->parent->guest_table_lock);
-	if (!table)
-		return;
 
 	spin_lock(&sg->guest_table_lock);
 	if (sg->removed) {
@@ -2076,7 +2068,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
 		 pte_t *pte, unsigned long bits)
 {
-	unsigned long offset, gaddr;
+	unsigned long offset, gaddr = 0;
 	unsigned long *table;
 	struct gmap *gmap, *sg, *next;
 
@@ -2084,22 +2076,23 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
 	offset = offset * (4096 / sizeof(pte_t));
 	rcu_read_lock();
 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
-			spin_lock(&gmap->shadow_lock);
-			list_for_each_entry_safe(sg, next,
-						 &gmap->children, list)
-				gmap_shadow_notify(sg, vmaddr, offset, pte);
-			spin_unlock(&gmap->shadow_lock);
-		}
-		if (!(bits & PGSTE_IN_BIT))
-			continue;
 		spin_lock(&gmap->guest_table_lock);
 		table = radix_tree_lookup(&gmap->host_to_guest,
 					  vmaddr >> PMD_SHIFT);
 		if (table)
 			gaddr = __gmap_segment_gaddr(table) + offset;
 		spin_unlock(&gmap->guest_table_lock);
-		if (table)
+		if (!table)
+			continue;
+
+		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+			spin_lock(&gmap->shadow_lock);
+			list_for_each_entry_safe(sg, next,
+						 &gmap->children, list)
+				gmap_shadow_notify(sg, vmaddr, gaddr, pte);
+			spin_unlock(&gmap->shadow_lock);
+		}
+		if (bits & PGSTE_IN_BIT)
 			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
 	}
 	rcu_read_unlock();

From ccd53fa2261c2676258fe523d9201099f4d5ebc1 Mon Sep 17 00:00:00 2001
From: Stefan Haberland <sth@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 10:30:40 +0100
Subject: [PATCH 19/74] s390/dasd: check if query host access feature is
 supported

Some storage servers might not support the query host access feature.
Check if the corresponding feature code is set.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd_eckd.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 0b38217f8147..eb090e85b02e 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -5172,6 +5172,10 @@ static int dasd_eckd_query_host_access(struct dasd_device *device,
 	if (!device->block && private->lcu->pav == HYPER_PAV)
 		return -EOPNOTSUPP;
 
+	/* may not be supported by the storage server */
+	if (!(private->features.feature[14] & 0x80))
+		return -EOPNOTSUPP;
+
 	cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */	+ 1 /* RSSD */,
 				   sizeof(struct dasd_psf_prssd_data) + 1,
 				   device);

From ab24fbd35a6ee77a58c24bd50582c51610a194f0 Mon Sep 17 00:00:00 2001
From: Stefan Haberland <sth@linux.vnet.ibm.com>
Date: Wed, 22 Mar 2017 11:06:20 +0100
Subject: [PATCH 20/74] s390/dasd: suppress command reject error for query host
 access command

On some z/VM systems the query host access command is not supported for
temp disks, though the corresponding feature code is set.
This does not have any impact beside that the information is not available.
Suppress the full blown command reject error messages to not confuse the
user. The error is still logged in the s390dbf.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd_3990_erp.c |  5 +++--
 drivers/s390/block/dasd_eckd.c     | 12 +++++++++---
 drivers/s390/block/dasd_int.h      |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index 774da20ceb58..107cd3361e29 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -1052,8 +1052,9 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense)
 	} else {
 		/* fatal error -  set status to FAILED
 		   internal error 09 - Command Reject */
-		dev_err(&device->cdev->dev, "An error occurred in the DASD "
-			"device driver, reason=%s\n", "09");
+		if (!test_bit(DASD_CQR_SUPPRESS_CR, &erp->flags))
+			dev_err(&device->cdev->dev,
+				"An error occurred in the DASD device driver, reason=09\n");
 
 		erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
 	}
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index eb090e85b02e..122456e4db89 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -4927,10 +4927,14 @@ static void dasd_eckd_dump_sense(struct dasd_device *device,
 		dasd_eckd_dump_sense_tcw(device, req, irb);
 	} else {
 		/*
-		 * In some cases the 'No Record Found' error might be expected
-		 * and log messages shouldn't be written then. Check if the
-		 * according suppress bit is set.
+		 * In some cases the 'Command Reject' or 'No Record Found'
+		 * error might be expected and log messages shouldn't be
+		 * written then. Check if the according suppress bit is set.
 		 */
+		if (sense && sense[0] & SNS0_CMD_REJECT &&
+		    test_bit(DASD_CQR_SUPPRESS_CR, &req->flags))
+			return;
+
 		if (sense && sense[1] & SNS1_NO_REC_FOUND &&
 		    test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags))
 			return;
@@ -5223,6 +5227,8 @@ static int dasd_eckd_query_host_access(struct dasd_device *device,
 
 	cqr->buildclk = get_tod_clock();
 	cqr->status = DASD_CQR_FILLED;
+	/* the command might not be supported, suppress error message */
+	__set_bit(DASD_CQR_SUPPRESS_CR, &cqr->flags);
 	rc = dasd_sleep_on_interruptible(cqr);
 	if (rc == 0) {
 		*data = *host_access;
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 518dba2732d5..dca7cb1e6f65 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -239,11 +239,11 @@ struct dasd_ccw_req {
 					 */
 /*
  * The following flags are used to suppress output of certain errors.
- * These flags should only be used for format checks!
  */
 #define DASD_CQR_SUPPRESS_NRF	4	/* Suppress 'No Record Found' error */
 #define DASD_CQR_SUPPRESS_FP	5	/* Suppress 'File Protected' error*/
 #define DASD_CQR_SUPPRESS_IL	6	/* Suppress 'Incorrect Length' error */
+#define DASD_CQR_SUPPRESS_CR	7	/* Suppress 'Command Reject' error */
 
 /* Signature for error recovery functions. */
 typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *);

From 59cea29a34eb6f5fe0e44e0766a1797dd73f40a1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 27 Mar 2017 08:01:29 +0200
Subject: [PATCH 21/74] s390: remove HAVE_ARCH_EARLY_PFN_TO_NID select
 statement

HAVE_ARCH_EARLY_PFN_TO_NID selects a not present Kconfig
option. Therefore remove it.
Given that the first call of early_pfn_to_nid() happens after
numa_setup() finished to establish the memory to node mapping, there
is no need to implement an architecture private version of
__early_pfn_to_nid() like (only) ia64 does.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a2dcef0aacc7..1521edafdd15 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -123,7 +123,6 @@ config S390
 	select GENERIC_TIME_VSYSCALL
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
-	select HAVE_ARCH_EARLY_PFN_TO_NID
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_JUMP_LABEL
 	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES

From dcc00b79fc3d076832f7240de8870f492629b171 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Thu, 23 Mar 2017 21:02:54 +0100
Subject: [PATCH 22/74] s390/kdump: Add final note

Since linux v3.14 with commit 38dfac843cb6d7be1 ("vmcore: prevent PT_NOTE
p_memsz overflow during header update") on s390 we get the following
message in the kdump kernel:

  Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x6b6b6b6b,
  n_descsz=0x6b6b6b6b

The reason for this is that we don't create a final zero note in
the ELF header which the proc/vmcore code uses to find out the end
of the notes section (see also kernel/kexec_core.c:final_note()).

It still worked on s390 by chance because we (most of the time?) have the
byte pattern 0x6b6b6b6b after the notes section which also makes the notes
parsing code stop in update_note_header_size_elf64() because 0x6b6b6b6b is
interpreded as note size:

  if ((real_sz + sz) > max_sz) {
          pr_warn("Warning: Exceeded p_memsz, dropping P ...);
          break;
  }

So fix this and add the missing final note to the ELF header.
We don't have to adjust the memory size for ELF header ("alloc_size")
because the new ELF note still fits into the 0x1000 base memory.

Cc: stable@vger.kernel.org # v4.4+
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/crash_dump.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index dd1d5c62c374..d628afc26708 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -428,6 +428,20 @@ static void *nt_vmcoreinfo(void *ptr)
 	return nt_init_name(ptr, 0, vmcoreinfo, size, "VMCOREINFO");
 }
 
+/*
+ * Initialize final note (needed for /proc/vmcore code)
+ */
+static void *nt_final(void *ptr)
+{
+	Elf64_Nhdr *note;
+
+	note = (Elf64_Nhdr *) ptr;
+	note->n_namesz = 0;
+	note->n_descsz = 0;
+	note->n_type = 0;
+	return PTR_ADD(ptr, sizeof(Elf64_Nhdr));
+}
+
 /*
  * Initialize ELF header (new kernel)
  */
@@ -515,6 +529,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
 		if (sa->prefix != 0)
 			ptr = fill_cpu_elf_notes(ptr, cpu++, sa);
 	ptr = nt_vmcoreinfo(ptr);
+	ptr = nt_final(ptr);
 	memset(phdr, 0, sizeof(*phdr));
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = notes_offset;

From b8f982803490920544c988bf6ec313028677851f Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 22 Mar 2017 12:36:07 +0100
Subject: [PATCH 23/74] s390/MAINTAINERS: add gmap.c to kvm maintainers

gmap.c deals mostly with KVM-related memory management, so a lot
of changes to this file will come via the KVM tree. Reflect this
in MAINTAINERS. Please note that there are intricate ties to
arch/s390/mm/pgtable.c. If changes are needed in both files,
this will continue to be submitted via the s390 tree (or a
topic branch if necessary).

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c776906f67a9..24136346fe58 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7170,6 +7170,7 @@ S:	Supported
 F:	Documentation/s390/kvm.txt
 F:	arch/s390/include/asm/kvm*
 F:	arch/s390/kvm/
+F:	arch/s390/mm/gmap.c
 
 KERNEL VIRTUAL MACHINE (KVM) FOR ARM
 M:	Christoffer Dall <christoffer.dall@linaro.org>

From fd15a1f333ff5c55be8d3d94ac4c609afe2fd79f Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 27 Mar 2017 12:28:25 +0200
Subject: [PATCH 24/74] s390: enable ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT

Deferred struct page initialization works on s390. However it makes
only sense for the fake numa case, since the kthreads that initialize
struct pages are started per node. Without fake numa there is just a
single node and therefore no gain.
However there is no reason to not enable this feature. Therefore
select the config option and enable the feature in all config files.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                       | 1 +
 arch/s390/configs/default_defconfig     | 1 +
 arch/s390/configs/gcov_defconfig        | 1 +
 arch/s390/configs/performance_defconfig | 1 +
 4 files changed, 4 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1521edafdd15..fc842c4bd720 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -105,6 +105,7 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index 4b176fe83da4..a5039fa89314 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -73,6 +73,7 @@ CONFIG_ZSWAP=y
 CONFIG_ZBUD=m
 CONFIG_ZSMALLOC=m
 CONFIG_ZSMALLOC_STAT=y
+CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
 CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_PCI_DEBUG=y
diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig
index 0de46cc397f6..83970b5afb2b 100644
--- a/arch/s390/configs/gcov_defconfig
+++ b/arch/s390/configs/gcov_defconfig
@@ -72,6 +72,7 @@ CONFIG_ZSWAP=y
 CONFIG_ZBUD=m
 CONFIG_ZSMALLOC=m
 CONFIG_ZSMALLOC_STAT=y
+CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
 CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_HOTPLUG_PCI=y
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index e167557b434c..fbc6542aaf59 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -70,6 +70,7 @@ CONFIG_ZSWAP=y
 CONFIG_ZBUD=m
 CONFIG_ZSMALLOC=m
 CONFIG_ZSMALLOC_STAT=y
+CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
 CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_HOTPLUG_PCI=y

From b8402b957d6792f80282fa76e9932ff77a6f0554 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 27 Mar 2017 15:36:20 +0200
Subject: [PATCH 25/74] s390: make MAX_PHYSMEM_BITS configurable

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                 | 10 ++++++++++
 arch/s390/include/asm/sparsemem.h |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index fc842c4bd720..afac5d89ad1f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -537,6 +537,16 @@ config FORCE_MAX_ZONEORDER
 
 source "mm/Kconfig"
 
+config MAX_PHYSMEM_BITS
+	int "Maximum size of supported physical memory in bits (42-53)"
+	range 42 53
+	default "46"
+	help
+	  This option specifies the maximum supported size of physical memory
+	  in bits. Supported is any size between 2^42 (4TB) and 2^53 (8PB).
+	  Increasing the number of bits also increases the kernel image size.
+	  By default 46 bits (64TB) are supported.
+
 config PACK_STACK
 	def_bool y
 	prompt "Pack kernel stack"
diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h
index 487428b6d099..334e279f1bce 100644
--- a/arch/s390/include/asm/sparsemem.h
+++ b/arch/s390/include/asm/sparsemem.h
@@ -2,6 +2,6 @@
 #define _ASM_S390_SPARSEMEM_H
 
 #define SECTION_SIZE_BITS	28
-#define MAX_PHYSMEM_BITS	46
+#define MAX_PHYSMEM_BITS	CONFIG_MAX_PHYSMEM_BITS
 
 #endif /* _ASM_S390_SPARSEMEM_H */

From c30abecb39d1493943706193eb086de986738542 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Mar 2017 09:59:21 +0200
Subject: [PATCH 26/74] s390/uapi: use generic headers if possible

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/uapi/asm/Kbuild     | 18 ++++++++++--------
 arch/s390/include/uapi/asm/errno.h    | 11 -----------
 arch/s390/include/uapi/asm/fcntl.h    |  1 -
 arch/s390/include/uapi/asm/ioctl.h    |  1 -
 arch/s390/include/uapi/asm/mman.h     |  6 ------
 arch/s390/include/uapi/asm/param.h    |  6 ------
 arch/s390/include/uapi/asm/poll.h     |  1 -
 arch/s390/include/uapi/asm/resource.h | 13 -------------
 arch/s390/include/uapi/asm/sockios.h  |  6 ------
 arch/s390/include/uapi/asm/termbits.h |  6 ------
 10 files changed, 10 insertions(+), 59 deletions(-)
 delete mode 100644 arch/s390/include/uapi/asm/errno.h
 delete mode 100644 arch/s390/include/uapi/asm/fcntl.h
 delete mode 100644 arch/s390/include/uapi/asm/ioctl.h
 delete mode 100644 arch/s390/include/uapi/asm/mman.h
 delete mode 100644 arch/s390/include/uapi/asm/param.h
 delete mode 100644 arch/s390/include/uapi/asm/poll.h
 delete mode 100644 arch/s390/include/uapi/asm/resource.h
 delete mode 100644 arch/s390/include/uapi/asm/sockios.h
 delete mode 100644 arch/s390/include/uapi/asm/termbits.h

diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index 86b761e583e3..addb09cee0f5 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -1,6 +1,16 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generic-y += errno.h
+generic-y += fcntl.h
+generic-y += ioctl.h
+generic-y += mman.h
+generic-y += param.h
+generic-y += poll.h
+generic-y += resource.h
+generic-y += sockios.h
+generic-y += termbits.h
+
 header-y += auxvec.h
 header-y += bitsperlong.h
 header-y += byteorder.h
@@ -11,26 +21,20 @@ header-y += cmb.h
 header-y += dasd.h
 header-y += debug.h
 header-y += errno.h
-header-y += fcntl.h
 header-y += guarded_storage.h
 header-y += hypfs.h
-header-y += ioctl.h
 header-y += ioctls.h
 header-y += ipcbuf.h
 header-y += kvm.h
 header-y += kvm_para.h
 header-y += kvm_perf.h
 header-y += kvm_virtio.h
-header-y += mman.h
 header-y += monwriter.h
 header-y += msgbuf.h
-header-y += param.h
 header-y += pkey.h
-header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
 header-y += qeth.h
-header-y += resource.h
 header-y += schid.h
 header-y += sclp_ctl.h
 header-y += sembuf.h
@@ -41,12 +45,10 @@ header-y += sigcontext.h
 header-y += siginfo.h
 header-y += signal.h
 header-y += socket.h
-header-y += sockios.h
 header-y += stat.h
 header-y += statfs.h
 header-y += swab.h
 header-y += tape390.h
-header-y += termbits.h
 header-y += termios.h
 header-y += types.h
 header-y += ucontext.h
diff --git a/arch/s390/include/uapi/asm/errno.h b/arch/s390/include/uapi/asm/errno.h
deleted file mode 100644
index 395e97d8005e..000000000000
--- a/arch/s390/include/uapi/asm/errno.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- *  S390 version
- *
- */
-
-#ifndef _S390_ERRNO_H
-#define _S390_ERRNO_H
-
-#include <asm-generic/errno.h>
-
-#endif
diff --git a/arch/s390/include/uapi/asm/fcntl.h b/arch/s390/include/uapi/asm/fcntl.h
deleted file mode 100644
index 46ab12db5739..000000000000
--- a/arch/s390/include/uapi/asm/fcntl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
diff --git a/arch/s390/include/uapi/asm/ioctl.h b/arch/s390/include/uapi/asm/ioctl.h
deleted file mode 100644
index b279fe06dfe5..000000000000
--- a/arch/s390/include/uapi/asm/ioctl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/s390/include/uapi/asm/mman.h b/arch/s390/include/uapi/asm/mman.h
deleted file mode 100644
index de23da1f41b2..000000000000
--- a/arch/s390/include/uapi/asm/mman.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- *  S390 version
- *
- *  Derived from "include/asm-i386/mman.h"
- */
-#include <asm-generic/mman.h>
diff --git a/arch/s390/include/uapi/asm/param.h b/arch/s390/include/uapi/asm/param.h
deleted file mode 100644
index c616821bf2ac..000000000000
--- a/arch/s390/include/uapi/asm/param.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASMS390_PARAM_H
-#define _ASMS390_PARAM_H
-
-#include <asm-generic/param.h>
-
-#endif /* _ASMS390_PARAM_H */
diff --git a/arch/s390/include/uapi/asm/poll.h b/arch/s390/include/uapi/asm/poll.h
deleted file mode 100644
index c98509d3149e..000000000000
--- a/arch/s390/include/uapi/asm/poll.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
diff --git a/arch/s390/include/uapi/asm/resource.h b/arch/s390/include/uapi/asm/resource.h
deleted file mode 100644
index ec23d1c73c92..000000000000
--- a/arch/s390/include/uapi/asm/resource.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- *  S390 version
- *
- *  Derived from "include/asm-i386/resources.h"
- */
-
-#ifndef _S390_RESOURCE_H
-#define _S390_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif
-
diff --git a/arch/s390/include/uapi/asm/sockios.h b/arch/s390/include/uapi/asm/sockios.h
deleted file mode 100644
index 6f60eee73242..000000000000
--- a/arch/s390/include/uapi/asm/sockios.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_S390_SOCKIOS_H
-#define _ASM_S390_SOCKIOS_H
-
-#include <asm-generic/sockios.h>
-
-#endif
diff --git a/arch/s390/include/uapi/asm/termbits.h b/arch/s390/include/uapi/asm/termbits.h
deleted file mode 100644
index 71bf6ac6a2b9..000000000000
--- a/arch/s390/include/uapi/asm/termbits.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_S390_TERMBITS_H
-#define _ASM_S390_TERMBITS_H
-
-#include <asm-generic/termbits.h>
-
-#endif

From 4d6e51c740170595868eded99eb448ea79db803a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Mar 2017 10:30:48 +0200
Subject: [PATCH 27/74] s390: use generic headers if possible

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/Kbuild              | 6 ++++++
 arch/s390/include/asm/div64.h             | 1 -
 arch/s390/include/asm/emergency-restart.h | 6 ------
 arch/s390/include/asm/irq_regs.h          | 1 -
 arch/s390/include/asm/kmap_types.h        | 6 ------
 arch/s390/include/asm/local.h             | 1 -
 arch/s390/include/asm/local64.h           | 1 -
 7 files changed, 6 insertions(+), 16 deletions(-)
 delete mode 100644 arch/s390/include/asm/div64.h
 delete mode 100644 arch/s390/include/asm/emergency-restart.h
 delete mode 100644 arch/s390/include/asm/irq_regs.h
 delete mode 100644 arch/s390/include/asm/kmap_types.h
 delete mode 100644 arch/s390/include/asm/local.h
 delete mode 100644 arch/s390/include/asm/local64.h

diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 8aea32fe8bd2..7e3481eb2174 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -1,8 +1,14 @@
 generic-y += asm-offsets.h
 generic-y += clkdev.h
 generic-y += dma-contiguous.h
+generic-y += div64.h
+generic-y += emergency-restart.h
 generic-y += export.h
+generic-y += irq_regs.h
 generic-y += irq_work.h
+generic-y += kmap_types.h
+generic-y += local.h
+generic-y += local64.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
diff --git a/arch/s390/include/asm/div64.h b/arch/s390/include/asm/div64.h
deleted file mode 100644
index 6cd978cefb28..000000000000
--- a/arch/s390/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/s390/include/asm/emergency-restart.h b/arch/s390/include/asm/emergency-restart.h
deleted file mode 100644
index 108d8c48e42e..000000000000
--- a/arch/s390/include/asm/emergency-restart.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_EMERGENCY_RESTART_H
-#define _ASM_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/arch/s390/include/asm/irq_regs.h b/arch/s390/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/s390/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/s390/include/asm/kmap_types.h b/arch/s390/include/asm/kmap_types.h
deleted file mode 100644
index 0a88622339ee..000000000000
--- a/arch/s390/include/asm/kmap_types.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_KMAP_TYPES_H
-#define _ASM_KMAP_TYPES_H
-
-#include <asm-generic/kmap_types.h>
-
-#endif
diff --git a/arch/s390/include/asm/local.h b/arch/s390/include/asm/local.h
deleted file mode 100644
index c11c530f74d0..000000000000
--- a/arch/s390/include/asm/local.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/s390/include/asm/local64.h b/arch/s390/include/asm/local64.h
deleted file mode 100644
index 36c93b5cc239..000000000000
--- a/arch/s390/include/asm/local64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local64.h>

From 20ba46da369e239aa454cc57d0c7e33d51a8de51 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 13 Feb 2017 12:30:09 +0100
Subject: [PATCH 28/74] s390/cpum_cf: update counter numbers to ecctr limits

Use the highest counter number that can be specified for the
ecctr (extract CPU counter) instruction for perf.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/perf_event.h | 4 ++--
 arch/s390/kernel/perf_cpum_cf.c    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index c64c0befd3f3..dd32beb9d30c 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -1,7 +1,7 @@
 /*
  * Performance event support - s390 specific definitions.
  *
- * Copyright IBM Corp. 2009, 2013
+ * Copyright IBM Corp. 2009, 2017
  * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  *	      Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
  */
@@ -47,7 +47,7 @@ struct perf_sf_sde_regs {
 };
 
 /* Perf PMU definitions for the counter facility */
-#define PERF_CPUM_CF_MAX_CTR		256
+#define PERF_CPUM_CF_MAX_CTR		0xffffUL  /* Max ctr for ECCTR */
 
 /* Perf PMU definitions for the sampling facility */
 #define PERF_CPUM_SF_MAX_CTR		2
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1aba10e90906..1c3f93812817 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -370,7 +370,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (ev == -1)
 		return -ENOENT;
 
-	if (ev >= PERF_CPUM_CF_MAX_CTR)
+	if (ev > PERF_CPUM_CF_MAX_CTR)
 		return -EINVAL;
 
 	/* Use the hardware perf event structure to store the counter number

From db17160dce4ffe4d5bd70b58ee94f398ef9cabb1 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 13 Feb 2017 12:32:17 +0100
Subject: [PATCH 29/74] s390/cpum_cf: cleanup event/counter validation

The validate_event() function just checked for reserved counters
in particular CPU-MF counter sets.  Because the number of counters
in counter sets vary among different hardware models, remove the
explicit check to tolerate new models.

Reserved counters are not accounted and, thus, will return zero.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf.c | 31 +------------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1c3f93812817..6fe1428e9dfc 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -1,7 +1,7 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Facility
  *
- *  Copyright IBM Corp. 2012
+ *  Copyright IBM Corp. 2012, 2017
  *  Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -102,26 +102,6 @@ static int get_counter_set(u64 event)
 	return set;
 }
 
-static int validate_event(const struct hw_perf_event *hwc)
-{
-	switch (hwc->config_base) {
-	case CPUMF_CTR_SET_BASIC:
-	case CPUMF_CTR_SET_USER:
-	case CPUMF_CTR_SET_CRYPTO:
-	case CPUMF_CTR_SET_EXT:
-		/* check for reserved counters */
-		if ((hwc->config >=  6 && hwc->config <=  31) ||
-		    (hwc->config >= 38 && hwc->config <=  63) ||
-		    (hwc->config >= 80 && hwc->config <= 127))
-			return -EOPNOTSUPP;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static int validate_ctr_version(const struct hw_perf_event *hwc)
 {
 	struct cpu_hw_events *cpuhw;
@@ -381,15 +361,6 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->config = ev;
 	hwc->config_base = get_counter_set(ev);
 
-	/* Validate the counter that is assigned to this event.
-	 * Because the counter facility can use numerous counters at the
-	 * same time without constraints, it is not necessary to explicitly
-	 * validate event groups (event->group_leader != event).
-	 */
-	err = validate_event(hwc);
-	if (err)
-		return err;
-
 	/* Initialize for using the CPU-measurement counter facility */
 	if (!atomic_inc_not_zero(&num_events)) {
 		mutex_lock(&pmc_reserve_mutex);

From ee699f329a239bc3cc8a8c336b9615166993bffc Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Fri, 3 Jun 2016 16:55:03 +0200
Subject: [PATCH 30/74] s390/cpum_cf: add support for the MT-diagnostic counter
 set (z13)

Complete the IBM z13 support and support counters from the
MT-diagnostic counter set.  Note that this counter set is
available only if SMT is enabled.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/cpu_mf.h  |  4 +-
 arch/s390/kernel/perf_cpum_cf.c | 89 ++++++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index d1e0707310fd..facdc2496c3a 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -20,9 +20,11 @@
 #define CPU_MF_INT_SF_PRA	(1 << 29)	/* program request alert */
 #define CPU_MF_INT_SF_SACA	(1 << 23)	/* sampler auth. change alert */
 #define CPU_MF_INT_SF_LSDA	(1 << 22)	/* loss of sample data alert */
+#define CPU_MF_INT_CF_MTDA	(1 << 15)	/* loss of MT ctr. data alert */
 #define CPU_MF_INT_CF_CACA	(1 <<  7)	/* counter auth. change alert */
 #define CPU_MF_INT_CF_LCDA	(1 <<  6)	/* loss of counter data alert */
-#define CPU_MF_INT_CF_MASK	(CPU_MF_INT_CF_CACA|CPU_MF_INT_CF_LCDA)
+#define CPU_MF_INT_CF_MASK	(CPU_MF_INT_CF_MTDA|CPU_MF_INT_CF_CACA| \
+				 CPU_MF_INT_CF_LCDA)
 #define CPU_MF_INT_SF_MASK	(CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE|	\
 				 CPU_MF_INT_SF_PRA|CPU_MF_INT_SF_SACA|	\
 				 CPU_MF_INT_SF_LSDA)
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 6fe1428e9dfc..52a9ae0272c9 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -22,19 +22,12 @@
 #include <asm/irq.h>
 #include <asm/cpu_mf.h>
 
-/* CPU-measurement counter facility supports these CPU counter sets:
- * For CPU counter sets:
- *    Basic counter set:	     0-31
- *    Problem-state counter set:    32-63
- *    Crypto-activity counter set:  64-127
- *    Extented counter set:	   128-159
- */
 enum cpumf_ctr_set {
-	/* CPU counter sets */
-	CPUMF_CTR_SET_BASIC   = 0,
-	CPUMF_CTR_SET_USER    = 1,
-	CPUMF_CTR_SET_CRYPTO  = 2,
-	CPUMF_CTR_SET_EXT     = 3,
+	CPUMF_CTR_SET_BASIC   = 0,    /* Basic Counter Set */
+	CPUMF_CTR_SET_USER    = 1,    /* Problem-State Counter Set */
+	CPUMF_CTR_SET_CRYPTO  = 2,    /* Crypto-Activity Counter Set */
+	CPUMF_CTR_SET_EXT     = 3,    /* Extended Counter Set */
+	CPUMF_CTR_SET_MT_DIAG = 4,    /* MT-diagnostic Counter Set */
 
 	/* Maximum number of counter sets */
 	CPUMF_CTR_SET_MAX,
@@ -47,6 +40,7 @@ static const u64 cpumf_state_ctl[CPUMF_CTR_SET_MAX] = {
 	[CPUMF_CTR_SET_USER]	= 0x04,
 	[CPUMF_CTR_SET_CRYPTO]	= 0x08,
 	[CPUMF_CTR_SET_EXT]	= 0x01,
+	[CPUMF_CTR_SET_MT_DIAG] = 0x20,
 };
 
 static void ctr_set_enable(u64 *state, int ctr_set)
@@ -76,19 +70,20 @@ struct cpu_hw_events {
 };
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.ctr_set = {
-		[CPUMF_CTR_SET_BASIC]  = ATOMIC_INIT(0),
-		[CPUMF_CTR_SET_USER]   = ATOMIC_INIT(0),
-		[CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0),
-		[CPUMF_CTR_SET_EXT]    = ATOMIC_INIT(0),
+		[CPUMF_CTR_SET_BASIC]	= ATOMIC_INIT(0),
+		[CPUMF_CTR_SET_USER]	= ATOMIC_INIT(0),
+		[CPUMF_CTR_SET_CRYPTO]	= ATOMIC_INIT(0),
+		[CPUMF_CTR_SET_EXT]	= ATOMIC_INIT(0),
+		[CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0),
 	},
 	.state = 0,
 	.flags = 0,
 	.txn_flags = 0,
 };
 
-static int get_counter_set(u64 event)
+static enum cpumf_ctr_set get_counter_set(u64 event)
 {
-	int set = -1;
+	int set = CPUMF_CTR_SET_MAX;
 
 	if (event < 32)
 		set = CPUMF_CTR_SET_BASIC;
@@ -98,6 +93,8 @@ static int get_counter_set(u64 event)
 		set = CPUMF_CTR_SET_CRYPTO;
 	else if (event < 256)
 		set = CPUMF_CTR_SET_EXT;
+	else if (event >= 448 && event < 496)
+		set = CPUMF_CTR_SET_MT_DIAG;
 
 	return set;
 }
@@ -106,6 +103,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 {
 	struct cpu_hw_events *cpuhw;
 	int err = 0;
+	u16 mtdiag_ctl;
 
 	cpuhw = &get_cpu_var(cpu_hw_events);
 
@@ -125,6 +123,27 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 		    (cpuhw->info.csvn  > 2 && hwc->config > 255))
 			err = -EOPNOTSUPP;
 		break;
+	case CPUMF_CTR_SET_MT_DIAG:
+		if (cpuhw->info.csvn <= 3)
+			err = -EOPNOTSUPP;
+		/*
+		 * MT-diagnostic counters are read-only.  The counter set
+		 * is automatically enabled and activated on all CPUs with
+		 * multithreading (SMT).  Deactivation of multithreading
+		 * also disables the counter set.  State changes are ignored
+		 * by lcctl().	Because Linux controls SMT enablement through
+		 * a kernel parameter only, the counter set is either disabled
+		 * or enabled and active.
+		 *
+		 * Thus, the counters can only be used if SMT is on and the
+		 * counter set is enabled and active.
+		 */
+		mtdiag_ctl = cpumf_state_ctl[CPUMF_CTR_SET_MT_DIAG];
+		if (!((cpuhw->info.auth_ctl & mtdiag_ctl) &&
+		      (cpuhw->info.enable_ctl & mtdiag_ctl) &&
+		      (cpuhw->info.act_ctl & mtdiag_ctl)))
+			err = -EOPNOTSUPP;
+		break;
 	}
 
 	put_cpu_var(cpu_hw_events);
@@ -230,6 +249,11 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
 	/* loss of counter data alert */
 	if (alert & CPU_MF_INT_CF_LCDA)
 		pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());
+
+	/* loss of MT counter data alert */
+	if (alert & CPU_MF_INT_CF_MTDA)
+		pr_warn("CPU[%i] MT counter data was lost\n",
+			smp_processor_id());
 }
 
 #define PMC_INIT      0
@@ -310,6 +334,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
 	struct hw_perf_event *hwc = &event->hw;
+	enum cpumf_ctr_set set;
 	int err;
 	u64 ev;
 
@@ -353,13 +378,27 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (ev > PERF_CPUM_CF_MAX_CTR)
 		return -EINVAL;
 
-	/* Use the hardware perf event structure to store the counter number
-	 * in 'config' member and the counter set to which the counter belongs
-	 * in the 'config_base'.  The counter set (config_base) is then used
-	 * to enable/disable the counters.
-	 */
-	hwc->config = ev;
-	hwc->config_base = get_counter_set(ev);
+	/* Obtain the counter set to which the specified counter belongs */
+	set = get_counter_set(ev);
+	switch (set) {
+	case CPUMF_CTR_SET_BASIC:
+	case CPUMF_CTR_SET_USER:
+	case CPUMF_CTR_SET_CRYPTO:
+	case CPUMF_CTR_SET_EXT:
+	case CPUMF_CTR_SET_MT_DIAG:
+		/*
+		 * Use the hardware perf event structure to store the
+		 * counter number in the 'config' member and the counter
+		 * set number in the 'config_base'.  The counter set number
+		 * is then later used to enable/disable the counter(s).
+		 */
+		hwc->config = ev;
+		hwc->config_base = set;
+		break;
+	case CPUMF_CTR_SET_MAX:
+		/* The counter could not be associated to a counter set */
+		return -EINVAL;
+	};
 
 	/* Initialize for using the CPU-measurement counter facility */
 	if (!atomic_inc_not_zero(&num_events)) {

From 3fc7acebaecf940697ea9a5a927cf10766d4b00e Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Fri, 17 Feb 2017 12:16:00 +0100
Subject: [PATCH 31/74] s390/cpum_cf: add IBM z13 counter event names

Add the event names for the IBM z13/z13s specific CPU-MF counters.

Also improve the merging of the generic and model specific events
so that their sysfs attribute definitions completely reside in
memory.  Hence, flagging the generic event attribute definitions
as initdata too.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf_events.c | 130 +++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 7 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index c343ac2cf6c5..33d90c29bb3d 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -114,8 +114,64 @@ CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV, 0x00a1);
 CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TABORT, 0x00b1);
 CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_NO_SPECIAL, 0x00b2);
 CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_SPECIAL, 0x00b3);
+CPUMF_EVENT_ATTR(cf_z13, L1D_WRITES_RO_EXCL, 0x0080);
+CPUMF_EVENT_ATTR(cf_z13, DTLB1_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z13, DTLB1_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z13, DTLB1_HPAGE_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z13, DTLB1_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z13, L1D_L2D_SOURCED_WRITES, 0x0085);
+CPUMF_EVENT_ATTR(cf_z13, ITLB1_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z13, ITLB1_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z13, L1I_L2I_SOURCED_WRITES, 0x0088);
+CPUMF_EVENT_ATTR(cf_z13, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_WRITES, 0x008b);
+CPUMF_EVENT_ATTR(cf_z13, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z13, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z13, L1C_TLB1_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0091);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES, 0x0092);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV, 0x0093);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES, 0x0094);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x0095);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV, 0x0096);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES, 0x0097);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x0098);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x0099);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x009a);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x009b);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x009c);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x009d);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES, 0x009e);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES, 0x009f);
+CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES, 0x00a0);
+CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES, 0x00a1);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x00af);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES, 0x00b0);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES, 0x00b1);
+CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES, 0x00b2);
+CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES, 0x00b3);
+CPUMF_EVENT_ATTR(cf_z13, TX_NC_TABORT, 0x00da);
+CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_NO_SPECIAL, 0x00db);
+CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_SPECIAL, 0x00dc);
+CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
 
-static struct attribute *cpumcf_pmu_event_attr[] = {
+static struct attribute *cpumcf_pmu_event_attr[] __initdata = {
 	CPUMF_EVENT_PTR(cf, CPU_CYCLES),
 	CPUMF_EVENT_PTR(cf, INSTRUCTIONS),
 	CPUMF_EVENT_PTR(cf, L1I_DIR_WRITES),
@@ -236,11 +292,70 @@ static struct attribute *cpumcf_zec12_pmu_event_attr[] __initdata = {
 	NULL,
 };
 
+static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = {
+	CPUMF_EVENT_PTR(cf_z13, L1D_WRITES_RO_EXCL),
+	CPUMF_EVENT_PTR(cf_z13, DTLB1_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, DTLB1_MISSES),
+	CPUMF_EVENT_PTR(cf_z13, DTLB1_HPAGE_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, DTLB1_GPAGE_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_L2D_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, ITLB1_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, ITLB1_MISSES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_L2I_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, TLB2_PTE_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, TX_C_TEND),
+	CPUMF_EVENT_PTR(cf_z13, TX_NC_TEND),
+	CPUMF_EVENT_PTR(cf_z13, L1C_TLB1_MISSES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES),
+	CPUMF_EVENT_PTR(cf_z13, TX_NC_TABORT),
+	CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_NO_SPECIAL),
+	CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_SPECIAL),
+	CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+	CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+	NULL,
+};
+
 /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */
 
 static struct attribute_group cpumsf_pmu_events_group = {
 	.name = "events",
-	.attrs = cpumcf_pmu_event_attr,
 };
 
 PMU_FORMAT_ATTR(event, "config:0-63");
@@ -290,6 +405,7 @@ static __init struct attribute **merge_attr(struct attribute **a,
 __init const struct attribute_group **cpumf_cf_event_group(void)
 {
 	struct attribute **combined, **model;
+	struct attribute *none[] = { NULL };
 	struct cpuid cpu_id;
 
 	get_cpu_id(&cpu_id);
@@ -306,17 +422,17 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
 	case 0x2828:
 		model = cpumcf_zec12_pmu_event_attr;
 		break;
+	case 0x2964:
+	case 0x2965:
+		model = cpumcf_z13_pmu_event_attr;
+		break;
 	default:
-		model = NULL;
+		model = none;
 		break;
 	}
 
-	if (!model)
-		goto out;
-
 	combined = merge_attr(cpumcf_pmu_event_attr, model);
 	if (combined)
 		cpumsf_pmu_events_group.attrs = combined;
-out:
 	return cpumsf_pmu_attr_groups;
 }

From 66a49784f909cb3da54b56591979971ebb7e7cac Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Fri, 17 Feb 2017 12:53:55 +0100
Subject: [PATCH 32/74] s390/cpum_cf: correct variable naming (cleanup)

Make clear that the event definitions relate to the counter
facility (cf) and not to the sampling facility (sf).

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf_events.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 33d90c29bb3d..d3133285b7d1 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -354,25 +354,25 @@ static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = {
 
 /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */
 
-static struct attribute_group cpumsf_pmu_events_group = {
+static struct attribute_group cpumcf_pmu_events_group = {
 	.name = "events",
 };
 
 PMU_FORMAT_ATTR(event, "config:0-63");
 
-static struct attribute *cpumsf_pmu_format_attr[] = {
+static struct attribute *cpumcf_pmu_format_attr[] = {
 	&format_attr_event.attr,
 	NULL,
 };
 
-static struct attribute_group cpumsf_pmu_format_group = {
+static struct attribute_group cpumcf_pmu_format_group = {
 	.name = "format",
-	.attrs = cpumsf_pmu_format_attr,
+	.attrs = cpumcf_pmu_format_attr,
 };
 
-static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
-	&cpumsf_pmu_events_group,
-	&cpumsf_pmu_format_group,
+static const struct attribute_group *cpumcf_pmu_attr_groups[] = {
+	&cpumcf_pmu_events_group,
+	&cpumcf_pmu_format_group,
 	NULL,
 };
 
@@ -433,6 +433,6 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
 
 	combined = merge_attr(cpumcf_pmu_event_attr, model);
 	if (combined)
-		cpumsf_pmu_events_group.attrs = combined;
-	return cpumsf_pmu_attr_groups;
+		cpumcf_pmu_events_group.attrs = combined;
+	return cpumcf_pmu_attr_groups;
 }

From 26f268ac682854345531325c7c0d803b8c573228 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 20 Feb 2017 16:07:43 +0100
Subject: [PATCH 33/74] s390/cpu_mf: remove register variable in __ecctr()

Using a register variable for r4 is not necessary.  Let the
compiler decide the register to be used.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/cpu_mf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index facdc2496c3a..05480e4cc5ca 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -174,7 +174,7 @@ static inline int lcctl(u64 ctl)
 /* Extract CPU counter */
 static inline int __ecctr(u64 ctr, u64 *content)
 {
-	register u64 _content asm("4") = 0;
+	u64 _content;
 	int cc;
 
 	asm volatile (

From 485527ba578254bb1171b13c55394257fd63cd59 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Mon, 20 Feb 2017 16:10:38 +0100
Subject: [PATCH 34/74] s390/cpum_cf: make hw_perf_event_update() a void
 function

The return code of hw_perf_event_update() is not evaluated by
its callers.  Hence, simplify the function by removing the
return code.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 52a9ae0272c9..746d03423333 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -462,7 +462,7 @@ static int hw_perf_event_reset(struct perf_event *event)
 	return err;
 }
 
-static int hw_perf_event_update(struct perf_event *event)
+static void hw_perf_event_update(struct perf_event *event)
 {
 	u64 prev, new, delta;
 	int err;
@@ -471,14 +471,12 @@ static int hw_perf_event_update(struct perf_event *event)
 		prev = local64_read(&event->hw.prev_count);
 		err = ecctr(event->hw.config, &new);
 		if (err)
-			goto out;
+			return;
 	} while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
 
 	delta = (prev <= new) ? new - prev
 			      : (-1ULL - prev) + new + 1;	 /* overflow */
 	local64_add(delta, &event->count);
-out:
-	return err;
 }
 
 static void cpumf_pmu_read(struct perf_event *event)

From 5434da4ddff396aadf5e1098065069ea58134e26 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:28 +0100
Subject: [PATCH 35/74] s390: cio: introduce cio_cancel_halt_clear

For future code reuse purpose, this decouples the cio code with
the ccw device specific parts from ccw_device_cancel_halt_clear,
and makes a new common I/O interface named cio_cancel_halt_clear.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Acked-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-2-bjsdjshi@linux.vnet.ibm.com>
[CH: Fix typo]
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/cio.c        | 59 +++++++++++++++++++++++++++++++++++
 drivers/s390/cio/cio.h        |  1 +
 drivers/s390/cio/device_fsm.c | 54 +++++---------------------------
 3 files changed, 68 insertions(+), 46 deletions(-)

diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 1b350665c823..c7ee3e44f162 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -309,6 +309,65 @@ cio_cancel (struct subchannel *sch)
 	}
 }
 
+/**
+ * cio_cancel_halt_clear - Cancel running I/O by performing cancel, halt
+ * and clear ordinally if subchannel is valid.
+ * @sch: subchannel on which to perform the cancel_halt_clear operation
+ * @iretry: the number of the times remained to retry the next operation
+ *
+ * This should be called repeatedly since halt/clear are asynchronous
+ * operations. We do one try with cio_cancel, three tries with cio_halt,
+ * 255 tries with cio_clear. The caller should initialize @iretry with
+ * the value 255 for its first call to this, and keep using the same
+ * @iretry in the subsequent calls until it gets a non -EBUSY return.
+ *
+ * Returns 0 if device now idle, -ENODEV for device not operational,
+ * -EBUSY if an interrupt is expected (either from halt/clear or from a
+ * status pending), and -EIO if out of retries.
+ */
+int cio_cancel_halt_clear(struct subchannel *sch, int *iretry)
+{
+	int ret;
+
+	if (cio_update_schib(sch))
+		return -ENODEV;
+	if (!sch->schib.pmcw.ena)
+		/* Not operational -> done. */
+		return 0;
+	/* Stage 1: cancel io. */
+	if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) &&
+	    !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
+		if (!scsw_is_tm(&sch->schib.scsw)) {
+			ret = cio_cancel(sch);
+			if (ret != -EINVAL)
+				return ret;
+		}
+		/*
+		 * Cancel io unsuccessful or not applicable (transport mode).
+		 * Continue with asynchronous instructions.
+		 */
+		*iretry = 3;	/* 3 halt retries. */
+	}
+	/* Stage 2: halt io. */
+	if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
+		if (*iretry) {
+			*iretry -= 1;
+			ret = cio_halt(sch);
+			if (ret != -EBUSY)
+				return (ret == 0) ? -EBUSY : ret;
+		}
+		/* Halt io unsuccessful. */
+		*iretry = 255;	/* 255 clear retries. */
+	}
+	/* Stage 3: clear io. */
+	if (*iretry) {
+		*iretry -= 1;
+		ret = cio_clear(sch);
+		return (ret == 0) ? -EBUSY : ret;
+	}
+	/* Function was unsuccessful */
+	return -EIO;
+}
 
 static void cio_apply_config(struct subchannel *sch, struct schib *schib)
 {
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index f0e57aefb5f2..939596d81b73 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -123,6 +123,7 @@ extern int cio_enable_subchannel(struct subchannel *, u32);
 extern int cio_disable_subchannel (struct subchannel *);
 extern int cio_cancel (struct subchannel *);
 extern int cio_clear (struct subchannel *);
+extern int cio_cancel_halt_clear(struct subchannel *, int *);
 extern int cio_resume (struct subchannel *);
 extern int cio_halt (struct subchannel *);
 extern int cio_start (struct subchannel *, struct ccw1 *, __u8);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 9afb5ce13007..12016e32e519 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -124,14 +124,6 @@ ccw_device_set_timeout(struct ccw_device *cdev, int expires)
 	add_timer(&cdev->private->timer);
 }
 
-/*
- * Cancel running i/o. This is called repeatedly since halt/clear are
- * asynchronous operations. We do one try with cio_cancel, two tries
- * with cio_halt, 255 tries with cio_clear. If everythings fails panic.
- * Returns 0 if device now idle, -ENODEV for device not operational and
- * -EBUSY if an interrupt is expected (either from halt/clear or from a
- * status pending).
- */
 int
 ccw_device_cancel_halt_clear(struct ccw_device *cdev)
 {
@@ -139,44 +131,14 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev)
 	int ret;
 
 	sch = to_subchannel(cdev->dev.parent);
-	if (cio_update_schib(sch))
-		return -ENODEV; 
-	if (!sch->schib.pmcw.ena)
-		/* Not operational -> done. */
-		return 0;
-	/* Stage 1: cancel io. */
-	if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) &&
-	    !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
-		if (!scsw_is_tm(&sch->schib.scsw)) {
-			ret = cio_cancel(sch);
-			if (ret != -EINVAL)
-				return ret;
-		}
-		/* cancel io unsuccessful or not applicable (transport mode).
-		 * Continue with asynchronous instructions. */
-		cdev->private->iretry = 3;	/* 3 halt retries. */
-	}
-	if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
-		/* Stage 2: halt io. */
-		if (cdev->private->iretry) {
-			cdev->private->iretry--;
-			ret = cio_halt(sch);
-			if (ret != -EBUSY)
-				return (ret == 0) ? -EBUSY : ret;
-		}
-		/* halt io unsuccessful. */
-		cdev->private->iretry = 255;	/* 255 clear retries. */
-	}
-	/* Stage 3: clear io. */
-	if (cdev->private->iretry) {
-		cdev->private->iretry--;
-		ret = cio_clear (sch);
-		return (ret == 0) ? -EBUSY : ret;
-	}
-	/* Function was unsuccessful */
-	CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n",
-		      cdev->private->dev_id.ssid, cdev->private->dev_id.devno);
-	return -EIO;
+	ret = cio_cancel_halt_clear(sch, &cdev->private->iretry);
+
+	if (ret == -EIO)
+		CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n",
+			      cdev->private->dev_id.ssid,
+			      cdev->private->dev_id.devno);
+
+	return ret;
 }
 
 void ccw_device_update_sense_data(struct ccw_device *cdev)

From a22217e27c699de61cc6859494e1af097291afb5 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:29 +0100
Subject: [PATCH 36/74] s390: cio: export more interfaces

Export the common I/O interfaces those are needed by an I/O
subchannel driver to actually talk to the subchannel.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Acked-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-3-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/cio.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index c7ee3e44f162..89216174fcbb 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -170,12 +170,14 @@ cio_start_key (struct subchannel *sch,	/* subchannel structure */
 		return ccode;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_start_key);
 
 int
 cio_start (struct subchannel *sch, struct ccw1 *cpa, __u8 lpm)
 {
 	return cio_start_key(sch, cpa, lpm, PAGE_DEFAULT_KEY);
 }
+EXPORT_SYMBOL_GPL(cio_start);
 
 /*
  * resume suspended I/O operation
@@ -208,6 +210,7 @@ cio_resume (struct subchannel *sch)
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_resume);
 
 /*
  * halt I/O operation
@@ -241,6 +244,7 @@ cio_halt(struct subchannel *sch)
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_halt);
 
 /*
  * Clear I/O operation
@@ -271,6 +275,7 @@ cio_clear(struct subchannel *sch)
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_clear);
 
 /*
  * Function: cio_cancel
@@ -308,6 +313,7 @@ cio_cancel (struct subchannel *sch)
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_cancel);
 
 /**
  * cio_cancel_halt_clear - Cancel running I/O by performing cancel, halt
@@ -368,6 +374,7 @@ int cio_cancel_halt_clear(struct subchannel *sch, int *iretry)
 	/* Function was unsuccessful */
 	return -EIO;
 }
+EXPORT_SYMBOL_GPL(cio_cancel_halt_clear);
 
 static void cio_apply_config(struct subchannel *sch, struct schib *schib)
 {
@@ -441,6 +448,7 @@ int cio_commit_config(struct subchannel *sch)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(cio_commit_config);
 
 /**
  * cio_update_schib - Perform stsch and update schib if subchannel is valid.
@@ -1046,6 +1054,7 @@ int cio_tm_start_key(struct subchannel *sch, struct tcw *tcw, u8 lpm, u8 key)
 		return cio_start_handle_notoper(sch, lpm);
 	}
 }
+EXPORT_SYMBOL_GPL(cio_tm_start_key);
 
 /**
  * cio_tm_intrg - perform interrogate function
@@ -1071,3 +1080,4 @@ int cio_tm_intrg(struct subchannel *sch)
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(cio_tm_intrg);

From aec390b937dcaf11c01483b8c348a2b004119aa7 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:30 +0100
Subject: [PATCH 37/74] vfio: ccw: define device_api strings

Define vfio-ccw device API strings. CCW vendor driver using mediated
device framework should use this string for device_api attribute.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20170317031743.40128-4-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 include/uapi/linux/vfio.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 519eff362c1c..61837890c132 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -212,6 +212,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_API_PCI_STRING		"vfio-pci"
 #define VFIO_DEVICE_API_PLATFORM_STRING		"vfio-platform"
 #define VFIO_DEVICE_API_AMBA_STRING		"vfio-amba"
+#define VFIO_DEVICE_API_CCW_STRING		"vfio-ccw"
 
 /**
  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,

From 63f1934d562de7cf3124de0ef617dcd3be98b279 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:31 +0100
Subject: [PATCH 38/74] vfio: ccw: basic implementation for vfio_ccw driver

To make vfio support subchannel devices, we need a css driver for
the vfio subchannels. This patch adds a basic vfio-ccw subchannel
driver for this purpose.

To enable VFIO for vfio-ccw, enable S390_CCW_IOMMU config option
and configure VFIO as required.

Acked-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-5-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 arch/s390/Kconfig                   |  10 ++
 arch/s390/include/asm/isc.h         |   1 +
 drivers/iommu/Kconfig               |   8 +
 drivers/s390/cio/Makefile           |   3 +
 drivers/s390/cio/vfio_ccw_drv.c     | 262 ++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_private.h |  25 +++
 6 files changed, 309 insertions(+)
 create mode 100644 drivers/s390/cio/vfio_ccw_drv.c
 create mode 100644 drivers/s390/cio/vfio_ccw_private.h

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a2dcef0aacc7..b53f0359f2b6 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -672,6 +672,16 @@ config EADM_SCH
 	  To compile this driver as a module, choose M here: the
 	  module will be called eadm_sch.
 
+config VFIO_CCW
+	def_tristate n
+	prompt "Support for VFIO-CCW subchannels"
+	depends on S390_CCW_IOMMU && VFIO
+	help
+	  This driver allows usage of I/O subchannels via VFIO-CCW.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called vfio_ccw.
+
 endmenu
 
 menu "Dump support"
diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h
index 68d7d68300f2..8a0b721a9b8d 100644
--- a/arch/s390/include/asm/isc.h
+++ b/arch/s390/include/asm/isc.h
@@ -16,6 +16,7 @@
 #define CONSOLE_ISC 1			/* console I/O subchannel */
 #define EADM_SCH_ISC 4			/* EADM subchannels */
 #define CHSC_SCH_ISC 7			/* CHSC subchannels */
+#define VFIO_CCW_ISC IO_SCH_ISC		/* VFIO-CCW I/O subchannels */
 /* Adapter interrupts. */
 #define QDIO_AIRQ_ISC IO_SCH_ISC	/* I/O subchannel in qdio mode */
 #define PCI_ISC 2			/* PCI I/O subchannels */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 37e204f3d9be..6ee3a25ae731 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -327,6 +327,14 @@ config S390_IOMMU
 	help
 	  Support for the IOMMU API for s390 PCI devices.
 
+config S390_CCW_IOMMU
+	bool "S390 CCW IOMMU Support"
+	depends on S390 && CCW
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops
+	  is not implemented as it is not necessary for VFIO.
+
 config MTK_IOMMU
 	bool "MTK IOMMU Support"
 	depends on ARM || ARM64
diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile
index 3ab9aedeb84a..3d7390e05b2a 100644
--- a/drivers/s390/cio/Makefile
+++ b/drivers/s390/cio/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o
 
 qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o
 obj-$(CONFIG_QDIO) += qdio.o
+
+vfio_ccw-objs += vfio_ccw_drv.o
+obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
new file mode 100644
index 000000000000..de4f4f1b45de
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -0,0 +1,262 @@
+/*
+ * VFIO based Physical Subchannel device driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+
+#include <asm/isc.h>
+
+#include "vfio_ccw_private.h"
+
+/*
+ * Helpers
+ */
+static int vfio_ccw_sch_quiesce(struct subchannel *sch)
+{
+	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+	DECLARE_COMPLETION_ONSTACK(completion);
+	int iretry, ret = 0;
+
+	spin_lock_irq(sch->lock);
+	if (!sch->schib.pmcw.ena)
+		goto out_unlock;
+	ret = cio_disable_subchannel(sch);
+	if (ret != -EBUSY)
+		goto out_unlock;
+
+	do {
+		iretry = 255;
+
+		ret = cio_cancel_halt_clear(sch, &iretry);
+		while (ret == -EBUSY) {
+			/*
+			 * Flush all I/O and wait for
+			 * cancel/halt/clear completion.
+			 */
+			private->completion = &completion;
+			spin_unlock_irq(sch->lock);
+
+			wait_for_completion_timeout(&completion, 3*HZ);
+
+			spin_lock_irq(sch->lock);
+			private->completion = NULL;
+			ret = cio_cancel_halt_clear(sch, &iretry);
+		};
+
+		ret = cio_disable_subchannel(sch);
+	} while (ret == -EBUSY);
+
+out_unlock:
+	spin_unlock_irq(sch->lock);
+	return ret;
+}
+
+/*
+ * Sysfs interfaces
+ */
+static ssize_t chpids_show(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct subchannel *sch = to_subchannel(dev);
+	struct chsc_ssd_info *ssd = &sch->ssd_info;
+	ssize_t ret = 0;
+	int chp;
+	int mask;
+
+	for (chp = 0; chp < 8; chp++) {
+		mask = 0x80 >> chp;
+		if (ssd->path_mask & mask)
+			ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id);
+		else
+			ret += sprintf(buf + ret, "00 ");
+	}
+	ret += sprintf(buf+ret, "\n");
+	return ret;
+}
+
+static ssize_t pimpampom_show(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct subchannel *sch = to_subchannel(dev);
+	struct pmcw *pmcw = &sch->schib.pmcw;
+
+	return sprintf(buf, "%02x %02x %02x\n",
+		       pmcw->pim, pmcw->pam, pmcw->pom);
+}
+
+static DEVICE_ATTR(chpids, 0444, chpids_show, NULL);
+static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL);
+
+static struct attribute *vfio_subchannel_attrs[] = {
+	&dev_attr_chpids.attr,
+	&dev_attr_pimpampom.attr,
+	NULL,
+};
+
+static struct attribute_group vfio_subchannel_attr_group = {
+	.attrs = vfio_subchannel_attrs,
+};
+
+/*
+ * Css driver callbacks
+ */
+static void vfio_ccw_sch_irq(struct subchannel *sch)
+{
+	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+
+	inc_irq_stat(IRQIO_CIO);
+
+	if (!private)
+		return;
+
+	if (private->completion)
+		complete(private->completion);
+}
+
+static int vfio_ccw_sch_probe(struct subchannel *sch)
+{
+	struct pmcw *pmcw = &sch->schib.pmcw;
+	struct vfio_ccw_private *private;
+	int ret;
+
+	if (pmcw->qf) {
+		dev_warn(&sch->dev, "vfio: ccw: does not support QDIO: %s\n",
+			 dev_name(&sch->dev));
+		return -ENODEV;
+	}
+
+	private = kzalloc(sizeof(*private), GFP_KERNEL | GFP_DMA);
+	if (!private)
+		return -ENOMEM;
+	private->sch = sch;
+	dev_set_drvdata(&sch->dev, private);
+
+	spin_lock_irq(sch->lock);
+	sch->isc = VFIO_CCW_ISC;
+	ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+	spin_unlock_irq(sch->lock);
+	if (ret)
+		goto out_free;
+
+	ret = sysfs_create_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
+	if (ret)
+		goto out_disable;
+
+	return 0;
+
+out_disable:
+	cio_disable_subchannel(sch);
+out_free:
+	dev_set_drvdata(&sch->dev, NULL);
+	kfree(private);
+	return ret;
+}
+
+static int vfio_ccw_sch_remove(struct subchannel *sch)
+{
+	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+
+	vfio_ccw_sch_quiesce(sch);
+
+	sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
+
+	dev_set_drvdata(&sch->dev, NULL);
+
+	kfree(private);
+
+	return 0;
+}
+
+static void vfio_ccw_sch_shutdown(struct subchannel *sch)
+{
+	vfio_ccw_sch_quiesce(sch);
+}
+
+/**
+ * vfio_ccw_sch_event - process subchannel event
+ * @sch: subchannel
+ * @process: non-zero if function is called in process context
+ *
+ * An unspecified event occurred for this subchannel. Adjust data according
+ * to the current operational state of the subchannel. Return zero when the
+ * event has been handled sufficiently or -EAGAIN when this function should
+ * be called again in process context.
+ */
+static int vfio_ccw_sch_event(struct subchannel *sch, int process)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(sch->lock, flags);
+	if (!device_is_registered(&sch->dev))
+		goto out_unlock;
+
+	if (work_pending(&sch->todo_work))
+		goto out_unlock;
+
+	if (cio_update_schib(sch)) {
+		/* Not operational. */
+		css_sched_sch_todo(sch, SCH_TODO_UNREG);
+
+		/*
+		 * TODO:
+		 * Probably we should send the machine check to the guest.
+		 */
+		goto out_unlock;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(sch->lock, flags);
+
+	return 0;
+}
+
+static struct css_device_id vfio_ccw_sch_ids[] = {
+	{ .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, },
+	{ /* end of list */ },
+};
+MODULE_DEVICE_TABLE(css, vfio_ccw_sch_ids);
+
+static struct css_driver vfio_ccw_sch_driver = {
+	.drv = {
+		.name = "vfio_ccw",
+		.owner = THIS_MODULE,
+	},
+	.subchannel_type = vfio_ccw_sch_ids,
+	.irq = vfio_ccw_sch_irq,
+	.probe = vfio_ccw_sch_probe,
+	.remove = vfio_ccw_sch_remove,
+	.shutdown = vfio_ccw_sch_shutdown,
+	.sch_event = vfio_ccw_sch_event,
+};
+
+static int __init vfio_ccw_sch_init(void)
+{
+	int ret;
+
+	isc_register(VFIO_CCW_ISC);
+	ret = css_driver_register(&vfio_ccw_sch_driver);
+	if (ret)
+		isc_unregister(VFIO_CCW_ISC);
+
+	return ret;
+}
+
+static void __exit vfio_ccw_sch_exit(void)
+{
+	css_driver_unregister(&vfio_ccw_sch_driver);
+	isc_unregister(VFIO_CCW_ISC);
+}
+module_init(vfio_ccw_sch_init);
+module_exit(vfio_ccw_sch_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
new file mode 100644
index 000000000000..38d69a59cb12
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -0,0 +1,25 @@
+/*
+ * Private stuff for vfio_ccw driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_PRIVATE_H_
+#define _VFIO_CCW_PRIVATE_H_
+
+#include "css.h"
+
+/**
+ * struct vfio_ccw_private
+ * @sch: pointer to the subchannel
+ * @completion: synchronization helper of the I/O completion
+ */
+struct vfio_ccw_private {
+	struct subchannel	*sch;
+	struct completion	*completion;
+} __aligned(8);
+
+#endif

From 0a19e61e6d4c6192077ead760ba0a2d350987d4c Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:32 +0100
Subject: [PATCH 39/74] vfio: ccw: introduce channel program interfaces

Introduce ccwchain structure and helper functions that can be used to
handle a channel program issued from a virtual machine.

The following limitations apply:
1. Supports only prefetch enabled mode.
2. Supports idal(c64) ccw chaining.
3. Supports 4k idaw.
4. Supports ccw1.
5. Supports direct ccw chaining by translating them to idal ccws.

CCW translation requires to leverage the vfio_(un)pin_pages interfaces
to pin/unpin sets of mem pages frequently. Currently we have a lack of
support to do this in an efficient way. So we introduce pfn_array data
structure and helper functions to handle pin/unpin operations here.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-6-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/Makefile      |   2 +-
 drivers/s390/cio/vfio_ccw_cp.c | 816 +++++++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_cp.h |  42 ++
 3 files changed, 859 insertions(+), 1 deletion(-)
 create mode 100644 drivers/s390/cio/vfio_ccw_cp.c
 create mode 100644 drivers/s390/cio/vfio_ccw_cp.h

diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile
index 3d7390e05b2a..1bec279430b7 100644
--- a/drivers/s390/cio/Makefile
+++ b/drivers/s390/cio/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o
 qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o
 obj-$(CONFIG_QDIO) += qdio.o
 
-vfio_ccw-objs += vfio_ccw_drv.o
+vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o
 obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
new file mode 100644
index 000000000000..16bbb54ee532
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -0,0 +1,816 @@
+/*
+ * channel program interfaces
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/vfio.h>
+#include <asm/idals.h>
+
+#include "vfio_ccw_cp.h"
+
+/*
+ * Max length for ccw chain.
+ * XXX: Limit to 256, need to check more?
+ */
+#define CCWCHAIN_LEN_MAX	256
+
+struct pfn_array {
+	unsigned long		pa_iova;
+	unsigned long		*pa_iova_pfn;
+	unsigned long		*pa_pfn;
+	int			pa_nr;
+};
+
+struct pfn_array_table {
+	struct pfn_array	*pat_pa;
+	int			pat_nr;
+};
+
+struct ccwchain {
+	struct list_head	next;
+	struct ccw1		*ch_ccw;
+	/* Guest physical address of the current chain. */
+	u64			ch_iova;
+	/* Count of the valid ccws in chain. */
+	int			ch_len;
+	/* Pinned PAGEs for the original data. */
+	struct pfn_array_table	*ch_pat;
+};
+
+/*
+ * pfn_array_pin() - pin user pages in memory
+ * @pa: pfn_array on which to perform the operation
+ * @mdev: the mediated device to perform pin/unpin operations
+ *
+ * Attempt to pin user pages in memory.
+ *
+ * Usage of pfn_array:
+ * @pa->pa_iova     starting guest physical I/O address. Assigned by caller.
+ * @pa->pa_iova_pfn array that stores PFNs of the pages need to pin. Allocated
+ *                  by caller.
+ * @pa->pa_pfn      array that receives PFNs of the pages pinned. Allocated by
+ *                  caller.
+ * @pa->pa_nr       number of pages from @pa->pa_iova to pin. Assigned by
+ *                  caller.
+ *                  number of pages pinned. Assigned by callee.
+ *
+ * Returns:
+ *   Number of pages pinned on success.
+ *   If @pa->pa_nr is 0 or negative, returns 0.
+ *   If no pages were pinned, returns -errno.
+ */
+static int pfn_array_pin(struct pfn_array *pa, struct device *mdev)
+{
+	int i, ret;
+
+	if (pa->pa_nr <= 0) {
+		pa->pa_nr = 0;
+		return 0;
+	}
+
+	pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT;
+	for (i = 1; i < pa->pa_nr; i++)
+		pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1;
+
+	ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr,
+			     IOMMU_READ | IOMMU_WRITE, pa->pa_pfn);
+
+	if (ret > 0 && ret != pa->pa_nr) {
+		vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret);
+		pa->pa_nr = 0;
+		return 0;
+	}
+
+	return ret;
+}
+
+/* Unpin the pages before releasing the memory. */
+static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev)
+{
+	vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr);
+	pa->pa_nr = 0;
+	kfree(pa->pa_iova_pfn);
+}
+
+/* Alloc memory for PFNs, then pin pages with them. */
+static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev,
+			       u64 iova, unsigned int len)
+{
+	int ret = 0;
+
+	if (!len || pa->pa_nr)
+		return -EINVAL;
+
+	pa->pa_iova = iova;
+
+	pa->pa_nr = ((iova & ~PAGE_MASK) + len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+	if (!pa->pa_nr)
+		return -EINVAL;
+
+	pa->pa_iova_pfn = kcalloc(pa->pa_nr,
+				  sizeof(*pa->pa_iova_pfn) +
+				  sizeof(*pa->pa_pfn),
+				  GFP_KERNEL);
+	if (unlikely(!pa->pa_iova_pfn))
+		return -ENOMEM;
+	pa->pa_pfn = pa->pa_iova_pfn + pa->pa_nr;
+
+	ret = pfn_array_pin(pa, mdev);
+
+	if (ret > 0)
+		return ret;
+	else if (!ret)
+		ret = -EINVAL;
+
+	kfree(pa->pa_iova_pfn);
+
+	return ret;
+}
+
+static int pfn_array_table_init(struct pfn_array_table *pat, int nr)
+{
+	pat->pat_pa = kcalloc(nr, sizeof(*pat->pat_pa), GFP_KERNEL);
+	if (unlikely(ZERO_OR_NULL_PTR(pat->pat_pa))) {
+		pat->pat_nr = 0;
+		return -ENOMEM;
+	}
+
+	pat->pat_nr = nr;
+
+	return 0;
+}
+
+static void pfn_array_table_unpin_free(struct pfn_array_table *pat,
+				       struct device *mdev)
+{
+	int i;
+
+	for (i = 0; i < pat->pat_nr; i++)
+		pfn_array_unpin_free(pat->pat_pa + i, mdev);
+
+	if (pat->pat_nr) {
+		kfree(pat->pat_pa);
+		pat->pat_pa = NULL;
+		pat->pat_nr = 0;
+	}
+}
+
+static bool pfn_array_table_iova_pinned(struct pfn_array_table *pat,
+					unsigned long iova)
+{
+	struct pfn_array *pa = pat->pat_pa;
+	unsigned long iova_pfn = iova >> PAGE_SHIFT;
+	int i, j;
+
+	for (i = 0; i < pat->pat_nr; i++, pa++)
+		for (j = 0; j < pa->pa_nr; j++)
+			if (pa->pa_iova_pfn[i] == iova_pfn)
+				return true;
+
+	return false;
+}
+/* Create the list idal words for a pfn_array_table. */
+static inline void pfn_array_table_idal_create_words(
+	struct pfn_array_table *pat,
+	unsigned long *idaws)
+{
+	struct pfn_array *pa;
+	int i, j, k;
+
+	/*
+	 * Idal words (execept the first one) rely on the memory being 4k
+	 * aligned. If a user virtual address is 4K aligned, then it's
+	 * corresponding kernel physical address will also be 4K aligned. Thus
+	 * there will be no problem here to simply use the phys to create an
+	 * idaw.
+	 */
+	k = 0;
+	for (i = 0; i < pat->pat_nr; i++) {
+		pa = pat->pat_pa + i;
+		for (j = 0; j < pa->pa_nr; j++) {
+			idaws[k] = pa->pa_pfn[j] << PAGE_SHIFT;
+			if (k == 0)
+				idaws[k] += pa->pa_iova & (PAGE_SIZE - 1);
+			k++;
+		}
+	}
+}
+
+
+/*
+ * Within the domain (@mdev), copy @n bytes from a guest physical
+ * address (@iova) to a host physical address (@to).
+ */
+static long copy_from_iova(struct device *mdev,
+			   void *to, u64 iova,
+			   unsigned long n)
+{
+	struct pfn_array pa = {0};
+	u64 from;
+	int i, ret;
+	unsigned long l, m;
+
+	ret = pfn_array_alloc_pin(&pa, mdev, iova, n);
+	if (ret <= 0)
+		return ret;
+
+	l = n;
+	for (i = 0; i < pa.pa_nr; i++) {
+		from = pa.pa_pfn[i] << PAGE_SHIFT;
+		m = PAGE_SIZE;
+		if (i == 0) {
+			from += iova & (PAGE_SIZE - 1);
+			m -= iova & (PAGE_SIZE - 1);
+		}
+
+		m = min(l, m);
+		memcpy(to + (n - l), (void *)from, m);
+
+		l -= m;
+		if (l == 0)
+			break;
+	}
+
+	pfn_array_unpin_free(&pa, mdev);
+
+	return l;
+}
+
+static long copy_ccw_from_iova(struct channel_program *cp,
+			       struct ccw1 *to, u64 iova,
+			       unsigned long len)
+{
+	return copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1));
+}
+
+/*
+ * Helpers to operate ccwchain.
+ */
+#define ccw_is_test(_ccw) (((_ccw)->cmd_code & 0x0F) == 0)
+
+#define ccw_is_noop(_ccw) ((_ccw)->cmd_code == CCW_CMD_NOOP)
+
+#define ccw_is_tic(_ccw) ((_ccw)->cmd_code == CCW_CMD_TIC)
+
+#define ccw_is_idal(_ccw) ((_ccw)->flags & CCW_FLAG_IDA)
+
+
+#define ccw_is_chain(_ccw) ((_ccw)->flags & (CCW_FLAG_CC | CCW_FLAG_DC))
+
+static struct ccwchain *ccwchain_alloc(struct channel_program *cp, int len)
+{
+	struct ccwchain *chain;
+	void *data;
+	size_t size;
+
+	/* Make ccw address aligned to 8. */
+	size = ((sizeof(*chain) + 7L) & -8L) +
+		sizeof(*chain->ch_ccw) * len +
+		sizeof(*chain->ch_pat) * len;
+	chain = kzalloc(size, GFP_DMA | GFP_KERNEL);
+	if (!chain)
+		return NULL;
+
+	data = (u8 *)chain + ((sizeof(*chain) + 7L) & -8L);
+	chain->ch_ccw = (struct ccw1 *)data;
+
+	data = (u8 *)(chain->ch_ccw) + sizeof(*chain->ch_ccw) * len;
+	chain->ch_pat = (struct pfn_array_table *)data;
+
+	chain->ch_len = len;
+
+	list_add_tail(&chain->next, &cp->ccwchain_list);
+
+	return chain;
+}
+
+static void ccwchain_free(struct ccwchain *chain)
+{
+	list_del(&chain->next);
+	kfree(chain);
+}
+
+/* Free resource for a ccw that allocated memory for its cda. */
+static void ccwchain_cda_free(struct ccwchain *chain, int idx)
+{
+	struct ccw1 *ccw = chain->ch_ccw + idx;
+
+	if (!ccw->count)
+		return;
+
+	kfree((void *)(u64)ccw->cda);
+}
+
+/* Unpin the pages then free the memory resources. */
+static void cp_unpin_free(struct channel_program *cp)
+{
+	struct ccwchain *chain, *temp;
+	int i;
+
+	list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) {
+		for (i = 0; i < chain->ch_len; i++) {
+			pfn_array_table_unpin_free(chain->ch_pat + i,
+						   cp->mdev);
+			ccwchain_cda_free(chain, i);
+		}
+		ccwchain_free(chain);
+	}
+}
+
+/**
+ * ccwchain_calc_length - calculate the length of the ccw chain.
+ * @iova: guest physical address of the target ccw chain
+ * @cp: channel_program on which to perform the operation
+ *
+ * This is the chain length not considering any TICs.
+ * You need to do a new round for each TIC target.
+ *
+ * Returns: the length of the ccw chain or -errno.
+ */
+static int ccwchain_calc_length(u64 iova, struct channel_program *cp)
+{
+	struct ccw1 *ccw, *p;
+	int cnt;
+
+	/*
+	 * Copy current chain from guest to host kernel.
+	 * Currently the chain length is limited to CCWCHAIN_LEN_MAX (256).
+	 * So copying 2K is enough (safe).
+	 */
+	p = ccw = kcalloc(CCWCHAIN_LEN_MAX, sizeof(*ccw), GFP_KERNEL);
+	if (!ccw)
+		return -ENOMEM;
+
+	cnt = copy_ccw_from_iova(cp, ccw, iova, CCWCHAIN_LEN_MAX);
+	if (cnt) {
+		kfree(ccw);
+		return cnt;
+	}
+
+	cnt = 0;
+	do {
+		cnt++;
+
+		if ((!ccw_is_chain(ccw)) && (!ccw_is_tic(ccw)))
+			break;
+
+		ccw++;
+	} while (cnt < CCWCHAIN_LEN_MAX + 1);
+
+	if (cnt == CCWCHAIN_LEN_MAX + 1)
+		cnt = -EINVAL;
+
+	kfree(p);
+	return cnt;
+}
+
+static int tic_target_chain_exists(struct ccw1 *tic, struct channel_program *cp)
+{
+	struct ccwchain *chain;
+	u32 ccw_head, ccw_tail;
+
+	list_for_each_entry(chain, &cp->ccwchain_list, next) {
+		ccw_head = chain->ch_iova;
+		ccw_tail = ccw_head + (chain->ch_len - 1) * sizeof(struct ccw1);
+
+		if ((ccw_head <= tic->cda) && (tic->cda <= ccw_tail))
+			return 1;
+	}
+
+	return 0;
+}
+
+static int ccwchain_loop_tic(struct ccwchain *chain,
+			     struct channel_program *cp);
+
+static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp)
+{
+	struct ccwchain *chain;
+	int len, ret;
+
+	/* May transfer to an existing chain. */
+	if (tic_target_chain_exists(tic, cp))
+		return 0;
+
+	/* Get chain length. */
+	len = ccwchain_calc_length(tic->cda, cp);
+	if (len < 0)
+		return len;
+
+	/* Need alloc a new chain for this one. */
+	chain = ccwchain_alloc(cp, len);
+	if (!chain)
+		return -ENOMEM;
+	chain->ch_iova = tic->cda;
+
+	/* Copy the new chain from user. */
+	ret = copy_ccw_from_iova(cp, chain->ch_ccw, tic->cda, len);
+	if (ret) {
+		ccwchain_free(chain);
+		return ret;
+	}
+
+	/* Loop for tics on this new chain. */
+	return ccwchain_loop_tic(chain, cp);
+}
+
+/* Loop for TICs. */
+static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp)
+{
+	struct ccw1 *tic;
+	int i, ret;
+
+	for (i = 0; i < chain->ch_len; i++) {
+		tic = chain->ch_ccw + i;
+
+		if (!ccw_is_tic(tic))
+			continue;
+
+		ret = ccwchain_handle_tic(tic, cp);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int ccwchain_fetch_tic(struct ccwchain *chain,
+			      int idx,
+			      struct channel_program *cp)
+{
+	struct ccw1 *ccw = chain->ch_ccw + idx;
+	struct ccwchain *iter;
+	u32 ccw_head, ccw_tail;
+
+	list_for_each_entry(iter, &cp->ccwchain_list, next) {
+		ccw_head = iter->ch_iova;
+		ccw_tail = ccw_head + (iter->ch_len - 1) * sizeof(struct ccw1);
+
+		if ((ccw_head <= ccw->cda) && (ccw->cda <= ccw_tail)) {
+			ccw->cda = (__u32) (addr_t) (iter->ch_ccw +
+						     (ccw->cda - ccw_head));
+			return 0;
+		}
+	}
+
+	return -EFAULT;
+}
+
+static int ccwchain_fetch_direct(struct ccwchain *chain,
+				 int idx,
+				 struct channel_program *cp)
+{
+	struct ccw1 *ccw;
+	struct pfn_array_table *pat;
+	unsigned long *idaws;
+	int idaw_nr;
+
+	ccw = chain->ch_ccw + idx;
+
+	/*
+	 * Pin data page(s) in memory.
+	 * The number of pages actually is the count of the idaws which will be
+	 * needed when translating a direct ccw to a idal ccw.
+	 */
+	pat = chain->ch_pat + idx;
+	if (pfn_array_table_init(pat, 1))
+		return -ENOMEM;
+	idaw_nr = pfn_array_alloc_pin(pat->pat_pa, cp->mdev,
+				      ccw->cda, ccw->count);
+	if (idaw_nr < 0)
+		return idaw_nr;
+
+	/* Translate this direct ccw to a idal ccw. */
+	idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL);
+	if (!idaws) {
+		pfn_array_table_unpin_free(pat, cp->mdev);
+		return -ENOMEM;
+	}
+	ccw->cda = (__u32) virt_to_phys(idaws);
+	ccw->flags |= CCW_FLAG_IDA;
+
+	pfn_array_table_idal_create_words(pat, idaws);
+
+	return 0;
+}
+
+static int ccwchain_fetch_idal(struct ccwchain *chain,
+			       int idx,
+			       struct channel_program *cp)
+{
+	struct ccw1 *ccw;
+	struct pfn_array_table *pat;
+	unsigned long *idaws;
+	u64 idaw_iova;
+	unsigned int idaw_nr, idaw_len;
+	int i, ret;
+
+	ccw = chain->ch_ccw + idx;
+
+	/* Calculate size of idaws. */
+	ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova));
+	if (ret)
+		return ret;
+	idaw_nr = idal_nr_words((void *)(idaw_iova), ccw->count);
+	idaw_len = idaw_nr * sizeof(*idaws);
+
+	/* Pin data page(s) in memory. */
+	pat = chain->ch_pat + idx;
+	ret = pfn_array_table_init(pat, idaw_nr);
+	if (ret)
+		return ret;
+
+	/* Translate idal ccw to use new allocated idaws. */
+	idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL);
+	if (!idaws) {
+		ret = -ENOMEM;
+		goto out_unpin;
+	}
+
+	ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idaw_len);
+	if (ret)
+		goto out_free_idaws;
+
+	ccw->cda = virt_to_phys(idaws);
+
+	for (i = 0; i < idaw_nr; i++) {
+		idaw_iova = *(idaws + i);
+		if (IS_ERR_VALUE(idaw_iova)) {
+			ret = -EFAULT;
+			goto out_free_idaws;
+		}
+
+		ret = pfn_array_alloc_pin(pat->pat_pa + i, cp->mdev,
+					  idaw_iova, 1);
+		if (ret < 0)
+			goto out_free_idaws;
+	}
+
+	pfn_array_table_idal_create_words(pat, idaws);
+
+	return 0;
+
+out_free_idaws:
+	kfree(idaws);
+out_unpin:
+	pfn_array_table_unpin_free(pat, cp->mdev);
+	return ret;
+}
+
+/*
+ * Fetch one ccw.
+ * To reduce memory copy, we'll pin the cda page in memory,
+ * and to get rid of the cda 2G limitiaion of ccw1, we'll translate
+ * direct ccws to idal ccws.
+ */
+static int ccwchain_fetch_one(struct ccwchain *chain,
+			      int idx,
+			      struct channel_program *cp)
+{
+	struct ccw1 *ccw = chain->ch_ccw + idx;
+
+	if (ccw_is_test(ccw) || ccw_is_noop(ccw))
+		return 0;
+
+	if (ccw_is_tic(ccw))
+		return ccwchain_fetch_tic(chain, idx, cp);
+
+	if (ccw_is_idal(ccw))
+		return ccwchain_fetch_idal(chain, idx, cp);
+
+	return ccwchain_fetch_direct(chain, idx, cp);
+}
+
+/**
+ * cp_init() - allocate ccwchains for a channel program.
+ * @cp: channel_program on which to perform the operation
+ * @mdev: the mediated device to perform pin/unpin operations
+ * @orb: control block for the channel program from the guest
+ *
+ * This creates one or more ccwchain(s), and copies the raw data of
+ * the target channel program from @orb->cmd.iova to the new ccwchain(s).
+ *
+ * Limitations:
+ * 1. Supports only prefetch enabled mode.
+ * 2. Supports idal(c64) ccw chaining.
+ * 3. Supports 4k idaw.
+ *
+ * Returns:
+ *   %0 on success and a negative error value on failure.
+ */
+int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb)
+{
+	u64 iova = orb->cmd.cpa;
+	struct ccwchain *chain;
+	int len, ret;
+
+	/*
+	 * XXX:
+	 * Only support prefetch enable mode now.
+	 * Only support 64bit addressing idal.
+	 * Only support 4k IDAW.
+	 * Only support ccw1.
+	 */
+	if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k || !orb->cmd.fmt)
+		return -EOPNOTSUPP;
+
+	INIT_LIST_HEAD(&cp->ccwchain_list);
+	memcpy(&cp->orb, orb, sizeof(*orb));
+	cp->mdev = mdev;
+
+	/* Get chain length. */
+	len = ccwchain_calc_length(iova, cp);
+	if (len < 0)
+		return len;
+
+	/* Alloc mem for the head chain. */
+	chain = ccwchain_alloc(cp, len);
+	if (!chain)
+		return -ENOMEM;
+	chain->ch_iova = iova;
+
+	/* Copy the head chain from guest. */
+	ret = copy_ccw_from_iova(cp, chain->ch_ccw, iova, len);
+	if (ret) {
+		ccwchain_free(chain);
+		return ret;
+	}
+
+	/* Now loop for its TICs. */
+	ret = ccwchain_loop_tic(chain, cp);
+	if (ret)
+		cp_unpin_free(cp);
+
+	return ret;
+}
+
+
+/**
+ * cp_free() - free resources for channel program.
+ * @cp: channel_program on which to perform the operation
+ *
+ * This unpins the memory pages and frees the memory space occupied by
+ * @cp, which must have been returned by a previous call to cp_init().
+ * Otherwise, undefined behavior occurs.
+ */
+void cp_free(struct channel_program *cp)
+{
+	cp_unpin_free(cp);
+}
+
+/**
+ * cp_prefetch() - translate a guest physical address channel program to
+ *                 a real-device runnable channel program.
+ * @cp: channel_program on which to perform the operation
+ *
+ * This function translates the guest-physical-address channel program
+ * and stores the result to ccwchain list. @cp must have been
+ * initialized by a previous call with cp_init(). Otherwise, undefined
+ * behavior occurs.
+ *
+ * The S/390 CCW Translation APIS (prefixed by 'cp_') are introduced
+ * as helpers to do ccw chain translation inside the kernel. Basically
+ * they accept a channel program issued by a virtual machine, and
+ * translate the channel program to a real-device runnable channel
+ * program.
+ *
+ * These APIs will copy the ccws into kernel-space buffers, and update
+ * the guest phsical addresses with their corresponding host physical
+ * addresses.  Then channel I/O device drivers could issue the
+ * translated channel program to real devices to perform an I/O
+ * operation.
+ *
+ * These interfaces are designed to support translation only for
+ * channel programs, which are generated and formatted by a
+ * guest. Thus this will make it possible for things like VFIO to
+ * leverage the interfaces to passthrough a channel I/O mediated
+ * device in QEMU.
+ *
+ * We support direct ccw chaining by translating them to idal ccws.
+ *
+ * Returns:
+ *   %0 on success and a negative error value on failure.
+ */
+int cp_prefetch(struct channel_program *cp)
+{
+	struct ccwchain *chain;
+	int len, idx, ret;
+
+	list_for_each_entry(chain, &cp->ccwchain_list, next) {
+		len = chain->ch_len;
+		for (idx = 0; idx < len; idx++) {
+			ret = ccwchain_fetch_one(chain, idx, cp);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * cp_get_orb() - get the orb of the channel program
+ * @cp: channel_program on which to perform the operation
+ * @intparm: new intparm for the returned orb
+ * @lpm: candidate value of the logical-path mask for the returned orb
+ *
+ * This function returns the address of the updated orb of the channel
+ * program. Channel I/O device drivers could use this orb to issue a
+ * ssch.
+ */
+union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm)
+{
+	union orb *orb;
+	struct ccwchain *chain;
+	struct ccw1 *cpa;
+
+	orb = &cp->orb;
+
+	orb->cmd.intparm = intparm;
+	orb->cmd.fmt = 1;
+	orb->cmd.key = PAGE_DEFAULT_KEY >> 4;
+
+	if (orb->cmd.lpm == 0)
+		orb->cmd.lpm = lpm;
+
+	chain = list_first_entry(&cp->ccwchain_list, struct ccwchain, next);
+	cpa = chain->ch_ccw;
+	orb->cmd.cpa = (__u32) __pa(cpa);
+
+	return orb;
+}
+
+/**
+ * cp_update_scsw() - update scsw for a channel program.
+ * @cp: channel_program on which to perform the operation
+ * @scsw: I/O results of the channel program and also the target to be
+ *        updated
+ *
+ * @scsw contains the I/O results of the channel program that pointed
+ * to by @cp. However what @scsw->cpa stores is a host physical
+ * address, which is meaningless for the guest, which is waiting for
+ * the I/O results.
+ *
+ * This function updates @scsw->cpa to its coressponding guest physical
+ * address.
+ */
+void cp_update_scsw(struct channel_program *cp, union scsw *scsw)
+{
+	struct ccwchain *chain;
+	u32 cpa = scsw->cmd.cpa;
+	u32 ccw_head, ccw_tail;
+
+	/*
+	 * LATER:
+	 * For now, only update the cmd.cpa part. We may need to deal with
+	 * other portions of the schib as well, even if we don't return them
+	 * in the ioctl directly. Path status changes etc.
+	 */
+	list_for_each_entry(chain, &cp->ccwchain_list, next) {
+		ccw_head = (u32)(u64)chain->ch_ccw;
+		ccw_tail = (u32)(u64)(chain->ch_ccw + chain->ch_len - 1);
+
+		if ((ccw_head <= cpa) && (cpa <= ccw_tail)) {
+			/*
+			 * (cpa - ccw_head) is the offset value of the host
+			 * physical ccw to its chain head.
+			 * Adding this value to the guest physical ccw chain
+			 * head gets us the guest cpa.
+			 */
+			cpa = chain->ch_iova + (cpa - ccw_head);
+			break;
+		}
+	}
+
+	scsw->cmd.cpa = cpa;
+}
+
+/**
+ * cp_iova_pinned() - check if an iova is pinned for a ccw chain.
+ * @cmd: ccwchain command on which to perform the operation
+ * @iova: the iova to check
+ *
+ * If the @iova is currently pinned for the ccw chain, return true;
+ * else return false.
+ */
+bool cp_iova_pinned(struct channel_program *cp, u64 iova)
+{
+	struct ccwchain *chain;
+	int i;
+
+	list_for_each_entry(chain, &cp->ccwchain_list, next) {
+		for (i = 0; i < chain->ch_len; i++)
+			if (pfn_array_table_iova_pinned(chain->ch_pat + i,
+							iova))
+				return true;
+	}
+
+	return false;
+}
diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h
new file mode 100644
index 000000000000..7a1996b3b36d
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_cp.h
@@ -0,0 +1,42 @@
+/*
+ * channel program interfaces
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_CP_H_
+#define _VFIO_CCW_CP_H_
+
+#include <asm/cio.h>
+#include <asm/scsw.h>
+
+#include "orb.h"
+
+/**
+ * struct channel_program - manage information for channel program
+ * @ccwchain_list: list head of ccwchains
+ * @orb: orb for the currently processed ssch request
+ * @mdev: the mediated device to perform page pinning/unpinning
+ *
+ * @ccwchain_list is the head of a ccwchain list, that contents the
+ * translated result of the guest channel program that pointed out by
+ * the iova parameter when calling cp_init.
+ */
+struct channel_program {
+	struct list_head ccwchain_list;
+	union orb orb;
+	struct device *mdev;
+};
+
+extern int cp_init(struct channel_program *cp, struct device *mdev,
+		   union orb *orb);
+extern void cp_free(struct channel_program *cp);
+extern int cp_prefetch(struct channel_program *cp);
+extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
+extern void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
+extern bool cp_iova_pinned(struct channel_program *cp, u64 iova);
+
+#endif

From 84cd8fc48478b5e67b3f1600717299e11430a67e Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:33 +0100
Subject: [PATCH 40/74] vfio: ccw: register vfio_ccw to the mediated device
 framework

To make vfio support subchannel devices, we need to leverage the
mediated device framework to create a mediated device for the
subchannel device.

This registers the subchannel device to the mediated device
framework during probe to enable mediated device creation.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-7-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 arch/s390/Kconfig                   |   2 +-
 drivers/s390/cio/Makefile           |   2 +-
 drivers/s390/cio/vfio_ccw_drv.c     |  12 ++-
 drivers/s390/cio/vfio_ccw_ops.c     | 147 ++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_private.h |  11 +++
 5 files changed, 171 insertions(+), 3 deletions(-)
 create mode 100644 drivers/s390/cio/vfio_ccw_ops.c

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b53f0359f2b6..f1317cb24d75 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -675,7 +675,7 @@ config EADM_SCH
 config VFIO_CCW
 	def_tristate n
 	prompt "Support for VFIO-CCW subchannels"
-	depends on S390_CCW_IOMMU && VFIO
+	depends on S390_CCW_IOMMU && VFIO_MDEV
 	help
 	  This driver allows usage of I/O subchannels via VFIO-CCW.
 
diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile
index 1bec279430b7..b0586b223502 100644
--- a/drivers/s390/cio/Makefile
+++ b/drivers/s390/cio/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o
 qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o
 obj-$(CONFIG_QDIO) += qdio.o
 
-vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o
+vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o
 obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index de4f4f1b45de..b1f430aa9bdb 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -19,7 +19,7 @@
 /*
  * Helpers
  */
-static int vfio_ccw_sch_quiesce(struct subchannel *sch)
+int vfio_ccw_sch_quiesce(struct subchannel *sch)
 {
 	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
 	DECLARE_COMPLETION_ONSTACK(completion);
@@ -152,8 +152,16 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
 	if (ret)
 		goto out_disable;
 
+	ret = vfio_ccw_mdev_reg(sch);
+	if (ret)
+		goto out_rm_group;
+
+	atomic_set(&private->avail, 1);
+
 	return 0;
 
+out_rm_group:
+	sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
 out_disable:
 	cio_disable_subchannel(sch);
 out_free:
@@ -168,6 +176,8 @@ static int vfio_ccw_sch_remove(struct subchannel *sch)
 
 	vfio_ccw_sch_quiesce(sch);
 
+	vfio_ccw_mdev_unreg(sch);
+
 	sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
 
 	dev_set_drvdata(&sch->dev, NULL);
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
new file mode 100644
index 000000000000..b8a2fed58f02
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -0,0 +1,147 @@
+/*
+ * Physical device callbacks for vfio_ccw
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ *            Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+
+#include "vfio_ccw_private.h"
+
+static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
+				  unsigned long action,
+				  void *data)
+{
+	struct vfio_ccw_private *private =
+		container_of(nb, struct vfio_ccw_private, nb);
+
+	if (!private)
+		return NOTIFY_STOP;
+
+	/*
+	 * TODO:
+	 * Vendor drivers MUST unpin pages in response to an
+	 * invalidation.
+	 */
+	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP)
+		return NOTIFY_BAD;
+
+	return NOTIFY_DONE;
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	return sprintf(buf, "I/O subchannel (Non-QDIO)\n");
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static ssize_t available_instances_show(struct kobject *kobj,
+					struct device *dev, char *buf)
+{
+	struct vfio_ccw_private *private = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%d\n", atomic_read(&private->avail));
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static struct attribute *mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group mdev_type_group = {
+	.name  = "io",
+	.attrs = mdev_types_attrs,
+};
+
+struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_group,
+	NULL,
+};
+
+static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	struct vfio_ccw_private *private =
+		dev_get_drvdata(mdev_parent_dev(mdev));
+
+	if (atomic_dec_if_positive(&private->avail) < 0)
+		return -EPERM;
+
+	private->mdev = mdev;
+
+	return 0;
+}
+
+static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
+{
+	struct vfio_ccw_private *private;
+	struct subchannel *sch;
+	int ret;
+
+	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	sch = private->sch;
+	ret = vfio_ccw_sch_quiesce(sch);
+	if (ret)
+		return ret;
+	ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+	if (ret)
+		return ret;
+
+	private->mdev = NULL;
+	atomic_inc(&private->avail);
+
+	return 0;
+}
+
+static int vfio_ccw_mdev_open(struct mdev_device *mdev)
+{
+	struct vfio_ccw_private *private =
+		dev_get_drvdata(mdev_parent_dev(mdev));
+	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+	private->nb.notifier_call = vfio_ccw_mdev_notifier;
+
+	return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+				      &events, &private->nb);
+}
+
+void vfio_ccw_mdev_release(struct mdev_device *mdev)
+{
+	struct vfio_ccw_private *private =
+		dev_get_drvdata(mdev_parent_dev(mdev));
+
+	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+				 &private->nb);
+}
+
+static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
+	.owner			= THIS_MODULE,
+	.supported_type_groups  = mdev_type_groups,
+	.create			= vfio_ccw_mdev_create,
+	.remove			= vfio_ccw_mdev_remove,
+	.open			= vfio_ccw_mdev_open,
+	.release		= vfio_ccw_mdev_release,
+};
+
+int vfio_ccw_mdev_reg(struct subchannel *sch)
+{
+	return mdev_register_device(&sch->dev, &vfio_ccw_mdev_ops);
+}
+
+void vfio_ccw_mdev_unreg(struct subchannel *sch)
+{
+	mdev_unregister_device(&sch->dev);
+}
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 38d69a59cb12..5afb3ba5c7b5 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -16,10 +16,21 @@
  * struct vfio_ccw_private
  * @sch: pointer to the subchannel
  * @completion: synchronization helper of the I/O completion
+ * @avail: available for creating a mediated device
+ * @mdev: pointer to the mediated device
+ * @nb: notifier for vfio events
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
 	struct completion	*completion;
+	atomic_t		avail;
+	struct mdev_device	*mdev;
+	struct notifier_block	nb;
 } __aligned(8);
 
+extern int vfio_ccw_mdev_reg(struct subchannel *sch);
+extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
+
+extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
+
 #endif

From 060d2b5afcc4f9e2d61e2b059e648f569b8dba9a Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:34 +0100
Subject: [PATCH 41/74] vfio: ccw: introduce ccw_io_region

To provide user-space a set of interfaces to:
1. pass in a ccw program to perform an I/O operation.
2. read back I/O results of the completed I/O operations.
We introduce an MMIO region for the vfio-ccw device here.

This region is defined to content:
1. areas to store arguments that an ssch required.
2. areas to store the I/O results.

Using pwrite/pread to the device on this region, a user-space program
could write/read data to/from the vfio-ccw device.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-8-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_ops.c     | 47 +++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_private.h |  4 +++
 include/uapi/linux/vfio_ccw.h       | 24 +++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 include/uapi/linux/vfio_ccw.h

diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index b8a2fed58f02..6c06805839d8 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -127,6 +127,51 @@ void vfio_ccw_mdev_release(struct mdev_device *mdev)
 				 &private->nb);
 }
 
+static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
+				  char __user *buf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	struct vfio_ccw_private *private;
+	struct ccw_io_region *region;
+
+	if (*ppos + count > sizeof(*region))
+		return -EINVAL;
+
+	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	if (!private)
+		return -ENODEV;
+
+	region = &private->io_region;
+	if (copy_to_user(buf, (void *)region + *ppos, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
+				   const char __user *buf,
+				   size_t count,
+				   loff_t *ppos)
+{
+	struct vfio_ccw_private *private;
+	struct ccw_io_region *region;
+
+	if (*ppos + count > sizeof(*region))
+		return -EINVAL;
+
+	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	if (!private)
+		return -ENODEV;
+
+	region = &private->io_region;
+	if (copy_from_user((void *)region + *ppos, buf, count))
+		return -EFAULT;
+	region->ret_code = 0;
+
+	return count;
+}
+
 static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
 	.owner			= THIS_MODULE,
 	.supported_type_groups  = mdev_type_groups,
@@ -134,6 +179,8 @@ static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
 	.remove			= vfio_ccw_mdev_remove,
 	.open			= vfio_ccw_mdev_open,
 	.release		= vfio_ccw_mdev_release,
+	.read			= vfio_ccw_mdev_read,
+	.write			= vfio_ccw_mdev_write,
 };
 
 int vfio_ccw_mdev_reg(struct subchannel *sch)
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 5afb3ba5c7b5..359e96ba9c6c 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -10,6 +10,8 @@
 #ifndef _VFIO_CCW_PRIVATE_H_
 #define _VFIO_CCW_PRIVATE_H_
 
+#include <linux/vfio_ccw.h>
+
 #include "css.h"
 
 /**
@@ -19,6 +21,7 @@
  * @avail: available for creating a mediated device
  * @mdev: pointer to the mediated device
  * @nb: notifier for vfio events
+ * @io_region: MMIO region to input/output I/O arguments/results
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
@@ -26,6 +29,7 @@ struct vfio_ccw_private {
 	atomic_t		avail;
 	struct mdev_device	*mdev;
 	struct notifier_block	nb;
+	struct ccw_io_region	io_region;
 } __aligned(8);
 
 extern int vfio_ccw_mdev_reg(struct subchannel *sch);
diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h
new file mode 100644
index 000000000000..34a7f6f9e065
--- /dev/null
+++ b/include/uapi/linux/vfio_ccw.h
@@ -0,0 +1,24 @@
+/*
+ * Interfaces for vfio-ccw
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_H_
+#define _VFIO_CCW_H_
+
+#include <linux/types.h>
+
+struct ccw_io_region {
+#define ORB_AREA_SIZE 12
+	__u8	orb_area[ORB_AREA_SIZE];
+#define SCSW_AREA_SIZE 12
+	__u8	scsw_area[SCSW_AREA_SIZE];
+#define IRB_AREA_SIZE 96
+	__u8	irb_area[IRB_AREA_SIZE];
+	__u32	ret_code;
+} __packed;
+
+#endif

From 4e149e431a2858b86b4f9c801b2a4dde50c929f9 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:35 +0100
Subject: [PATCH 42/74] vfio: ccw: handle ccw command request

We implement the basic ccw command handling infrastructure
here:
1. Translate the ccw commands.
2. Issue the translated ccw commands to the device.
3. Once we get the execution result, update the guest SCSW
   with it.

Acked-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-9-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_drv.c     | 114 ++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_ops.c     |  24 +++++-
 drivers/s390/cio/vfio_ccw_private.h |  14 ++++
 3 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index b1f430aa9bdb..b5971d321697 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -11,9 +11,13 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
 
 #include <asm/isc.h>
 
+#include "ioasm.h"
+#include "css.h"
 #include "vfio_ccw_private.h"
 
 /*
@@ -59,6 +63,109 @@ out_unlock:
 	return ret;
 }
 
+static int doing_io(struct vfio_ccw_private *private, u32 intparm)
+{
+	return (private->intparm == intparm);
+}
+
+static int vfio_ccw_sch_io_helper(struct vfio_ccw_private *private)
+{
+	struct subchannel *sch;
+	union orb *orb;
+	int ccode;
+	__u8 lpm;
+	u32 intparm;
+
+	sch = private->sch;
+
+	orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
+
+	/* Issue "Start Subchannel" */
+	ccode = ssch(sch->schid, orb);
+
+	switch (ccode) {
+	case 0:
+		/*
+		 * Initialize device status information
+		 */
+		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
+		break;
+	case 1:		/* Status pending */
+	case 2:		/* Busy */
+		return -EBUSY;
+	case 3:		/* Device/path not operational */
+	{
+		lpm = orb->cmd.lpm;
+		if (lpm != 0)
+			sch->lpm &= ~lpm;
+		else
+			sch->lpm = 0;
+
+		if (cio_update_schib(sch))
+			return -ENODEV;
+
+		return sch->lpm ? -EACCES : -ENODEV;
+	}
+	default:
+		return ccode;
+	}
+
+	intparm = (u32)(addr_t)sch;
+	private->intparm = 0;
+	wait_event(private->wait_q, doing_io(private, intparm));
+
+	if (scsw_is_solicited(&private->irb.scsw))
+		cp_update_scsw(&private->cp, &private->irb.scsw);
+
+	return 0;
+}
+
+/* Deal with the ccw command request from the userspace. */
+int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private)
+{
+	struct mdev_device *mdev = private->mdev;
+	union orb *orb;
+	union scsw *scsw = &private->scsw;
+	struct irb *irb = &private->irb;
+	struct ccw_io_region *io_region = &private->io_region;
+	int ret;
+
+	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
+
+	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
+		orb = (union orb *)io_region->orb_area;
+
+		ret = cp_init(&private->cp, mdev_dev(mdev), orb);
+		if (ret)
+			return ret;
+
+		ret = cp_prefetch(&private->cp);
+		if (ret) {
+			cp_free(&private->cp);
+			return ret;
+		}
+
+		/* Start channel program and wait for I/O interrupt. */
+		ret = vfio_ccw_sch_io_helper(private);
+		if (!ret) {
+			/* Get irb info and copy it to irb_area. */
+			memcpy(io_region->irb_area, irb, sizeof(*irb));
+		}
+
+		cp_free(&private->cp);
+	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
+		/* XXX: Handle halt. */
+		ret = -EOPNOTSUPP;
+	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
+		/* XXX: Handle clear. */
+		ret = -EOPNOTSUPP;
+	} else {
+		ret = -EOPNOTSUPP;
+	}
+
+	return ret;
+}
+
 /*
  * Sysfs interfaces
  */
@@ -113,12 +220,18 @@ static struct attribute_group vfio_subchannel_attr_group = {
 static void vfio_ccw_sch_irq(struct subchannel *sch)
 {
 	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+	struct irb *irb;
 
 	inc_irq_stat(IRQIO_CIO);
 
 	if (!private)
 		return;
 
+	irb = this_cpu_ptr(&cio_irb);
+	memcpy(&private->irb, irb, sizeof(*irb));
+	private->intparm = (u32)(addr_t)sch;
+	wake_up(&private->wait_q);
+
 	if (private->completion)
 		complete(private->completion);
 }
@@ -156,6 +269,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
 	if (ret)
 		goto out_rm_group;
 
+	init_waitqueue_head(&private->wait_q);
 	atomic_set(&private->avail, 1);
 
 	return 0;
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 6c06805839d8..878c88239fc8 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -23,12 +23,25 @@ static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
 		return NOTIFY_STOP;
 
 	/*
-	 * TODO:
 	 * Vendor drivers MUST unpin pages in response to an
 	 * invalidation.
 	 */
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP)
-		return NOTIFY_BAD;
+	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
+		struct vfio_iommu_type1_dma_unmap *unmap = data;
+		struct subchannel *sch = private->sch;
+
+		if (!cp_iova_pinned(&private->cp, unmap->iova))
+			return NOTIFY_OK;
+
+		if (vfio_ccw_sch_quiesce(sch))
+			return NOTIFY_BAD;
+
+		if (cio_enable_subchannel(sch, (u32)(unsigned long)sch))
+			return NOTIFY_BAD;
+
+		cp_free(&private->cp);
+		return NOTIFY_OK;
+	}
 
 	return NOTIFY_DONE;
 }
@@ -167,7 +180,10 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 	region = &private->io_region;
 	if (copy_from_user((void *)region + *ppos, buf, count))
 		return -EFAULT;
-	region->ret_code = 0;
+
+	region->ret_code = vfio_ccw_sch_cmd_request(private);
+	if (region->ret_code != 0)
+		return region->ret_code;
 
 	return count;
 }
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 359e96ba9c6c..79e53378f212 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -10,9 +10,11 @@
 #ifndef _VFIO_CCW_PRIVATE_H_
 #define _VFIO_CCW_PRIVATE_H_
 
+#include <linux/completion.h>
 #include <linux/vfio_ccw.h>
 
 #include "css.h"
+#include "vfio_ccw_cp.h"
 
 /**
  * struct vfio_ccw_private
@@ -22,6 +24,11 @@
  * @mdev: pointer to the mediated device
  * @nb: notifier for vfio events
  * @io_region: MMIO region to input/output I/O arguments/results
+ * @wait_q: wait for interrupt
+ * @intparm: record current interrupt parameter, used for wait interrupt
+ * @cp: channel program for the current I/O operation
+ * @irb: irb info received from interrupt
+ * @scsw: scsw info
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
@@ -30,11 +37,18 @@ struct vfio_ccw_private {
 	struct mdev_device	*mdev;
 	struct notifier_block	nb;
 	struct ccw_io_region	io_region;
+
+	wait_queue_head_t	wait_q;
+	u32			intparm;
+	struct channel_program	cp;
+	struct irb		irb;
+	union scsw		scsw;
 } __aligned(8);
 
 extern int vfio_ccw_mdev_reg(struct subchannel *sch);
 extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
 
 extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
+extern int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private);
 
 #endif

From e01bcdd61320c91c826376e0a7dd96ef8e85dd18 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:36 +0100
Subject: [PATCH 43/74] vfio: ccw: realize VFIO_DEVICE_GET_REGION_INFO ioctl

Introduce device information about vfio-ccw: VFIO_DEVICE_FLAGS_CCW.
Realize VFIO_DEVICE_GET_REGION_INFO ioctl for vfio-ccw.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20170317031743.40128-10-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_ops.c | 78 +++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       | 11 +++++
 2 files changed, 89 insertions(+)

diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 878c88239fc8..f3300ddded3f 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -188,6 +188,83 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 	return count;
 }
 
+static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
+{
+	info->flags = VFIO_DEVICE_FLAGS_CCW;
+	info->num_regions = VFIO_CCW_NUM_REGIONS;
+	info->num_irqs = 0;
+
+	return 0;
+}
+
+static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info,
+					 u16 *cap_type_id,
+					 void **cap_type)
+{
+	switch (info->index) {
+	case VFIO_CCW_CONFIG_REGION_INDEX:
+		info->offset = 0;
+		info->size = sizeof(struct ccw_io_region);
+		info->flags = VFIO_REGION_INFO_FLAG_READ
+			      | VFIO_REGION_INFO_FLAG_WRITE;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
+				   unsigned int cmd,
+				   unsigned long arg)
+{
+	int ret = 0;
+	unsigned long minsz;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = vfio_ccw_mdev_get_device_info(&info);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+		u16 cap_type_id = 0;
+		void *cap_type = NULL;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = vfio_ccw_mdev_get_region_info(&info, &cap_type_id,
+						    &cap_type);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
 	.owner			= THIS_MODULE,
 	.supported_type_groups  = mdev_type_groups,
@@ -197,6 +274,7 @@ static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
 	.release		= vfio_ccw_mdev_release,
 	.read			= vfio_ccw_mdev_read,
 	.write			= vfio_ccw_mdev_write,
+	.ioctl			= vfio_ccw_mdev_ioctl,
 };
 
 int vfio_ccw_mdev_reg(struct subchannel *sch)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 61837890c132..3fd70ffe9d78 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_CCW	(1 << 4)	/* vfio-ccw device */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
 };
@@ -447,6 +448,16 @@ enum {
 	VFIO_PCI_NUM_IRQS
 };
 
+/*
+ * The vfio-ccw bus driver makes use of the following fixed region.
+ * Unimplemented regions return a size of zero.
+ */
+
+enum {
+	VFIO_CCW_CONFIG_REGION_INDEX,
+	VFIO_CCW_NUM_REGIONS
+};
+
 /**
  * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
  *					      struct vfio_pci_hot_reset_info)

From 83d1193a96dc78576c15f93cd70e0558763a85b3 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:37 +0100
Subject: [PATCH 44/74] vfio: ccw: realize VFIO_DEVICE_RESET ioctl

Introduce VFIO_DEVICE_RESET ioctl for vfio-ccw to make it possible
to hot-reset the device.

We try to achieve a reset by first disabling the subchannel and
then enabling it again: this should clear all state at the subchannel.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-11-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_ops.c | 47 +++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index f3300ddded3f..125818cdf305 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -12,6 +12,32 @@
 
 #include "vfio_ccw_private.h"
 
+static int vfio_ccw_mdev_reset(struct mdev_device *mdev)
+{
+	struct vfio_ccw_private *private;
+	struct subchannel *sch;
+	int ret;
+
+	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	if (!private)
+		return -ENODEV;
+
+	sch = private->sch;
+	/*
+	 * TODO:
+	 * In the cureent stage, some things like "no I/O running" and "no
+	 * interrupt pending" are clear, but we are not sure what other state
+	 * we need to care about.
+	 * There are still a lot more instructions need to be handled. We
+	 * should come back here later.
+	 */
+	ret = vfio_ccw_sch_quiesce(sch);
+	if (ret)
+		return ret;
+
+	return cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+}
+
 static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
 				  unsigned long action,
 				  void *data)
@@ -28,15 +54,11 @@ static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
 	 */
 	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 		struct vfio_iommu_type1_dma_unmap *unmap = data;
-		struct subchannel *sch = private->sch;
 
 		if (!cp_iova_pinned(&private->cp, unmap->iova))
 			return NOTIFY_OK;
 
-		if (vfio_ccw_sch_quiesce(sch))
-			return NOTIFY_BAD;
-
-		if (cio_enable_subchannel(sch, (u32)(unsigned long)sch))
+		if (vfio_ccw_mdev_reset(private->mdev))
 			return NOTIFY_BAD;
 
 		cp_free(&private->cp);
@@ -100,16 +122,11 @@ static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
 
 static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
 {
-	struct vfio_ccw_private *private;
-	struct subchannel *sch;
+	struct vfio_ccw_private *private =
+		dev_get_drvdata(mdev_parent_dev(mdev));
 	int ret;
 
-	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	sch = private->sch;
-	ret = vfio_ccw_sch_quiesce(sch);
-	if (ret)
-		return ret;
-	ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+	ret = vfio_ccw_mdev_reset(mdev);
 	if (ret)
 		return ret;
 
@@ -190,7 +207,7 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 
 static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
 {
-	info->flags = VFIO_DEVICE_FLAGS_CCW;
+	info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET;
 	info->num_regions = VFIO_CCW_NUM_REGIONS;
 	info->num_irqs = 0;
 
@@ -260,6 +277,8 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
 
 		return copy_to_user((void __user *)arg, &info, minsz);
 	}
+	case VFIO_DEVICE_RESET:
+		return vfio_ccw_mdev_reset(mdev);
 	default:
 		return -ENOTTY;
 	}

From 120e214e504fd6d3e33ec4b661193600b2faab95 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:38 +0100
Subject: [PATCH 45/74] vfio: ccw: realize VFIO_DEVICE_G(S)ET_IRQ_INFO ioctls

Realize VFIO_DEVICE_GET_IRQ_INFO ioctl to retrieve
VFIO_CCW_IO_IRQ information.

Realize VFIO_DEVICE_SET_IRQS ioctl to set an eventfd fd for
VFIO_CCW_IO_IRQ. Once a write operation to the ccw_io_region
was performed, trigger a signal on this fd.

Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20170317031743.40128-12-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_ops.c     | 123 +++++++++++++++++++++++++++-
 drivers/s390/cio/vfio_ccw_private.h |   4 +
 include/uapi/linux/vfio.h           |  10 ++-
 3 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 125818cdf305..1294c5347410 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -202,6 +202,9 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 	if (region->ret_code != 0)
 		return region->ret_code;
 
+	if (private->io_trigger)
+		eventfd_signal(private->io_trigger, 1);
+
 	return count;
 }
 
@@ -209,7 +212,7 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
 {
 	info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET;
 	info->num_regions = VFIO_CCW_NUM_REGIONS;
-	info->num_irqs = 0;
+	info->num_irqs = VFIO_CCW_NUM_IRQS;
 
 	return 0;
 }
@@ -230,6 +233,83 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info,
 	}
 }
 
+int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info)
+{
+	if (info->index != VFIO_CCW_IO_IRQ_INDEX)
+		return -EINVAL;
+
+	info->count = 1;
+	info->flags = VFIO_IRQ_INFO_EVENTFD;
+
+	return 0;
+}
+
+static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev,
+				  uint32_t flags,
+				  void __user *data)
+{
+	struct vfio_ccw_private *private;
+	struct eventfd_ctx **ctx;
+
+	if (!(flags & VFIO_IRQ_SET_ACTION_TRIGGER))
+		return -EINVAL;
+
+	private = dev_get_drvdata(mdev_parent_dev(mdev));
+	if (!private)
+		return -ENODEV;
+
+	ctx = &private->io_trigger;
+
+	switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+	case VFIO_IRQ_SET_DATA_NONE:
+	{
+		if (*ctx)
+			eventfd_signal(*ctx, 1);
+		return 0;
+	}
+	case VFIO_IRQ_SET_DATA_BOOL:
+	{
+		uint8_t trigger;
+
+		if (get_user(trigger, (uint8_t __user *)data))
+			return -EFAULT;
+
+		if (trigger && *ctx)
+			eventfd_signal(*ctx, 1);
+		return 0;
+	}
+	case VFIO_IRQ_SET_DATA_EVENTFD:
+	{
+		int32_t fd;
+
+		if (get_user(fd, (int32_t __user *)data))
+			return -EFAULT;
+
+		if (fd == -1) {
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+			*ctx = NULL;
+		} else if (fd >= 0) {
+			struct eventfd_ctx *efdctx;
+
+			efdctx = eventfd_ctx_fdget(fd);
+			if (IS_ERR(efdctx))
+				return PTR_ERR(efdctx);
+
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+
+			*ctx = efdctx;
+		} else
+			return -EINVAL;
+
+		return 0;
+	}
+	default:
+		return -EINVAL;
+	}
+}
+
 static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
 				   unsigned int cmd,
 				   unsigned long arg)
@@ -277,6 +357,47 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
 
 		return copy_to_user((void __user *)arg, &info, minsz);
 	}
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz || info.index >= VFIO_CCW_NUM_IRQS)
+			return -EINVAL;
+
+		ret = vfio_ccw_mdev_get_irq_info(&info);
+		if (ret)
+			return ret;
+
+		if (info.count == -1)
+			return -EINVAL;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_SET_IRQS:
+	{
+		struct vfio_irq_set hdr;
+		size_t data_size;
+		void __user *data;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		ret = vfio_set_irqs_validate_and_prepare(&hdr, 1,
+							 VFIO_CCW_NUM_IRQS,
+							 &data_size);
+		if (ret)
+			return ret;
+
+		data = (void __user *)(arg + minsz);
+		return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, data);
+	}
 	case VFIO_DEVICE_RESET:
 		return vfio_ccw_mdev_reset(mdev);
 	default:
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 79e53378f212..dddab52913ed 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -11,6 +11,7 @@
 #define _VFIO_CCW_PRIVATE_H_
 
 #include <linux/completion.h>
+#include <linux/eventfd.h>
 #include <linux/vfio_ccw.h>
 
 #include "css.h"
@@ -29,6 +30,7 @@
  * @cp: channel program for the current I/O operation
  * @irb: irb info received from interrupt
  * @scsw: scsw info
+ * @io_trigger: eventfd ctx for signaling userspace I/O results
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
@@ -43,6 +45,8 @@ struct vfio_ccw_private {
 	struct channel_program	cp;
 	struct irb		irb;
 	union scsw		scsw;
+
+	struct eventfd_ctx	*io_trigger;
 } __aligned(8);
 
 extern int vfio_ccw_mdev_reg(struct subchannel *sch);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 3fd70ffe9d78..ae461050661a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -449,8 +449,9 @@ enum {
 };
 
 /*
- * The vfio-ccw bus driver makes use of the following fixed region.
- * Unimplemented regions return a size of zero.
+ * The vfio-ccw bus driver makes use of the following fixed region and
+ * IRQ index mapping. Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
  */
 
 enum {
@@ -458,6 +459,11 @@ enum {
 	VFIO_CCW_NUM_REGIONS
 };
 
+enum {
+	VFIO_CCW_IO_IRQ_INDEX,
+	VFIO_CCW_NUM_IRQS
+};
+
 /**
  * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
  *					      struct vfio_pci_hot_reset_info)

From e5f84dbaea59b4f712dac428c337528b70e1c533 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:39 +0100
Subject: [PATCH 46/74] vfio: ccw: return I/O results asynchronously

Introduce a singlethreaded workqueue to handle the I/O interrupts.
With the work added to this queue, we store the I/O results to the
io_region of the subchannel, then signal the userspace program to
handle the results.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-13-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_drv.c     | 58 +++++++++++++++++------------
 drivers/s390/cio/vfio_ccw_ops.c     |  3 --
 drivers/s390/cio/vfio_ccw_private.h |  7 ++--
 3 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index b5971d321697..3e0a40ec8ade 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -20,6 +20,8 @@
 #include "css.h"
 #include "vfio_ccw_private.h"
 
+struct workqueue_struct *vfio_ccw_work_q;
+
 /*
  * Helpers
  */
@@ -52,6 +54,7 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch)
 
 			spin_lock_irq(sch->lock);
 			private->completion = NULL;
+			flush_workqueue(vfio_ccw_work_q);
 			ret = cio_cancel_halt_clear(sch, &iretry);
 		};
 
@@ -63,18 +66,12 @@ out_unlock:
 	return ret;
 }
 
-static int doing_io(struct vfio_ccw_private *private, u32 intparm)
-{
-	return (private->intparm == intparm);
-}
-
 static int vfio_ccw_sch_io_helper(struct vfio_ccw_private *private)
 {
 	struct subchannel *sch;
 	union orb *orb;
 	int ccode;
 	__u8 lpm;
-	u32 intparm;
 
 	sch = private->sch;
 
@@ -89,7 +86,7 @@ static int vfio_ccw_sch_io_helper(struct vfio_ccw_private *private)
 		 * Initialize device status information
 		 */
 		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
-		break;
+		return 0;
 	case 1:		/* Status pending */
 	case 2:		/* Busy */
 		return -EBUSY;
@@ -109,15 +106,26 @@ static int vfio_ccw_sch_io_helper(struct vfio_ccw_private *private)
 	default:
 		return ccode;
 	}
+}
 
-	intparm = (u32)(addr_t)sch;
-	private->intparm = 0;
-	wait_event(private->wait_q, doing_io(private, intparm));
+static void vfio_ccw_sch_io_todo(struct work_struct *work)
+{
+	struct vfio_ccw_private *private;
+	struct subchannel *sch;
+	struct irb *irb;
 
-	if (scsw_is_solicited(&private->irb.scsw))
-		cp_update_scsw(&private->cp, &private->irb.scsw);
+	private = container_of(work, struct vfio_ccw_private, io_work);
+	irb = &private->irb;
+	sch = private->sch;
 
-	return 0;
+	if (scsw_is_solicited(&irb->scsw)) {
+		cp_update_scsw(&private->cp, &irb->scsw);
+		cp_free(&private->cp);
+	}
+	memcpy(private->io_region.irb_area, irb, sizeof(*irb));
+
+	if (private->io_trigger)
+		eventfd_signal(private->io_trigger, 1);
 }
 
 /* Deal with the ccw command request from the userspace. */
@@ -126,7 +134,6 @@ int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private)
 	struct mdev_device *mdev = private->mdev;
 	union orb *orb;
 	union scsw *scsw = &private->scsw;
-	struct irb *irb = &private->irb;
 	struct ccw_io_region *io_region = &private->io_region;
 	int ret;
 
@@ -147,12 +154,8 @@ int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private)
 
 		/* Start channel program and wait for I/O interrupt. */
 		ret = vfio_ccw_sch_io_helper(private);
-		if (!ret) {
-			/* Get irb info and copy it to irb_area. */
-			memcpy(io_region->irb_area, irb, sizeof(*irb));
-		}
-
-		cp_free(&private->cp);
+		if (!ret)
+			cp_free(&private->cp);
 	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
 		/* XXX: Handle halt. */
 		ret = -EOPNOTSUPP;
@@ -229,8 +232,8 @@ static void vfio_ccw_sch_irq(struct subchannel *sch)
 
 	irb = this_cpu_ptr(&cio_irb);
 	memcpy(&private->irb, irb, sizeof(*irb));
-	private->intparm = (u32)(addr_t)sch;
-	wake_up(&private->wait_q);
+
+	queue_work(vfio_ccw_work_q, &private->io_work);
 
 	if (private->completion)
 		complete(private->completion);
@@ -269,7 +272,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
 	if (ret)
 		goto out_rm_group;
 
-	init_waitqueue_head(&private->wait_q);
+	INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo);
 	atomic_set(&private->avail, 1);
 
 	return 0;
@@ -367,10 +370,16 @@ static int __init vfio_ccw_sch_init(void)
 {
 	int ret;
 
+	vfio_ccw_work_q = create_singlethread_workqueue("vfio-ccw");
+	if (!vfio_ccw_work_q)
+		return -ENOMEM;
+
 	isc_register(VFIO_CCW_ISC);
 	ret = css_driver_register(&vfio_ccw_sch_driver);
-	if (ret)
+	if (ret) {
 		isc_unregister(VFIO_CCW_ISC);
+		destroy_workqueue(vfio_ccw_work_q);
+	}
 
 	return ret;
 }
@@ -379,6 +388,7 @@ static void __exit vfio_ccw_sch_exit(void)
 {
 	css_driver_unregister(&vfio_ccw_sch_driver);
 	isc_unregister(VFIO_CCW_ISC);
+	destroy_workqueue(vfio_ccw_work_q);
 }
 module_init(vfio_ccw_sch_init);
 module_exit(vfio_ccw_sch_exit);
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 1294c5347410..d754d3d90574 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -202,9 +202,6 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 	if (region->ret_code != 0)
 		return region->ret_code;
 
-	if (private->io_trigger)
-		eventfd_signal(private->io_trigger, 1);
-
 	return count;
 }
 
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index dddab52913ed..2503797fb153 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -12,6 +12,7 @@
 
 #include <linux/completion.h>
 #include <linux/eventfd.h>
+#include <linux/workqueue.h>
 #include <linux/vfio_ccw.h>
 
 #include "css.h"
@@ -25,12 +26,11 @@
  * @mdev: pointer to the mediated device
  * @nb: notifier for vfio events
  * @io_region: MMIO region to input/output I/O arguments/results
- * @wait_q: wait for interrupt
- * @intparm: record current interrupt parameter, used for wait interrupt
  * @cp: channel program for the current I/O operation
  * @irb: irb info received from interrupt
  * @scsw: scsw info
  * @io_trigger: eventfd ctx for signaling userspace I/O results
+ * @io_work: work for deferral process of I/O handling
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
@@ -40,13 +40,12 @@ struct vfio_ccw_private {
 	struct notifier_block	nb;
 	struct ccw_io_region	io_region;
 
-	wait_queue_head_t	wait_q;
-	u32			intparm;
 	struct channel_program	cp;
 	struct irb		irb;
 	union scsw		scsw;
 
 	struct eventfd_ctx	*io_trigger;
+	struct work_struct	io_work;
 } __aligned(8);
 
 extern int vfio_ccw_mdev_reg(struct subchannel *sch);

From bbe37e4cb89702aa78e0f44618c5af7f9aaa33f6 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:40 +0100
Subject: [PATCH 47/74] vfio: ccw: introduce a finite state machine

The current implementation doesn't check if the subchannel is in a
proper device state when handling an event. Let's introduce
a finite state machine to manage the state/event change.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-14-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/Makefile           |   2 +-
 drivers/s390/cio/vfio_ccw_drv.c     | 116 ++--------------
 drivers/s390/cio/vfio_ccw_fsm.c     | 207 ++++++++++++++++++++++++++++
 drivers/s390/cio/vfio_ccw_ops.c     |  28 +++-
 drivers/s390/cio/vfio_ccw_private.h |  41 +++++-
 5 files changed, 287 insertions(+), 107 deletions(-)
 create mode 100644 drivers/s390/cio/vfio_ccw_fsm.c

diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile
index b0586b223502..bdf47526038a 100644
--- a/drivers/s390/cio/Makefile
+++ b/drivers/s390/cio/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o
 qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o
 obj-$(CONFIG_QDIO) += qdio.o
 
-vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o
+vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o
 obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index 3e0a40ec8ade..e90dd43d2a55 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -60,54 +60,12 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch)
 
 		ret = cio_disable_subchannel(sch);
 	} while (ret == -EBUSY);
-
 out_unlock:
+	private->state = VFIO_CCW_STATE_NOT_OPER;
 	spin_unlock_irq(sch->lock);
 	return ret;
 }
 
-static int vfio_ccw_sch_io_helper(struct vfio_ccw_private *private)
-{
-	struct subchannel *sch;
-	union orb *orb;
-	int ccode;
-	__u8 lpm;
-
-	sch = private->sch;
-
-	orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
-
-	/* Issue "Start Subchannel" */
-	ccode = ssch(sch->schid, orb);
-
-	switch (ccode) {
-	case 0:
-		/*
-		 * Initialize device status information
-		 */
-		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
-		return 0;
-	case 1:		/* Status pending */
-	case 2:		/* Busy */
-		return -EBUSY;
-	case 3:		/* Device/path not operational */
-	{
-		lpm = orb->cmd.lpm;
-		if (lpm != 0)
-			sch->lpm &= ~lpm;
-		else
-			sch->lpm = 0;
-
-		if (cio_update_schib(sch))
-			return -ENODEV;
-
-		return sch->lpm ? -EACCES : -ENODEV;
-	}
-	default:
-		return ccode;
-	}
-}
-
 static void vfio_ccw_sch_io_todo(struct work_struct *work)
 {
 	struct vfio_ccw_private *private;
@@ -126,47 +84,9 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work)
 
 	if (private->io_trigger)
 		eventfd_signal(private->io_trigger, 1);
-}
 
-/* Deal with the ccw command request from the userspace. */
-int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private)
-{
-	struct mdev_device *mdev = private->mdev;
-	union orb *orb;
-	union scsw *scsw = &private->scsw;
-	struct ccw_io_region *io_region = &private->io_region;
-	int ret;
-
-	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
-
-	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
-		orb = (union orb *)io_region->orb_area;
-
-		ret = cp_init(&private->cp, mdev_dev(mdev), orb);
-		if (ret)
-			return ret;
-
-		ret = cp_prefetch(&private->cp);
-		if (ret) {
-			cp_free(&private->cp);
-			return ret;
-		}
-
-		/* Start channel program and wait for I/O interrupt. */
-		ret = vfio_ccw_sch_io_helper(private);
-		if (!ret)
-			cp_free(&private->cp);
-	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
-		/* XXX: Handle halt. */
-		ret = -EOPNOTSUPP;
-	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
-		/* XXX: Handle clear. */
-		ret = -EOPNOTSUPP;
-	} else {
-		ret = -EOPNOTSUPP;
-	}
-
-	return ret;
+	if (private->mdev)
+		private->state = VFIO_CCW_STATE_IDLE;
 }
 
 /*
@@ -223,20 +143,9 @@ static struct attribute_group vfio_subchannel_attr_group = {
 static void vfio_ccw_sch_irq(struct subchannel *sch)
 {
 	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
-	struct irb *irb;
 
 	inc_irq_stat(IRQIO_CIO);
-
-	if (!private)
-		return;
-
-	irb = this_cpu_ptr(&cio_irb);
-	memcpy(&private->irb, irb, sizeof(*irb));
-
-	queue_work(vfio_ccw_work_q, &private->io_work);
-
-	if (private->completion)
-		complete(private->completion);
+	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT);
 }
 
 static int vfio_ccw_sch_probe(struct subchannel *sch)
@@ -258,6 +167,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
 	dev_set_drvdata(&sch->dev, private);
 
 	spin_lock_irq(sch->lock);
+	private->state = VFIO_CCW_STATE_NOT_OPER;
 	sch->isc = VFIO_CCW_ISC;
 	ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
 	spin_unlock_irq(sch->lock);
@@ -274,6 +184,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
 
 	INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo);
 	atomic_set(&private->avail, 1);
+	private->state = VFIO_CCW_STATE_STANDBY;
 
 	return 0;
 
@@ -321,6 +232,7 @@ static void vfio_ccw_sch_shutdown(struct subchannel *sch)
  */
 static int vfio_ccw_sch_event(struct subchannel *sch, int process)
 {
+	struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
 	unsigned long flags;
 
 	spin_lock_irqsave(sch->lock, flags);
@@ -331,16 +243,16 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process)
 		goto out_unlock;
 
 	if (cio_update_schib(sch)) {
-		/* Not operational. */
-		css_sched_sch_todo(sch, SCH_TODO_UNREG);
-
-		/*
-		 * TODO:
-		 * Probably we should send the machine check to the guest.
-		 */
+		vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER);
 		goto out_unlock;
 	}
 
+	private = dev_get_drvdata(&sch->dev);
+	if (private->state == VFIO_CCW_STATE_NOT_OPER) {
+		private->state = private->mdev ? VFIO_CCW_STATE_IDLE :
+				 VFIO_CCW_STATE_STANDBY;
+	}
+
 out_unlock:
 	spin_unlock_irqrestore(sch->lock, flags);
 
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
new file mode 100644
index 000000000000..55b6cc55758e
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -0,0 +1,207 @@
+/*
+ * Finite state machine for vfio-ccw device handling
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ */
+
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+
+#include "ioasm.h"
+#include "vfio_ccw_private.h"
+
+static int fsm_io_helper(struct vfio_ccw_private *private)
+{
+	struct subchannel *sch;
+	union orb *orb;
+	int ccode;
+	__u8 lpm;
+	unsigned long flags;
+
+	sch = private->sch;
+
+	spin_lock_irqsave(sch->lock, flags);
+	private->state = VFIO_CCW_STATE_BUSY;
+	spin_unlock_irqrestore(sch->lock, flags);
+
+	orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
+
+	/* Issue "Start Subchannel" */
+	ccode = ssch(sch->schid, orb);
+
+	switch (ccode) {
+	case 0:
+		/*
+		 * Initialize device status information
+		 */
+		sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
+		return 0;
+	case 1:		/* Status pending */
+	case 2:		/* Busy */
+		return -EBUSY;
+	case 3:		/* Device/path not operational */
+	{
+		lpm = orb->cmd.lpm;
+		if (lpm != 0)
+			sch->lpm &= ~lpm;
+		else
+			sch->lpm = 0;
+
+		if (cio_update_schib(sch))
+			return -ENODEV;
+
+		return sch->lpm ? -EACCES : -ENODEV;
+	}
+	default:
+		return ccode;
+	}
+}
+
+static void fsm_notoper(struct vfio_ccw_private *private,
+			enum vfio_ccw_event event)
+{
+	struct subchannel *sch = private->sch;
+
+	/*
+	 * TODO:
+	 * Probably we should send the machine check to the guest.
+	 */
+	css_sched_sch_todo(sch, SCH_TODO_UNREG);
+	private->state = VFIO_CCW_STATE_NOT_OPER;
+}
+
+/*
+ * No operation action.
+ */
+static void fsm_nop(struct vfio_ccw_private *private,
+		    enum vfio_ccw_event event)
+{
+}
+
+static void fsm_io_error(struct vfio_ccw_private *private,
+			 enum vfio_ccw_event event)
+{
+	pr_err("vfio-ccw: FSM: I/O request from state:%d\n", private->state);
+	private->io_region.ret_code = -EIO;
+}
+
+static void fsm_io_busy(struct vfio_ccw_private *private,
+			enum vfio_ccw_event event)
+{
+	private->io_region.ret_code = -EBUSY;
+}
+
+static void fsm_disabled_irq(struct vfio_ccw_private *private,
+			     enum vfio_ccw_event event)
+{
+	struct subchannel *sch = private->sch;
+
+	/*
+	 * An interrupt in a disabled state means a previous disable was not
+	 * successful - should not happen, but we try to disable again.
+	 */
+	cio_disable_subchannel(sch);
+}
+
+/*
+ * Deal with the ccw command request from the userspace.
+ */
+static void fsm_io_request(struct vfio_ccw_private *private,
+			   enum vfio_ccw_event event)
+{
+	union orb *orb;
+	union scsw *scsw = &private->scsw;
+	struct ccw_io_region *io_region = &private->io_region;
+	struct mdev_device *mdev = private->mdev;
+
+	private->state = VFIO_CCW_STATE_BOXED;
+
+	memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
+
+	if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
+		orb = (union orb *)io_region->orb_area;
+
+		io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev),
+					      orb);
+		if (io_region->ret_code)
+			goto err_out;
+
+		io_region->ret_code = cp_prefetch(&private->cp);
+		if (io_region->ret_code) {
+			cp_free(&private->cp);
+			goto err_out;
+		}
+
+		/* Start channel program and wait for I/O interrupt. */
+		io_region->ret_code = fsm_io_helper(private);
+		if (io_region->ret_code) {
+			cp_free(&private->cp);
+			goto err_out;
+		}
+		return;
+	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
+		/* XXX: Handle halt. */
+		io_region->ret_code = -EOPNOTSUPP;
+		goto err_out;
+	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
+		/* XXX: Handle clear. */
+		io_region->ret_code = -EOPNOTSUPP;
+		goto err_out;
+	}
+
+err_out:
+	private->state = VFIO_CCW_STATE_IDLE;
+}
+
+/*
+ * Got an interrupt for a normal io (state busy).
+ */
+static void fsm_irq(struct vfio_ccw_private *private,
+		    enum vfio_ccw_event event)
+{
+	struct irb *irb;
+
+	if (!private)
+		return;
+
+	irb = this_cpu_ptr(&cio_irb);
+	memcpy(&private->irb, irb, sizeof(*irb));
+
+	queue_work(vfio_ccw_work_q, &private->io_work);
+
+	if (private->completion)
+		complete(private->completion);
+}
+
+/*
+ * Device statemachine
+ */
+fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS] = {
+	[VFIO_CCW_STATE_NOT_OPER] = {
+		[VFIO_CCW_EVENT_NOT_OPER]	= fsm_nop,
+		[VFIO_CCW_EVENT_IO_REQ]		= fsm_io_error,
+		[VFIO_CCW_EVENT_INTERRUPT]	= fsm_disabled_irq,
+	},
+	[VFIO_CCW_STATE_STANDBY] = {
+		[VFIO_CCW_EVENT_NOT_OPER]	= fsm_notoper,
+		[VFIO_CCW_EVENT_IO_REQ]		= fsm_io_error,
+		[VFIO_CCW_EVENT_INTERRUPT]	= fsm_irq,
+	},
+	[VFIO_CCW_STATE_IDLE] = {
+		[VFIO_CCW_EVENT_NOT_OPER]	= fsm_notoper,
+		[VFIO_CCW_EVENT_IO_REQ]		= fsm_io_request,
+		[VFIO_CCW_EVENT_INTERRUPT]	= fsm_irq,
+	},
+	[VFIO_CCW_STATE_BOXED] = {
+		[VFIO_CCW_EVENT_NOT_OPER]	= fsm_notoper,
+		[VFIO_CCW_EVENT_IO_REQ]		= fsm_io_busy,
+		[VFIO_CCW_EVENT_INTERRUPT]	= fsm_irq,
+	},
+	[VFIO_CCW_STATE_BUSY] = {
+		[VFIO_CCW_EVENT_NOT_OPER]	= fsm_notoper,
+		[VFIO_CCW_EVENT_IO_REQ]		= fsm_io_busy,
+		[VFIO_CCW_EVENT_INTERRUPT]	= fsm_irq,
+	},
+};
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index d754d3d90574..b2e615404034 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -35,7 +35,11 @@ static int vfio_ccw_mdev_reset(struct mdev_device *mdev)
 	if (ret)
 		return ret;
 
-	return cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+	ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+	if (!ret)
+		private->state = VFIO_CCW_STATE_IDLE;
+
+	return ret;
 }
 
 static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
@@ -112,10 +116,14 @@ static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
 	struct vfio_ccw_private *private =
 		dev_get_drvdata(mdev_parent_dev(mdev));
 
+	if (private->state == VFIO_CCW_STATE_NOT_OPER)
+		return -ENODEV;
+
 	if (atomic_dec_if_positive(&private->avail) < 0)
 		return -EPERM;
 
 	private->mdev = mdev;
+	private->state = VFIO_CCW_STATE_IDLE;
 
 	return 0;
 }
@@ -126,10 +134,20 @@ static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
 		dev_get_drvdata(mdev_parent_dev(mdev));
 	int ret;
 
+	if (!private)
+		goto out;
+
+	if ((private->state == VFIO_CCW_STATE_NOT_OPER) ||
+	    (private->state == VFIO_CCW_STATE_STANDBY))
+		goto out;
+
 	ret = vfio_ccw_mdev_reset(mdev);
 	if (ret)
 		return ret;
 
+	private->state = VFIO_CCW_STATE_STANDBY;
+
+out:
 	private->mdev = NULL;
 	atomic_inc(&private->avail);
 
@@ -193,14 +211,18 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
 	if (!private)
 		return -ENODEV;
+	if (private->state != VFIO_CCW_STATE_IDLE)
+		return -EACCES;
 
 	region = &private->io_region;
 	if (copy_from_user((void *)region + *ppos, buf, count))
 		return -EFAULT;
 
-	region->ret_code = vfio_ccw_sch_cmd_request(private);
-	if (region->ret_code != 0)
+	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ);
+	if (region->ret_code != 0) {
+		private->state = VFIO_CCW_STATE_IDLE;
 		return region->ret_code;
+	}
 
 	return count;
 }
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 2503797fb153..fc0f01c16ef9 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -21,6 +21,7 @@
 /**
  * struct vfio_ccw_private
  * @sch: pointer to the subchannel
+ * @state: internal state of the device
  * @completion: synchronization helper of the I/O completion
  * @avail: available for creating a mediated device
  * @mdev: pointer to the mediated device
@@ -34,6 +35,7 @@
  */
 struct vfio_ccw_private {
 	struct subchannel	*sch;
+	int			state;
 	struct completion	*completion;
 	atomic_t		avail;
 	struct mdev_device	*mdev;
@@ -52,6 +54,43 @@ extern int vfio_ccw_mdev_reg(struct subchannel *sch);
 extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
 
 extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
-extern int vfio_ccw_sch_cmd_request(struct vfio_ccw_private *private);
+
+/*
+ * States of the device statemachine.
+ */
+enum vfio_ccw_state {
+	VFIO_CCW_STATE_NOT_OPER,
+	VFIO_CCW_STATE_STANDBY,
+	VFIO_CCW_STATE_IDLE,
+	VFIO_CCW_STATE_BOXED,
+	VFIO_CCW_STATE_BUSY,
+	/* last element! */
+	NR_VFIO_CCW_STATES
+};
+
+/*
+ * Asynchronous events of the device statemachine.
+ */
+enum vfio_ccw_event {
+	VFIO_CCW_EVENT_NOT_OPER,
+	VFIO_CCW_EVENT_IO_REQ,
+	VFIO_CCW_EVENT_INTERRUPT,
+	/* last element! */
+	NR_VFIO_CCW_EVENTS
+};
+
+/*
+ * Action called through jumptable.
+ */
+typedef void (fsm_func_t)(struct vfio_ccw_private *, enum vfio_ccw_event);
+extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS];
+
+static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private,
+				     int event)
+{
+	vfio_ccw_jumptable[private->state][event](private, event);
+}
+
+extern struct workqueue_struct *vfio_ccw_work_q;
 
 #endif

From 25627ba3899e71fc205e172b676b207ee0ac1e6a Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:41 +0100
Subject: [PATCH 48/74] docs: add documentation for vfio-ccw

Add file Documentation/s390/vfio-ccw.txt that includes details
of vfio-ccw.

Acked-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-15-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 Documentation/s390/00-INDEX     |   2 +
 Documentation/s390/vfio-ccw.txt | 303 ++++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 Documentation/s390/vfio-ccw.txt

diff --git a/Documentation/s390/00-INDEX b/Documentation/s390/00-INDEX
index 9189535f6cd2..317f0378ae01 100644
--- a/Documentation/s390/00-INDEX
+++ b/Documentation/s390/00-INDEX
@@ -22,5 +22,7 @@ qeth.txt
 	- HiperSockets Bridge Port Support.
 s390dbf.txt
 	- information on using the s390 debug feature.
+vfio-ccw.txt
+	  information on the vfio-ccw I/O subchannel driver.
 zfcpdump.txt
 	- information on the s390 SCSI dump tool.
diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt
new file mode 100644
index 000000000000..90b3dfead81b
--- /dev/null
+++ b/Documentation/s390/vfio-ccw.txt
@@ -0,0 +1,303 @@
+vfio-ccw: the basic infrastructure
+==================================
+
+Introduction
+------------
+
+Here we describe the vfio support for I/O subchannel devices for
+Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
+virtual machine, while vfio is the means.
+
+Different than other hardware architectures, s390 has defined a unified
+I/O access method, which is so called Channel I/O. It has its own access
+patterns:
+- Channel programs run asynchronously on a separate (co)processor.
+- The channel subsystem will access any memory designated by the caller
+  in the channel program directly, i.e. there is no iommu involved.
+Thus when we introduce vfio support for these devices, we realize it
+with a mediated device (mdev) implementation. The vfio mdev will be
+added to an iommu group, so as to make itself able to be managed by the
+vfio framework. And we add read/write callbacks for special vfio I/O
+regions to pass the channel programs from the mdev to its parent device
+(the real I/O subchannel device) to do further address translation and
+to perform I/O instructions.
+
+This document does not intend to explain the s390 I/O architecture in
+every detail. More information/reference could be found here:
+- A good start to know Channel I/O in general:
+  https://en.wikipedia.org/wiki/Channel_I/O
+- s390 architecture:
+  s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+- The existing Qemu code which implements a simple emulated channel
+  subsystem could also be a good reference. It makes it easier to follow
+  the flow.
+  qemu/hw/s390x/css.c
+
+For vfio mediated device framework:
+- Documentation/vfio-mediated-device.txt
+
+Motivation of vfio-ccw
+----------------------
+
+Currently, a guest virtualized via qemu/kvm on s390 only sees
+paravirtualized virtio devices via the "Virtio Over Channel I/O
+(virtio-ccw)" transport. This makes virtio devices discoverable via
+standard operating system algorithms for handling channel devices.
+
+However this is not enough. On s390 for the majority of devices, which
+use the standard Channel I/O based mechanism, we also need to provide
+the functionality of passing through them to a Qemu virtual machine.
+This includes devices that don't have a virtio counterpart (e.g. tape
+drives) or that have specific characteristics which guests want to
+exploit.
+
+For passing a device to a guest, we want to use the same interface as
+everybody else, namely vfio. Thus, we would like to introduce vfio
+support for channel devices. And we would like to name this new vfio
+device "vfio-ccw".
+
+Access patterns of CCW devices
+------------------------------
+
+s390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the
+systems. Though the s390 hardware platform knows about a huge variety of
+different peripheral attachments like disk devices (aka. DASDs), tapes,
+communication controllers, etc. They can all be accessed by a well
+defined access method and they are presenting I/O completion a unified
+way: I/O interruptions.
+
+All I/O requires the use of channel command words (CCWs). A CCW is an
+instruction to a specialized I/O channel processor. A channel program is
+a sequence of CCWs which are executed by the I/O channel subsystem.  To
+issue a channel program to the channel subsystem, it is required to
+build an operation request block (ORB), which can be used to point out
+the format of the CCW and other control information to the system. The
+operating system signals the I/O channel subsystem to begin executing
+the channel program with a SSCH (start sub-channel) instruction. The
+central processor is then free to proceed with non-I/O instructions
+until interrupted. The I/O completion result is received by the
+interrupt handler in the form of interrupt response block (IRB).
+
+Back to vfio-ccw, in short:
+- ORBs and channel programs are built in guest kernel (with guest
+  physical addresses).
+- ORBs and channel programs are passed to the host kernel.
+- Host kernel translates the guest physical addresses to real addresses
+  and starts the I/O with issuing a privileged Channel I/O instruction
+  (e.g SSCH).
+- channel programs run asynchronously on a separate processor.
+- I/O completion will be signaled to the host with I/O interruptions.
+  And it will be copied as IRB to user space to pass it back to the
+  guest.
+
+Physical vfio ccw device and its child mdev
+-------------------------------------------
+
+As mentioned above, we realize vfio-ccw with a mdev implementation.
+
+Channel I/O does not have IOMMU hardware support, so the physical
+vfio-ccw device does not have an IOMMU level translation or isolation.
+
+Sub-channel I/O instructions are all privileged instructions, When
+handling the I/O instruction interception, vfio-ccw has the software
+policing and translation how the channel program is programmed before
+it gets sent to hardware.
+
+Within this implementation, we have two drivers for two types of
+devices:
+- The vfio_ccw driver for the physical subchannel device.
+  This is an I/O subchannel driver for the real subchannel device.  It
+  realizes a group of callbacks and registers to the mdev framework as a
+  parent (physical) device. As a consequence, mdev provides vfio_ccw a
+  generic interface (sysfs) to create mdev devices. A vfio mdev could be
+  created by vfio_ccw then and added to the mediated bus. It is the vfio
+  device that added to an IOMMU group and a vfio group.
+  vfio_ccw also provides an I/O region to accept channel program
+  request from user space and store I/O interrupt result for user
+  space to retrieve. To notify user space an I/O completion, it offers
+  an interface to setup an eventfd fd for asynchronous signaling.
+
+- The vfio_mdev driver for the mediated vfio ccw device.
+  This is provided by the mdev framework. It is a vfio device driver for
+  the mdev that created by vfio_ccw.
+  It realize a group of vfio device driver callbacks, adds itself to a
+  vfio group, and registers itself to the mdev framework as a mdev
+  driver.
+  It uses a vfio iommu backend that uses the existing map and unmap
+  ioctls, but rather than programming them into an IOMMU for a device,
+  it simply stores the translations for use by later requests. This
+  means that a device programmed in a VM with guest physical addresses
+  can have the vfio kernel convert that address to process virtual
+  address, pin the page and program the hardware with the host physical
+  address in one step.
+  For a mdev, the vfio iommu backend will not pin the pages during the
+  VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
+  of the iova<->vaddr mappings in this operation. And they export a
+  vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
+  backend for the physical devices to pin and unpin pages by demand.
+
+Below is a high Level block diagram.
+
+ +-------------+
+ |             |
+ | +---------+ | mdev_register_driver() +--------------+
+ | |  Mdev   | +<-----------------------+              |
+ | |  bus    | |                        | vfio_mdev.ko |
+ | | driver  | +----------------------->+              |<-> VFIO user
+ | +---------+ |    probe()/remove()    +--------------+    APIs
+ |             |
+ |  MDEV CORE  |
+ |   MODULE    |
+ |   mdev.ko   |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+              |
+ | | device  | |                        |  vfio_ccw.ko |<-> subchannel
+ | |interface| +----------------------->+              |     device
+ | +---------+ |       callback         +--------------+
+ +-------------+
+
+The process of how these work together.
+1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
+   physical device (with callbacks) to mdev framework.
+   When vfio_ccw probing the subchannel device, it registers device
+   pointer and callbacks to the mdev framework. Mdev related file nodes
+   under the device node in sysfs would be created for the subchannel
+   device, namely 'mdev_create', 'mdev_destroy' and
+   'mdev_supported_types'.
+2. Create a mediated vfio ccw device.
+   Use the 'mdev_create' sysfs file, we need to manually create one (and
+   only one for our case) mediated device.
+3. vfio_mdev.ko drives the mediated ccw device.
+   vfio_mdev is also the vfio device drvier. It will probe the mdev and
+   add it to an iommu_group and a vfio_group. Then we could pass through
+   the mdev to a guest.
+
+vfio-ccw I/O region
+-------------------
+
+An I/O region is used to accept channel program request from user
+space and store I/O interrupt result for user space to retrieve. The
+defination of the region is:
+
+struct ccw_io_region {
+#define ORB_AREA_SIZE 12
+	__u8	orb_area[ORB_AREA_SIZE];
+#define SCSW_AREA_SIZE 12
+	__u8	scsw_area[SCSW_AREA_SIZE];
+#define IRB_AREA_SIZE 96
+	__u8	irb_area[IRB_AREA_SIZE];
+	__u32	ret_code;
+} __packed;
+
+While starting an I/O request, orb_area should be filled with the
+guest ORB, and scsw_area should be filled with the SCSW of the Virtual
+Subchannel.
+
+irb_area stores the I/O result.
+
+ret_code stores a return code for each access of the region.
+
+vfio-ccw patches overview
+-------------------------
+
+For now, our patches are rebased on the latest mdev implementation.
+vfio-ccw follows what vfio-pci did on the s390 paltform and uses
+vfio-iommu-type1 as the vfio iommu backend. It's a good start to launch
+the code review for vfio-ccw. Note that the implementation is far from
+complete yet; but we'd like to get feedback for the general
+architecture.
+
+* CCW translation APIs
+- Description:
+  These introduce a group of APIs (start with 'cp_') to do CCW
+  translation. The CCWs passed in by a user space program are
+  organized with their guest physical memory addresses. These APIs
+  will copy the CCWs into the kernel space, and assemble a runnable
+  kernel channel program by updating the guest physical addresses with
+  their corresponding host physical addresses.
+- Patches:
+  vfio: ccw: introduce channel program interfaces
+
+* vfio_ccw device driver
+- Description:
+  The following patches utilizes the CCW translation APIs and introduce
+  vfio_ccw, which is the driver for the I/O subchannel devices you want
+  to pass through.
+  vfio_ccw implements the following vfio ioctls:
+    VFIO_DEVICE_GET_INFO
+    VFIO_DEVICE_GET_IRQ_INFO
+    VFIO_DEVICE_GET_REGION_INFO
+    VFIO_DEVICE_RESET
+    VFIO_DEVICE_SET_IRQS
+  This provides an I/O region, so that the user space program can pass a
+  channel program to the kernel, to do further CCW translation before
+  issuing them to a real device.
+  This also provides the SET_IRQ ioctl to setup an event notifier to
+  notify the user space program the I/O completion in an asynchronous
+  way.
+- Patches:
+  vfio: ccw: basic implementation for vfio_ccw driver
+  vfio: ccw: introduce ccw_io_region
+  vfio: ccw: realize VFIO_DEVICE_GET_REGION_INFO ioctl
+  vfio: ccw: realize VFIO_DEVICE_RESET ioctl
+  vfio: ccw: realize VFIO_DEVICE_G(S)ET_IRQ_INFO ioctls
+
+The user of vfio-ccw is not limited to Qemu, while Qemu is definitely a
+good example to get understand how these patches work. Here is a little
+bit more detail how an I/O request triggered by the Qemu guest will be
+handled (without error handling).
+
+Explanation:
+Q1-Q7: Qemu side process.
+K1-K5: Kernel side process.
+
+Q1. Get I/O region info during initialization.
+Q2. Setup event notifier and handler to handle I/O completion.
+
+... ...
+
+Q3. Intercept a ssch instruction.
+Q4. Write the guest channel program and ORB to the I/O region.
+    K1. Copy from guest to kernel.
+    K2. Translate the guest channel program to a host kernel space
+        channel program, which becomes runnable for a real device.
+    K3. With the necessary information contained in the orb passed in
+        by Qemu, issue the ccwchain to the device.
+    K4. Return the ssch CC code.
+Q5. Return the CC code to the guest.
+
+... ...
+
+    K5. Interrupt handler gets the I/O result and write the result to
+        the I/O region.
+    K6. Signal Qemu to retrieve the result.
+Q6. Get the signal and event handler reads out the result from the I/O
+    region.
+Q7. Update the irb for the guest.
+
+Limitations
+-----------
+
+The current vfio-ccw implementation focuses on supporting basic commands
+needed to implement block device functionality (read/write) of DASD/ECKD
+device only. Some commands may need special handling in the future, for
+example, anything related to path grouping.
+
+DASD is a kind of storage device. While ECKD is a data recording format.
+More information for DASD and ECKD could be found here:
+https://en.wikipedia.org/wiki/Direct-access_storage_device
+https://en.wikipedia.org/wiki/Count_key_data
+
+Together with the corresponding work in Qemu, we can bring the passed
+through DASD/ECKD device online in a guest now and use it as a block
+device.
+
+Reference
+---------
+1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
+3. https://en.wikipedia.org/wiki/Channel_I/O
+4. Documentation/s390/cds.txt
+5. Documentation/vfio.txt
+6. Documentation/vfio-mediated-device.txt

From d686f21ace295f224eb2320bf1a8a20835de8494 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:42 +0100
Subject: [PATCH 49/74] vfio: ccw: introduce support for ccw0

Although Linux does not use format-0 channel command words (CCW0)
these are a non-optional part of the platform spec, and for the sake
of platform compliance, and possibly some non-Linux guests, we have
to support CCW0.

Making the kernel execute a format 0 channel program is too much hassle
because we would need to allocate and use memory which can be addressed
by 24 bit physical addresses (because of CCW0.cda). So we implement CCW0
support by translating the channel program into an equivalent CCW1
program instead.

Based upon an orginal patch by Kai Yue Wang.
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-16-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 arch/s390/include/asm/cio.h    | 18 ++++++++++++++++++
 drivers/s390/cio/vfio_ccw_cp.c | 32 +++++++++++++++++++++++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index f7ed88cc066e..7a38ca85190b 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -33,6 +33,24 @@ struct ccw1 {
 	__u32 cda;
 } __attribute__ ((packed,aligned(8)));
 
+/**
+ * struct ccw0 - channel command word
+ * @cmd_code: command code
+ * @cda: data address
+ * @flags: flags, like IDA addressing, etc.
+ * @reserved: will be ignored
+ * @count: byte count
+ *
+ * The format-0 ccw structure.
+ */
+struct ccw0 {
+	__u8 cmd_code;
+	__u32 cda : 24;
+	__u8  flags;
+	__u8  reserved;
+	__u16 count;
+} __packed __aligned(8);
+
 #define CCW_FLAG_DC		0x80
 #define CCW_FLAG_CC		0x40
 #define CCW_FLAG_SLI		0x20
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index 16bbb54ee532..ba6ac83a6c25 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -247,7 +247,34 @@ static long copy_ccw_from_iova(struct channel_program *cp,
 			       struct ccw1 *to, u64 iova,
 			       unsigned long len)
 {
-	return copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1));
+	struct ccw0 ccw0;
+	struct ccw1 *pccw1;
+	int ret;
+	int i;
+
+	ret = copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1));
+	if (ret)
+		return ret;
+
+	if (!cp->orb.cmd.fmt) {
+		pccw1 = to;
+		for (i = 0; i < len; i++) {
+			ccw0 = *(struct ccw0 *)pccw1;
+			if ((pccw1->cmd_code & 0x0f) == CCW_CMD_TIC) {
+				pccw1->cmd_code = CCW_CMD_TIC;
+				pccw1->flags = 0;
+				pccw1->count = 0;
+			} else {
+				pccw1->cmd_code = ccw0.cmd_code;
+				pccw1->flags = ccw0.flags;
+				pccw1->count = ccw0.count;
+			}
+			pccw1->cda = ccw0.cda;
+			pccw1++;
+		}
+	}
+
+	return ret;
 }
 
 /*
@@ -616,9 +643,8 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb)
 	 * Only support prefetch enable mode now.
 	 * Only support 64bit addressing idal.
 	 * Only support 4k IDAW.
-	 * Only support ccw1.
 	 */
-	if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k || !orb->cmd.fmt)
+	if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k)
 		return -EOPNOTSUPP;
 
 	INIT_LIST_HEAD(&cp->ccwchain_list);

From 1877888d0ad21858693d8a5594734b125a068638 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 04:17:43 +0100
Subject: [PATCH 50/74] MAINTAINERS: Add vfio-ccw maintainers

Add Cornelia Huck and myself as the vfio-ccw driver maintainers.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170317031743.40128-17-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 MAINTAINERS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c776906f67a9..01a66752c1cb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10859,6 +10859,16 @@ W:	http://www.ibm.com/developerworks/linux/linux390/
 S:	Supported
 F:	drivers/iommu/s390-iommu.c
 
+S390 VFIO-CCW DRIVER
+M:	Cornelia Huck <cornelia.huck@de.ibm.com>
+M:	Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+L:	linux-s390@vger.kernel.org
+L:	kvm@vger.kernel.org
+S:	Supported
+F:	drivers/s390/cio/vfio_ccw*
+F:	Documentation/s390/vfio-ccw.txt
+F:	include/uapi/linux/vfio_ccw.h
+
 S3C24XX SD/MMC Driver
 M:	Ben Dooks <ben-linux@fluff.org>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)

From cab36c262ef9a5ddf3c7ae0f8031b191338b3142 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 3 Apr 2017 13:30:23 +0200
Subject: [PATCH 51/74] s390: use 64-bit lctlg to load task pid to cr4 on
 context switch

The 32-bit lctl instruction is quite a bit slower than the 64-bit
counter part lctlg. Use the faster instruction.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index fa8b8f28e08b..02f11018e2df 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -190,7 +190,9 @@ ENTRY(__switch_to)
 	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next
 	/* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */
-	lctl	%c4,%c4,__TASK_pid(%r3)		# load pid to control reg. 4
+	xc	__SF_EMPTY(8,%r15),__SF_EMPTY(%r15)
+	mvc	__SF_EMPTY+4(4,%r15),__TASK_pid(%r3)
+	lctlg	%c4,%c4,__SF_EMPTY(%r15)	# load pid to control reg. 4
 	mvc	__LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP

From 02f5cb9fe851e924775b79f466133be72affdc9a Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 18:22:14 +0200
Subject: [PATCH 52/74] s390/pci: remove unused function

barsize was never used. Get rid of it.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 364b9d824be3..f6ed48c17a9d 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -871,11 +871,6 @@ int zpci_report_error(struct pci_dev *pdev,
 }
 EXPORT_SYMBOL(zpci_report_error);
 
-static inline int barsize(u8 size)
-{
-	return (size) ? (1 << size) >> 10 : 0;
-}
-
 static int zpci_mem_init(void)
 {
 	BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) ||

From 4e0cca7d64b62b5974fa198acce0d3b476ca9914 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 18:23:01 +0200
Subject: [PATCH 53/74] s390/pci: remove duplicated define

Address space identifiers are already defined in <asm/pci_insn.h>.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index f6ed48c17a9d..e6cb96a1171f 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -214,8 +214,6 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
 	return rc;
 }
 
-#define ZPCI_PCIAS_CFGSPC	15
-
 static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len)
 {
 	u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len);

From b0c8ce897a42ab357001f275fa80ba82c8d4555f Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 19:07:24 +0200
Subject: [PATCH 54/74] s390/pci: reduce iomap size (even more)

Commit c506fff3d3a8 ("s390/pci: resize iomap") reduced the iomap
to NR_FUNCTIONS * PCI_BAR_COUNT elements. Since we only support
functions with 64bit BARs we can cut that number in half.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index e6cb96a1171f..ba56b9d9c765 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -69,7 +69,7 @@ static struct airq_struct zpci_airq = {
 };
 
 #define ZPCI_IOMAP_ENTRIES						\
-	min(((unsigned long) CONFIG_PCI_NR_FUNCTIONS * PCI_BAR_COUNT),	\
+	min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2),	\
 	    ZPCI_IOMAP_MAX_ENTRIES)
 
 static DEFINE_SPINLOCK(zpci_iomap_lock);

From b18601b076fe32c6556e9bfe7cb811d86625e450 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 19:09:15 +0200
Subject: [PATCH 55/74] s390/pci: increase the PCI_NR_FUNCTIONS default

Users complained that they are hitting the limit of 64 functions (some
physical functions can spawn lots of virtual functions). Double the
default limit. With the latest savings in static data usage this
increases the image size by only 520 bytes.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ca2fe764be2d..249c2771be0e 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -624,7 +624,7 @@ if PCI
 config PCI_NR_FUNCTIONS
 	int "Maximum number of PCI functions (1-4096)"
 	range 1 4096
-	default "64"
+	default "128"
 	help
 	  This allows you to specify the maximum number of PCI functions which
 	  this kernel will support.

From 561ecb0cf6904ce6aac8597004f311d215571625 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 27 Mar 2017 19:22:27 +0200
Subject: [PATCH 56/74] s390/pci: remove forward declaration

Move a struct definition to get rid of a forward declaration.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/pci/pci.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index ba56b9d9c765..8051df109db3 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -60,14 +60,6 @@ static DEFINE_SPINLOCK(zpci_domain_lock);
 static struct airq_iv *zpci_aisb_iv;
 static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES];
 
-/* Adapter interrupt definitions */
-static void zpci_irq_handler(struct airq_struct *airq);
-
-static struct airq_struct zpci_airq = {
-	.handler = zpci_irq_handler,
-	.isc = PCI_ISC,
-};
-
 #define ZPCI_IOMAP_ENTRIES						\
 	min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2),	\
 	    ZPCI_IOMAP_MAX_ENTRIES)
@@ -505,6 +497,11 @@ static void zpci_unmap_resources(struct pci_dev *pdev)
 	}
 }
 
+static struct airq_struct zpci_airq = {
+	.handler = zpci_irq_handler,
+	.isc = PCI_ISC,
+};
+
 static int __init zpci_irq_init(void)
 {
 	int rc;

From df26c2e87e6cf3ced1fbd589e40d633a6a7f20cb Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 4 Apr 2017 12:55:11 +0200
Subject: [PATCH 57/74] s390/cpumf: simplify detection of guest samples

There are three different code levels in regard to the identification
of guest samples. They differ in the way the LPP instruction is used.

1) Old kernels without the LPP instruction. The guest program parameter
   is always zero.
2) Newer kernels load the process pid into the program parameter with LPP.
   The guest program parameter is non-zero if the guest executes in a
   process != idle.
3) The latest kernels load ((1UL << 31) | pid) with LPP to make the value
   non-zero even for the idle task. The guest program parameter is non-zero
   if the guest is running.

All kernels load the process pid to CR4 on context switch. The CPU sampling
code uses the value in CR4 to decide between guest and host samples in case
the guest program parameter is zero. The three cases:

1) CR4==pid, gpp==0
2) CR4==pid, gpp==pid
3) CR4==pid, gpp==((1UL << 31) | pid)

The load-control instruction to load the pid into CR4 is expensive and the
goal is to remove it. To distinguish the host CR4 from the guest pid for
the idle process the maximum value 0xffff for the PASN is used.
This adds a fourth case for a guest OS with an updated kernel:

4) CR4==0xffff, gpp=((1UL << 31) | pid)

The host kernel will have CR4==0xffff and will use (gpp!=0 || CR4!==0xffff)
to identify guest samples. This works nicely with all 4 cases, the only
possible issue would be a guest with an old kernel (gpp==0) and a process
pid of 0xffff. Well, don't do that..

Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S        | 4 ----
 arch/s390/kernel/head64.S       | 2 +-
 arch/s390/kernel/perf_cpum_sf.c | 7 +++----
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 02f11018e2df..c6cf338c9327 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -189,10 +189,6 @@ ENTRY(__switch_to)
 	stg	%r3,__LC_CURRENT		# store task struct of next
 	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next
-	/* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */
-	xc	__SF_EMPTY(8,%r15),__SF_EMPTY(%r15)
-	mvc	__SF_EMPTY+4(4,%r15),__TASK_pid(%r3)
-	lctlg	%c4,%c4,__SF_EMPTY(%r15)	# load pid to control reg. 4
 	mvc	__LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 482d3526e32b..31c91f24e562 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -52,7 +52,7 @@ ENTRY(startup_continue)
 	.quad	0			# cr1: primary space segment table
 	.quad	.Lduct			# cr2: dispatchable unit control table
 	.quad	0			# cr3: instruction authorization
-	.quad	0			# cr4: instruction authorization
+	.quad	0xffff			# cr4: instruction authorization
 	.quad	.Lduct			# cr5: primary-aste origin
 	.quad	0			# cr6:	I/O interrupts
 	.quad	0			# cr7:	secondary space segment table
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 1c0b58545c04..9a4f279d25ca 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1009,8 +1009,8 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	 * sample. Some early samples or samples from guests without
 	 * lpp usage would be misaccounted to the host. We use the asn
 	 * value as an addon heuristic to detect most of these guest samples.
-	 * If the value differs from the host hpp value, we assume to be a
-	 * KVM guest.
+	 * If the value differs from 0xffff (the host value), we assume to
+	 * be a KVM guest.
 	 */
 	switch (sfr->basic.CL) {
 	case 1: /* logical partition */
@@ -1020,8 +1020,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 		sde_regs->in_guest = 1;
 		break;
 	default: /* old machine, use heuristics */
-		if (sfr->basic.gpp ||
-		    sfr->basic.prim_asn != (u16)sfr->basic.hpp)
+		if (sfr->basic.gpp || sfr->basic.prim_asn != 0xffff)
 			sde_regs->in_guest = 1;
 		break;
 	}

From 02c503ff237cdcd8e012a122a638295550db10a5 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 28 Nov 2016 15:50:48 +0100
Subject: [PATCH 58/74] s390/spinlock: use atomic primitives for spinlocks

Add a couple more __atomic_xxx function to atomic_ops.h and use them
to replace the compare-and-swap inlines in the spinlock code. This
changes the type of the lock value from unsigned int to int.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/atomic_ops.h     | 22 ++++----
 arch/s390/include/asm/spinlock.h       | 45 +++++++---------
 arch/s390/include/asm/spinlock_types.h |  6 +--
 arch/s390/lib/spinlock.c               | 73 ++++++++++++--------------
 4 files changed, 70 insertions(+), 76 deletions(-)

diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
index ac9e2b939d04..ba6d29412344 100644
--- a/arch/s390/include/asm/atomic_ops.h
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -111,20 +111,22 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr")
 
 static inline int __atomic_cmpxchg(int *ptr, int old, int new)
 {
-	asm volatile(
-		"	cs	%[old],%[new],%[ptr]"
-		: [old] "+d" (old), [ptr] "+Q" (*ptr)
-		: [new] "d" (new) : "cc", "memory");
-	return old;
+	return __sync_val_compare_and_swap(ptr, old, new);
+}
+
+static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new)
+{
+	return __sync_bool_compare_and_swap(ptr, old, new);
 }
 
 static inline long __atomic64_cmpxchg(long *ptr, long old, long new)
 {
-	asm volatile(
-		"	csg	%[old],%[new],%[ptr]"
-		: [old] "+d" (old), [ptr] "+Q" (*ptr)
-		: [new] "d" (new) : "cc", "memory");
-	return old;
+	return __sync_val_compare_and_swap(ptr, old, new);
+}
+
+static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new)
+{
+	return __sync_bool_compare_and_swap(ptr, old, new);
 }
 
 #endif /* __ARCH_S390_ATOMIC_OPS__  */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index ffc45048ea7d..f7838ecd83c6 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -10,6 +10,7 @@
 #define __ASM_SPINLOCK_H
 
 #include <linux/smp.h>
+#include <asm/atomic_ops.h>
 #include <asm/barrier.h>
 #include <asm/processor.h>
 
@@ -17,12 +18,6 @@
 
 extern int spin_retry;
 
-static inline int
-_raw_compare_and_swap(unsigned int *lock, unsigned int old, unsigned int new)
-{
-	return __sync_bool_compare_and_swap(lock, old, new);
-}
-
 #ifndef CONFIG_SMP
 static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
 #else
@@ -40,7 +35,7 @@ bool arch_vcpu_is_preempted(int cpu);
  * (the type definitions are in asm/spinlock_types.h)
  */
 
-void arch_lock_relax(unsigned int cpu);
+void arch_lock_relax(int cpu);
 
 void arch_spin_lock_wait(arch_spinlock_t *);
 int arch_spin_trylock_retry(arch_spinlock_t *);
@@ -70,7 +65,7 @@ static inline int arch_spin_trylock_once(arch_spinlock_t *lp)
 {
 	barrier();
 	return likely(arch_spin_value_unlocked(*lp) &&
-		      _raw_compare_and_swap(&lp->lock, 0, SPINLOCK_LOCKVAL));
+		      __atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL));
 }
 
 static inline void arch_spin_lock(arch_spinlock_t *lp)
@@ -95,7 +90,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp)
 
 static inline void arch_spin_unlock(arch_spinlock_t *lp)
 {
-	typecheck(unsigned int, lp->lock);
+	typecheck(int, lp->lock);
 	asm volatile(
 		"st	%1,%0\n"
 		: "+Q" (lp->lock)
@@ -141,16 +136,16 @@ extern int _raw_write_trylock_retry(arch_rwlock_t *lp);
 
 static inline int arch_read_trylock_once(arch_rwlock_t *rw)
 {
-	unsigned int old = ACCESS_ONCE(rw->lock);
-	return likely((int) old >= 0 &&
-		      _raw_compare_and_swap(&rw->lock, old, old + 1));
+	int old = ACCESS_ONCE(rw->lock);
+	return likely(old >= 0 &&
+		      __atomic_cmpxchg_bool(&rw->lock, old, old + 1));
 }
 
 static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 {
-	unsigned int old = ACCESS_ONCE(rw->lock);
+	int old = ACCESS_ONCE(rw->lock);
 	return likely(old == 0 &&
-		      _raw_compare_and_swap(&rw->lock, 0, 0x80000000));
+		      __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000));
 }
 
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
@@ -161,9 +156,9 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 
 #define __RAW_LOCK(ptr, op_val, op_string)		\
 ({							\
-	unsigned int old_val;				\
+	int old_val;					\
 							\
-	typecheck(unsigned int *, ptr);			\
+	typecheck(int *, ptr);				\
 	asm volatile(					\
 		op_string "	%0,%2,%1\n"		\
 		"bcr	14,0\n"				\
@@ -175,9 +170,9 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 
 #define __RAW_UNLOCK(ptr, op_val, op_string)		\
 ({							\
-	unsigned int old_val;				\
+	int old_val;					\
 							\
-	typecheck(unsigned int *, ptr);			\
+	typecheck(int *, ptr);				\
 	asm volatile(					\
 		op_string "	%0,%2,%1\n"		\
 		: "=d" (old_val), "+Q" (*ptr)		\
@@ -187,14 +182,14 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 })
 
 extern void _raw_read_lock_wait(arch_rwlock_t *lp);
-extern void _raw_write_lock_wait(arch_rwlock_t *lp, unsigned int prev);
+extern void _raw_write_lock_wait(arch_rwlock_t *lp, int prev);
 
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
-	unsigned int old;
+	int old;
 
 	old = __RAW_LOCK(&rw->lock, 1, __RAW_OP_ADD);
-	if ((int) old < 0)
+	if (old < 0)
 		_raw_read_lock_wait(rw);
 }
 
@@ -205,7 +200,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 
 static inline void arch_write_lock(arch_rwlock_t *rw)
 {
-	unsigned int old;
+	int old;
 
 	old = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR);
 	if (old != 0)
@@ -232,11 +227,11 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 
 static inline void arch_read_unlock(arch_rwlock_t *rw)
 {
-	unsigned int old;
+	int old;
 
 	do {
 		old = ACCESS_ONCE(rw->lock);
-	} while (!_raw_compare_and_swap(&rw->lock, old, old - 1));
+	} while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1));
 }
 
 static inline void arch_write_lock(arch_rwlock_t *rw)
@@ -248,7 +243,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
-	typecheck(unsigned int, rw->lock);
+	typecheck(int, rw->lock);
 
 	rw->owner = 0;
 	asm volatile(
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index d84b6939237c..fe755eec275f 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -6,14 +6,14 @@
 #endif
 
 typedef struct {
-	unsigned int lock;
+	int lock;
 } __attribute__ ((aligned (4))) arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, }
 
 typedef struct {
-	unsigned int lock;
-	unsigned int owner;
+	int lock;
+	int owner;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index ba427eb6f14c..3f4d0c69bbfe 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -32,23 +32,22 @@ static int __init spin_retry_setup(char *str)
 }
 __setup("spin_retry=", spin_retry_setup);
 
-static inline void _raw_compare_and_delay(unsigned int *lock, unsigned int old)
+static inline void compare_and_delay(int *lock, int old)
 {
 	asm(".insn rsy,0xeb0000000022,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
-	unsigned int cpu = SPINLOCK_LOCKVAL;
-	unsigned int owner;
-	int count, first_diag;
+	int cpu = SPINLOCK_LOCKVAL;
+	int owner, count, first_diag;
 
 	first_diag = 1;
 	while (1) {
 		owner = ACCESS_ONCE(lp->lock);
 		/* Try to get the lock if it is free. */
 		if (!owner) {
-			if (_raw_compare_and_swap(&lp->lock, 0, cpu))
+			if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu))
 				return;
 			continue;
 		}
@@ -62,7 +61,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 		count = spin_retry;
 		do {
 			if (MACHINE_HAS_CAD)
-				_raw_compare_and_delay(&lp->lock, owner);
+				compare_and_delay(&lp->lock, owner);
 			owner = ACCESS_ONCE(lp->lock);
 		} while (owner && count-- > 0);
 		if (!owner)
@@ -82,9 +81,8 @@ EXPORT_SYMBOL(arch_spin_lock_wait);
 
 void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 {
-	unsigned int cpu = SPINLOCK_LOCKVAL;
-	unsigned int owner;
-	int count, first_diag;
+	int cpu = SPINLOCK_LOCKVAL;
+	int owner, count, first_diag;
 
 	local_irq_restore(flags);
 	first_diag = 1;
@@ -93,7 +91,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 		/* Try to get the lock if it is free. */
 		if (!owner) {
 			local_irq_disable();
-			if (_raw_compare_and_swap(&lp->lock, 0, cpu))
+			if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu))
 				return;
 			local_irq_restore(flags);
 			continue;
@@ -108,7 +106,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 		count = spin_retry;
 		do {
 			if (MACHINE_HAS_CAD)
-				_raw_compare_and_delay(&lp->lock, owner);
+				compare_and_delay(&lp->lock, owner);
 			owner = ACCESS_ONCE(lp->lock);
 		} while (owner && count-- > 0);
 		if (!owner)
@@ -128,18 +126,17 @@ EXPORT_SYMBOL(arch_spin_lock_wait_flags);
 
 int arch_spin_trylock_retry(arch_spinlock_t *lp)
 {
-	unsigned int cpu = SPINLOCK_LOCKVAL;
-	unsigned int owner;
-	int count;
+	int cpu = SPINLOCK_LOCKVAL;
+	int owner, count;
 
 	for (count = spin_retry; count > 0; count--) {
 		owner = READ_ONCE(lp->lock);
 		/* Try to get the lock if it is free. */
 		if (!owner) {
-			if (_raw_compare_and_swap(&lp->lock, 0, cpu))
+			if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu))
 				return 1;
 		} else if (MACHINE_HAS_CAD)
-			_raw_compare_and_delay(&lp->lock, owner);
+			compare_and_delay(&lp->lock, owner);
 	}
 	return 0;
 }
@@ -147,8 +144,8 @@ EXPORT_SYMBOL(arch_spin_trylock_retry);
 
 void _raw_read_lock_wait(arch_rwlock_t *rw)
 {
-	unsigned int owner, old;
 	int count = spin_retry;
+	int owner, old;
 
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 	__RAW_LOCK(&rw->lock, -1, __RAW_OP_ADD);
@@ -162,12 +159,12 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
 		}
 		old = ACCESS_ONCE(rw->lock);
 		owner = ACCESS_ONCE(rw->owner);
-		if ((int) old < 0) {
+		if (old < 0) {
 			if (MACHINE_HAS_CAD)
-				_raw_compare_and_delay(&rw->lock, old);
+				compare_and_delay(&rw->lock, old);
 			continue;
 		}
-		if (_raw_compare_and_swap(&rw->lock, old, old + 1))
+		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
 			return;
 	}
 }
@@ -175,17 +172,17 @@ EXPORT_SYMBOL(_raw_read_lock_wait);
 
 int _raw_read_trylock_retry(arch_rwlock_t *rw)
 {
-	unsigned int old;
 	int count = spin_retry;
+	int old;
 
 	while (count-- > 0) {
 		old = ACCESS_ONCE(rw->lock);
-		if ((int) old < 0) {
+		if (old < 0) {
 			if (MACHINE_HAS_CAD)
-				_raw_compare_and_delay(&rw->lock, old);
+				compare_and_delay(&rw->lock, old);
 			continue;
 		}
-		if (_raw_compare_and_swap(&rw->lock, old, old + 1))
+		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
 			return 1;
 	}
 	return 0;
@@ -194,10 +191,10 @@ EXPORT_SYMBOL(_raw_read_trylock_retry);
 
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 
-void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev)
+void _raw_write_lock_wait(arch_rwlock_t *rw, int prev)
 {
-	unsigned int owner, old;
 	int count = spin_retry;
+	int owner, old;
 
 	owner = 0;
 	while (1) {
@@ -209,14 +206,14 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev)
 		old = ACCESS_ONCE(rw->lock);
 		owner = ACCESS_ONCE(rw->owner);
 		smp_mb();
-		if ((int) old >= 0) {
+		if (old >= 0) {
 			prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR);
 			old = prev;
 		}
-		if ((old & 0x7fffffff) == 0 && (int) prev >= 0)
+		if ((old & 0x7fffffff) == 0 && prev >= 0)
 			break;
 		if (MACHINE_HAS_CAD)
-			_raw_compare_and_delay(&rw->lock, old);
+			compare_and_delay(&rw->lock, old);
 	}
 }
 EXPORT_SYMBOL(_raw_write_lock_wait);
@@ -225,8 +222,8 @@ EXPORT_SYMBOL(_raw_write_lock_wait);
 
 void _raw_write_lock_wait(arch_rwlock_t *rw)
 {
-	unsigned int owner, old, prev;
 	int count = spin_retry;
+	int owner, old, prev;
 
 	prev = 0x80000000;
 	owner = 0;
@@ -238,15 +235,15 @@ void _raw_write_lock_wait(arch_rwlock_t *rw)
 		}
 		old = ACCESS_ONCE(rw->lock);
 		owner = ACCESS_ONCE(rw->owner);
-		if ((int) old >= 0 &&
-		    _raw_compare_and_swap(&rw->lock, old, old | 0x80000000))
+		if (old >= 0 &&
+		    __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000))
 			prev = old;
 		else
 			smp_mb();
-		if ((old & 0x7fffffff) == 0 && (int) prev >= 0)
+		if ((old & 0x7fffffff) == 0 && prev >= 0)
 			break;
 		if (MACHINE_HAS_CAD)
-			_raw_compare_and_delay(&rw->lock, old);
+			compare_and_delay(&rw->lock, old);
 	}
 }
 EXPORT_SYMBOL(_raw_write_lock_wait);
@@ -255,24 +252,24 @@ EXPORT_SYMBOL(_raw_write_lock_wait);
 
 int _raw_write_trylock_retry(arch_rwlock_t *rw)
 {
-	unsigned int old;
 	int count = spin_retry;
+	int old;
 
 	while (count-- > 0) {
 		old = ACCESS_ONCE(rw->lock);
 		if (old) {
 			if (MACHINE_HAS_CAD)
-				_raw_compare_and_delay(&rw->lock, old);
+				compare_and_delay(&rw->lock, old);
 			continue;
 		}
-		if (_raw_compare_and_swap(&rw->lock, 0, 0x80000000))
+		if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000))
 			return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(_raw_write_trylock_retry);
 
-void arch_lock_relax(unsigned int cpu)
+void arch_lock_relax(int cpu)
 {
 	if (!cpu)
 		return;

From b13de4b7adeb7a5e37a5aa78d5a4926c3cd4e131 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 24 Mar 2017 17:00:45 +0100
Subject: [PATCH 59/74] s390/spinlock: remove compare and delay instruction

The CAD instruction never worked quite as expected for the spinlock
code. It has been disabled by default with git commit 61b0b01686d48220,
if the "cad" kernel parameter is specified it is enabled for both user
space and the spinlock code. Leave the option to enable the instruction
for user space but remove it from the spinlock code.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/setup.h |  6 ++----
 arch/s390/kernel/early.c      | 19 ++++++-------------
 arch/s390/lib/spinlock.c      | 33 +++++----------------------------
 3 files changed, 13 insertions(+), 45 deletions(-)

diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 383bd8358a8c..cd78155b1829 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -29,9 +29,8 @@
 #define MACHINE_FLAG_TE		_BITUL(11)
 #define MACHINE_FLAG_TLB_LC	_BITUL(12)
 #define MACHINE_FLAG_VX		_BITUL(13)
-#define MACHINE_FLAG_CAD	_BITUL(14)
-#define MACHINE_FLAG_NX		_BITUL(15)
-#define MACHINE_FLAG_GS		_BITUL(16)
+#define MACHINE_FLAG_NX		_BITUL(14)
+#define MACHINE_FLAG_GS		_BITUL(15)
 
 #define LPP_MAGIC		_BITUL(31)
 #define LPP_PFAULT_PID_MASK	_AC(0xffffffff, UL)
@@ -69,7 +68,6 @@ extern void detect_memory_memblock(void);
 #define MACHINE_HAS_TE		(S390_lowcore.machine_flags & MACHINE_FLAG_TE)
 #define MACHINE_HAS_TLB_LC	(S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC)
 #define MACHINE_HAS_VX		(S390_lowcore.machine_flags & MACHINE_FLAG_VX)
-#define MACHINE_HAS_CAD		(S390_lowcore.machine_flags & MACHINE_FLAG_CAD)
 #define MACHINE_HAS_NX		(S390_lowcore.machine_flags & MACHINE_FLAG_NX)
 #define MACHINE_HAS_GS		(S390_lowcore.machine_flags & MACHINE_FLAG_GS)
 
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 251391e3f8bc..5d20182ee8ae 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -434,23 +434,16 @@ early_param("noexec", noexec_setup);
 
 static int __init cad_setup(char *str)
 {
-	int val;
+	bool enabled;
+	int rc;
 
-	get_option(&str, &val);
-	if (val && test_facility(128))
-		S390_lowcore.machine_flags |= MACHINE_FLAG_CAD;
-	return 0;
-}
-early_param("cad", cad_setup);
-
-static int __init cad_init(void)
-{
-	if (MACHINE_HAS_CAD)
+	rc = kstrtobool(str, &enabled);
+	if (!rc && enabled && test_facility(128))
 		/* Enable problem state CAD. */
 		__ctl_set_bit(2, 3);
-	return 0;
+	return rc;
 }
-early_initcall(cad_init);
+early_param("cad", cad_setup);
 
 static __init void memmove_early(void *dst, const void *src, size_t n)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 3f4d0c69bbfe..ffb15bd4c593 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -17,7 +17,7 @@ int spin_retry = -1;
 static int __init spin_retry_init(void)
 {
 	if (spin_retry < 0)
-		spin_retry = MACHINE_HAS_CAD ? 10 : 1000;
+		spin_retry = 1000;
 	return 0;
 }
 early_initcall(spin_retry_init);
@@ -32,11 +32,6 @@ static int __init spin_retry_setup(char *str)
 }
 __setup("spin_retry=", spin_retry_setup);
 
-static inline void compare_and_delay(int *lock, int old)
-{
-	asm(".insn rsy,0xeb0000000022,%0,0,%1" : : "d" (old), "Q" (*lock));
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
 	int cpu = SPINLOCK_LOCKVAL;
@@ -60,8 +55,6 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 		/* Loop for a while on the lock value. */
 		count = spin_retry;
 		do {
-			if (MACHINE_HAS_CAD)
-				compare_and_delay(&lp->lock, owner);
 			owner = ACCESS_ONCE(lp->lock);
 		} while (owner && count-- > 0);
 		if (!owner)
@@ -105,8 +98,6 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 		/* Loop for a while on the lock value. */
 		count = spin_retry;
 		do {
-			if (MACHINE_HAS_CAD)
-				compare_and_delay(&lp->lock, owner);
 			owner = ACCESS_ONCE(lp->lock);
 		} while (owner && count-- > 0);
 		if (!owner)
@@ -135,8 +126,7 @@ int arch_spin_trylock_retry(arch_spinlock_t *lp)
 		if (!owner) {
 			if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu))
 				return 1;
-		} else if (MACHINE_HAS_CAD)
-			compare_and_delay(&lp->lock, owner);
+		}
 	}
 	return 0;
 }
@@ -159,11 +149,8 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
 		}
 		old = ACCESS_ONCE(rw->lock);
 		owner = ACCESS_ONCE(rw->owner);
-		if (old < 0) {
-			if (MACHINE_HAS_CAD)
-				compare_and_delay(&rw->lock, old);
+		if (old < 0)
 			continue;
-		}
 		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
 			return;
 	}
@@ -177,11 +164,8 @@ int _raw_read_trylock_retry(arch_rwlock_t *rw)
 
 	while (count-- > 0) {
 		old = ACCESS_ONCE(rw->lock);
-		if (old < 0) {
-			if (MACHINE_HAS_CAD)
-				compare_and_delay(&rw->lock, old);
+		if (old < 0)
 			continue;
-		}
 		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
 			return 1;
 	}
@@ -212,8 +196,6 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, int prev)
 		}
 		if ((old & 0x7fffffff) == 0 && prev >= 0)
 			break;
-		if (MACHINE_HAS_CAD)
-			compare_and_delay(&rw->lock, old);
 	}
 }
 EXPORT_SYMBOL(_raw_write_lock_wait);
@@ -242,8 +224,6 @@ void _raw_write_lock_wait(arch_rwlock_t *rw)
 			smp_mb();
 		if ((old & 0x7fffffff) == 0 && prev >= 0)
 			break;
-		if (MACHINE_HAS_CAD)
-			compare_and_delay(&rw->lock, old);
 	}
 }
 EXPORT_SYMBOL(_raw_write_lock_wait);
@@ -257,11 +237,8 @@ int _raw_write_trylock_retry(arch_rwlock_t *rw)
 
 	while (count-- > 0) {
 		old = ACCESS_ONCE(rw->lock);
-		if (old) {
-			if (MACHINE_HAS_CAD)
-				compare_and_delay(&rw->lock, old);
+		if (old)
 			continue;
-		}
 		if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000))
 			return 1;
 	}

From c9c31b07bab5fee3ef0bf163afc11b1100eb10d4 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 11:08:15 +0200
Subject: [PATCH 60/74] vfio: ccw: remove unnecessary NULL checks of a pointer

Remove several unnecessary checks for the @private pointer, since it
can never be NULL in these places.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170412090816.79108-2-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_fsm.c |  6 +-----
 drivers/s390/cio/vfio_ccw_ops.c | 17 -----------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index 55b6cc55758e..80a0559cd7ce 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -161,12 +161,8 @@ err_out:
 static void fsm_irq(struct vfio_ccw_private *private,
 		    enum vfio_ccw_event event)
 {
-	struct irb *irb;
+	struct irb *irb = this_cpu_ptr(&cio_irb);
 
-	if (!private)
-		return;
-
-	irb = this_cpu_ptr(&cio_irb);
 	memcpy(&private->irb, irb, sizeof(*irb));
 
 	queue_work(vfio_ccw_work_q, &private->io_work);
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index b2e615404034..55d0c87e73c3 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -19,9 +19,6 @@ static int vfio_ccw_mdev_reset(struct mdev_device *mdev)
 	int ret;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	if (!private)
-		return -ENODEV;
-
 	sch = private->sch;
 	/*
 	 * TODO:
@@ -49,9 +46,6 @@ static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
 	struct vfio_ccw_private *private =
 		container_of(nb, struct vfio_ccw_private, nb);
 
-	if (!private)
-		return NOTIFY_STOP;
-
 	/*
 	 * Vendor drivers MUST unpin pages in response to an
 	 * invalidation.
@@ -134,9 +128,6 @@ static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
 		dev_get_drvdata(mdev_parent_dev(mdev));
 	int ret;
 
-	if (!private)
-		goto out;
-
 	if ((private->state == VFIO_CCW_STATE_NOT_OPER) ||
 	    (private->state == VFIO_CCW_STATE_STANDBY))
 		goto out;
@@ -187,9 +178,6 @@ static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
 		return -EINVAL;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	if (!private)
-		return -ENODEV;
-
 	region = &private->io_region;
 	if (copy_to_user(buf, (void *)region + *ppos, count))
 		return -EFAULT;
@@ -209,8 +197,6 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
 		return -EINVAL;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	if (!private)
-		return -ENODEV;
 	if (private->state != VFIO_CCW_STATE_IDLE)
 		return -EACCES;
 
@@ -274,9 +260,6 @@ static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev,
 		return -EINVAL;
 
 	private = dev_get_drvdata(mdev_parent_dev(mdev));
-	if (!private)
-		return -ENODEV;
-
 	ctx = &private->io_trigger;
 
 	switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {

From 129cc19a94513081e9250323cd57e12ed48b3613 Mon Sep 17 00:00:00 2001
From: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 11:08:16 +0200
Subject: [PATCH 61/74] vfio: ccw: improve error handling for
 vfio_ccw_mdev_remove

When vfio_ccw_mdev_reset fails during the remove process of the mdev,
the current implementation simply returns.

The failure indicates that the subchannel device is in a NOT_OPER state,
thus the right thing to do should be removing the mdev.

While we are at here, reverse the condition check to make the code more
concise and readable.

Signed-off-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Message-Id: <20170412090816.79108-3-bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 drivers/s390/cio/vfio_ccw_ops.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 55d0c87e73c3..e72abbc18ee3 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -126,19 +126,14 @@ static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
 {
 	struct vfio_ccw_private *private =
 		dev_get_drvdata(mdev_parent_dev(mdev));
-	int ret;
 
-	if ((private->state == VFIO_CCW_STATE_NOT_OPER) ||
-	    (private->state == VFIO_CCW_STATE_STANDBY))
-		goto out;
+	if ((private->state != VFIO_CCW_STATE_NOT_OPER) &&
+	    (private->state != VFIO_CCW_STATE_STANDBY)) {
+		if (!vfio_ccw_mdev_reset(mdev))
+			private->state = VFIO_CCW_STATE_STANDBY;
+		/* The state will be NOT_OPER on error. */
+	}
 
-	ret = vfio_ccw_mdev_reset(mdev);
-	if (ret)
-		return ret;
-
-	private->state = VFIO_CCW_STATE_STANDBY;
-
-out:
 	private->mdev = NULL;
 	atomic_inc(&private->avail);
 

From 2d42f9477320befd33846c4083cab898998cdee5 Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Date: Thu, 20 Apr 2017 10:03:45 +0200
Subject: [PATCH 62/74] s390/kvm: Add PGSTE manipulation functions

Add PGSTE manipulation functions:
* set_pgste_bits sets specific bits in a PGSTE
* get_pgste returns the whole PGSTE
* pgste_perform_essa manipulates a PGSTE to set specific storage states
* ESSA_[SG]ET_* macros used to indicate the action for manipulate_pgste

Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Reviewed-by: Janosch Frank <frankja@de.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/page-states.h |  19 ++++
 arch/s390/include/asm/pgtable.h     |  16 ++-
 arch/s390/mm/page-states.c          |   3 +-
 arch/s390/mm/pgtable.c              | 153 ++++++++++++++++++++++++++++
 4 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 arch/s390/include/asm/page-states.h

diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h
new file mode 100644
index 000000000000..42267a2fe29e
--- /dev/null
+++ b/arch/s390/include/asm/page-states.h
@@ -0,0 +1,19 @@
+/*
+ *    Copyright IBM Corp. 2017
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
+ */
+
+#ifndef PAGE_STATES_H
+#define PAGE_STATES_H
+
+#define ESSA_GET_STATE			0
+#define ESSA_SET_STABLE			1
+#define ESSA_SET_UNUSED			2
+#define ESSA_SET_VOLATILE		3
+#define ESSA_SET_POT_VOLATILE		4
+#define ESSA_SET_STABLE_RESIDENT	5
+#define ESSA_SET_STABLE_IF_RESIDENT	6
+
+#define ESSA_MAX	ESSA_SET_STABLE_IF_RESIDENT
+
+#endif
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 93e37b12e882..bdabbbb3e925 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -372,10 +372,12 @@ static inline int is_module_addr(void *addr)
 #define PGSTE_VSIE_BIT	0x0000200000000000UL	/* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
-#define _PGSTE_GPS_ZERO		0x0000000080000000UL
-#define _PGSTE_GPS_USAGE_MASK	0x0000000003000000UL
-#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL
-#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL
+#define _PGSTE_GPS_ZERO			0x0000000080000000UL
+#define _PGSTE_GPS_USAGE_MASK		0x0000000003000000UL
+#define _PGSTE_GPS_USAGE_STABLE		0x0000000000000000UL
+#define _PGSTE_GPS_USAGE_UNUSED		0x0000000001000000UL
+#define _PGSTE_GPS_USAGE_POT_VOLATILE	0x0000000002000000UL
+#define _PGSTE_GPS_USAGE_VOLATILE	_PGSTE_GPS_USAGE_MASK
 
 /*
  * A user page table pointer has the space-switch-event bit, the
@@ -1041,6 +1043,12 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char *key);
 
+int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
+				unsigned long bits, unsigned long value);
+int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
+int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
+			unsigned long *oldpte, unsigned long *oldpgste);
+
 /*
  * Certain architectures need to do special things when PTEs
  * within a page table are directly modified.  Thus, the following
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index 3330ea124eec..69a7b01ae746 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -13,8 +13,7 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 
-#define ESSA_SET_STABLE		1
-#define ESSA_SET_UNUSED		2
+#include <asm/page-states.h>
 
 static int cmma_flag = 1;
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 463e5ef02304..947b66a5cdba 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -23,6 +23,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/page-states.h>
 
 static inline pte_t ptep_flush_direct(struct mm_struct *mm,
 				      unsigned long addr, pte_t *ptep)
@@ -787,4 +788,156 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 EXPORT_SYMBOL(get_guest_storage_key);
+
+/**
+ * pgste_perform_essa - perform ESSA actions on the PGSTE.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @orc: the specific action to perform, see the ESSA_SET_* macros.
+ * @oldpte: the PTE will be saved there if the pointer is not NULL.
+ * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
+ *
+ * Return: 1 if the page is to be added to the CBRL, otherwise 0,
+ *	   or < 0 in case of error. -EINVAL is returned for invalid values
+ *	   of orc, -EFAULT for invalid addresses.
+ */
+int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
+			unsigned long *oldpte, unsigned long *oldpgste)
+{
+	unsigned long pgstev;
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+	int res = 0;
+
+	WARN_ON_ONCE(orc > ESSA_MAX);
+	if (unlikely(orc > ESSA_MAX))
+		return -EINVAL;
+	ptep = get_locked_pte(mm, hva, &ptl);
+	if (unlikely(!ptep))
+		return -EFAULT;
+	pgste = pgste_get_lock(ptep);
+	pgstev = pgste_val(pgste);
+	if (oldpte)
+		*oldpte = pte_val(*ptep);
+	if (oldpgste)
+		*oldpgste = pgstev;
+
+	switch (orc) {
+	case ESSA_GET_STATE:
+		break;
+	case ESSA_SET_STABLE:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev |= _PGSTE_GPS_USAGE_STABLE;
+		break;
+	case ESSA_SET_UNUSED:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
+		if (pte_val(*ptep) & _PAGE_INVALID)
+			res = 1;
+		break;
+	case ESSA_SET_VOLATILE:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+		if (pte_val(*ptep) & _PAGE_INVALID)
+			res = 1;
+		break;
+	case ESSA_SET_POT_VOLATILE:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
+			break;
+		}
+		if (pgstev & _PGSTE_GPS_ZERO) {
+			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+			break;
+		}
+		if (!(pgstev & PGSTE_GC_BIT)) {
+			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+			res = 1;
+			break;
+		}
+		break;
+	case ESSA_SET_STABLE_RESIDENT:
+		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+		pgstev |= _PGSTE_GPS_USAGE_STABLE;
+		/*
+		 * Since the resident state can go away any time after this
+		 * call, we will not make this page resident. We can revisit
+		 * this decision if a guest will ever start using this.
+		 */
+		break;
+	case ESSA_SET_STABLE_IF_RESIDENT:
+		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+			pgstev |= _PGSTE_GPS_USAGE_STABLE;
+		}
+		break;
+	default:
+		/* we should never get here! */
+		break;
+	}
+	/* If we are discarding a page, set it to logical zero */
+	if (res)
+		pgstev |= _PGSTE_GPS_ZERO;
+
+	pgste_val(pgste) = pgstev;
+	pgste_set_unlock(ptep, pgste);
+	pte_unmap_unlock(ptep, ptl);
+	return res;
+}
+EXPORT_SYMBOL(pgste_perform_essa);
+
+/**
+ * set_pgste_bits - set specific PGSTE bits.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @bits: a bitmask representing the bits that will be touched
+ * @value: the values of the bits to be written. Only the bits in the mask
+ *	   will be written.
+ *
+ * Return: 0 on success, < 0 in case of error.
+ */
+int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
+			unsigned long bits, unsigned long value)
+{
+	spinlock_t *ptl;
+	pgste_t new;
+	pte_t *ptep;
+
+	ptep = get_locked_pte(mm, hva, &ptl);
+	if (unlikely(!ptep))
+		return -EFAULT;
+	new = pgste_get_lock(ptep);
+
+	pgste_val(new) &= ~bits;
+	pgste_val(new) |= value & bits;
+
+	pgste_set_unlock(ptep, new);
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+EXPORT_SYMBOL(set_pgste_bits);
+
+/**
+ * get_pgste - get the current PGSTE for the given address.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @pgstep: will be written with the current PGSTE for the given address.
+ *
+ * Return: 0 on success, < 0 in case of error.
+ */
+int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	ptep = get_locked_pte(mm, hva, &ptl);
+	if (unlikely(!ptep))
+		return -EFAULT;
+	*pgstep = pgste_val(pgste_get(ptep));
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+EXPORT_SYMBOL(get_pgste);
 #endif

From aa824e1340e79d26976c9f942add29edf612a67b Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Date: Thu, 20 Apr 2017 10:03:46 +0200
Subject: [PATCH 63/74] s390/kvm: Add use_cmma field to mm_context_t

Add use_cmma field to mm_context_t, like we do for storage keys.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Acked-by: Janosch Frank <frankja@de.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/mmu.h         | 2 ++
 arch/s390/include/asm/mmu_context.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bea785d7f853..bd6f30304518 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -22,6 +22,8 @@ typedef struct {
 	unsigned int has_pgste:1;
 	/* The mmu context uses storage keys. */
 	unsigned int use_skey:1;
+	/* The mmu context uses CMMA. */
+	unsigned int use_cmma:1;
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)						   \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6e31d87fb669..c119d564d8f2 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -28,6 +28,7 @@ static inline int init_new_context(struct task_struct *tsk,
 	mm->context.alloc_pgste = page_table_allocate_pgste;
 	mm->context.has_pgste = 0;
 	mm->context.use_skey = 0;
+	mm->context.use_cmma = 0;
 #endif
 	switch (mm->context.asce_limit) {
 	case 1UL << 42:

From e525f8a6e696210d15f8b8277d4da12fc4add299 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 20 Apr 2017 13:54:11 +0200
Subject: [PATCH 64/74] s390/gs: add regset for the guarded storage broadcast
 control block

The guarded storage interface allows to register a control block for
each thread that is activated with the guarded storage broadcast event.
To retrieve the complete state of a process from the kernel a register
set for the stored broadcast control block is required.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/ptrace.c | 46 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h  |  1 +
 2 files changed, 47 insertions(+)

diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index c933e255b5d5..488c5bb8dc77 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -1171,10 +1171,48 @@ static int s390_gs_cb_set(struct task_struct *target,
 {
 	struct gs_cb *data = target->thread.gs_cb;
 
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data) {
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+		target->thread.gs_cb = data;
+	}
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  data, 0, sizeof(struct gs_cb));
+}
+
+static int s390_gs_bc_get(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_bc_cb;
+
 	if (!MACHINE_HAS_GS)
 		return -ENODEV;
 	if (!data)
 		return -ENODATA;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   data, 0, sizeof(struct gs_cb));
+}
+
+static int s390_gs_bc_set(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  const void *kbuf, const void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_bc_cb;
+
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data) {
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+		target->thread.gs_bc_cb = data;
+	}
 	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				  data, 0, sizeof(struct gs_cb));
 }
@@ -1244,6 +1282,14 @@ static const struct user_regset s390_regsets[] = {
 		.get = s390_gs_cb_get,
 		.set = s390_gs_cb_set,
 	},
+	{
+		.core_note_type = NT_S390_GS_BC,
+		.n = sizeof(struct gs_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_gs_bc_get,
+		.set = s390_gs_bc_set,
+	},
 };
 
 static const struct user_regset_view user_s390_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 8c6d3bdb9a00..176b6cb1008d 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -410,6 +410,7 @@ typedef struct elf64_shdr {
 #define NT_S390_VXRS_LOW	0x309	/* s390 vector registers 0-15 upper half */
 #define NT_S390_VXRS_HIGH	0x30a	/* s390 vector registers 16-31 */
 #define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
+#define NT_S390_GS_BC	0x30c		/* s390 guarded storage broadcast control block */
 #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
 #define NT_ARM_TLS	0x401		/* ARM TLS register */
 #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */

From ee71d16d22bb268c1f6a64ef6d3654ace5f1e8c7 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 20 Apr 2017 14:43:51 +0200
Subject: [PATCH 65/74] s390/mm: make TASK_SIZE independent from the number of
 page table levels

The TASK_SIZE for a process should be maximum possible size of the address
space, 2GB for a 31-bit process and 8PB for a 64-bit process. The number
of page table levels required for a given memory layout is a consequence
of the mapped memory areas and their location.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/mman.h      |  4 ----
 arch/s390/include/asm/processor.h |  9 +++++----
 arch/s390/kvm/kvm-s390.c          |  4 ++--
 arch/s390/mm/gmap.c               |  2 +-
 arch/s390/mm/gup.c                |  2 +-
 arch/s390/mm/mmap.c               | 31 ++++++++++++-------------------
 arch/s390/mm/pgalloc.c            |  2 --
 7 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h
index b55a59e1d134..b79813d9cf68 100644
--- a/arch/s390/include/asm/mman.h
+++ b/arch/s390/include/asm/mman.h
@@ -8,8 +8,4 @@
 
 #include <uapi/asm/mman.h>
 
-#ifndef __ASSEMBLY__
-int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags);
-#define arch_mmap_check(addr, len, flags) s390_mmap_check(addr, len, flags)
-#endif
 #endif /* __S390_MMAN_H__ */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index cc101f9371cb..60d395fdc864 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -91,14 +91,15 @@ extern void execve_tail(void);
  * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
  */
 
-#define TASK_SIZE_OF(tsk)	((tsk)->mm ? \
-				 (tsk)->mm->context.asce_limit : TASK_MAX_SIZE)
+#define TASK_SIZE_OF(tsk)	(test_tsk_thread_flag(tsk, TIF_31BIT) ? \
+					(1UL << 31) : (1UL << 53))
 #define TASK_UNMAPPED_BASE	(test_thread_flag(TIF_31BIT) ? \
 					(1UL << 30) : (1UL << 41))
 #define TASK_SIZE		TASK_SIZE_OF(current)
-#define TASK_MAX_SIZE		(1UL << 53)
+#define TASK_SIZE_MAX		(1UL << 53)
 
-#define STACK_TOP		(1UL << (test_thread_flag(TIF_31BIT) ? 31:42))
+#define STACK_TOP		(test_thread_flag(TIF_31BIT) ? \
+					(1UL << 31) : (1UL << 42))
 #define STACK_TOP_MAX		(1UL << 42)
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fd6cd05bb6a7..28983836c0f7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1512,9 +1512,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
 	} else {
 		if (sclp.hamax == U64_MAX)
-			kvm->arch.mem_limit = TASK_MAX_SIZE;
+			kvm->arch.mem_limit = TASK_SIZE_MAX;
 		else
-			kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
+			kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX,
 						    sclp.hamax + 1);
 		kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
 		if (!kvm->arch.gmap)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index ffb55c935eda..7f6db1e6c048 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -431,7 +431,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
 	if ((from | to | len) & (PMD_SIZE - 1))
 		return -EINVAL;
 	if (len == 0 || from + len < from || to + len < to ||
-	    from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
+	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
 		return -EINVAL;
 
 	flush = 0;
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 18d4107e10ee..b7b779c40a5b 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -211,7 +211,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if ((end <= start) || (end > TASK_SIZE))
+	if ((end <= start) || (end > mm->context.asce_limit))
 		return 0;
 	/*
 	 * local_irq_save() doesn't prevent pagetable teardown, but does
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 50618614881f..eed233ce59dd 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -90,7 +90,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->context.asce_limit - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -99,7 +99,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->context.asce_limit - len >= addr &&
+		    addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -107,7 +108,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = TASK_SIZE;
+	info.high_limit = mm->context.asce_limit;
 	if (filp || (flags & MAP_SHARED))
 		info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
 	else
@@ -127,7 +128,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_unmapped_area_info info;
 
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->context.asce_limit - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -137,7 +138,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->context.asce_limit - len >= addr &&
+		    addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -163,24 +165,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = mm->context.asce_limit;
 		addr = vm_unmapped_area(&info);
 	}
 
 	return addr;
 }
 
-int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
-{
-	if (is_compat_task() || TASK_SIZE >= TASK_MAX_SIZE)
-		return 0;
-	if (!(flags & MAP_FIXED))
-		addr = 0;
-	if ((addr + len) >= TASK_SIZE)
-		return crst_table_upgrade(current->mm);
-	return 0;
-}
-
 static unsigned long
 s390_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -192,7 +183,8 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
 	area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
 	if (!(area & ~PAGE_MASK))
 		return area;
-	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
+	if (area == -ENOMEM && !is_compat_task() &&
+	    current->mm->context.asce_limit < TASK_SIZE_MAX) {
 		/* Upgrade the page table to 4 levels and retry. */
 		rc = crst_table_upgrade(mm);
 		if (rc)
@@ -214,7 +206,8 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
 	area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
 	if (!(area & ~PAGE_MASK))
 		return area;
-	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
+	if (area == -ENOMEM && !is_compat_task() &&
+	    current->mm->context.asce_limit < TASK_SIZE_MAX) {
 		/* Upgrade the page table to 4 levels and retry. */
 		rc = crst_table_upgrade(mm);
 		if (rc)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 2776bad61094..f502cbe657af 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -95,7 +95,6 @@ int crst_table_upgrade(struct mm_struct *mm)
 	mm->context.asce_limit = 1UL << 53;
 	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
-	mm->task_size = mm->context.asce_limit;
 	spin_unlock_bh(&mm->page_table_lock);
 
 	on_each_cpu(__crst_table_upgrade, mm, 0);
@@ -119,7 +118,6 @@ void crst_table_downgrade(struct mm_struct *mm)
 	mm->context.asce_limit = 1UL << 31;
 	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 			   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-	mm->task_size = mm->context.asce_limit;
 	crst_table_free(mm, (unsigned long *) pgd);
 
 	if (current->active_mm == mm)

From 9b11c7912d00d0e5fe0d7f8973b77dac4f69d3df Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 24 Apr 2017 18:14:48 +0200
Subject: [PATCH 66/74] s390/mm: simplify arch_get_unmapped_area[_topdown]

With TASK_SIZE now reflecting the maximum size of the address space for
a process the code for arch_get_unmapped_area[_topdown] can be simplified.
Just let the logic pick a suitable address and deal with the page table
upgrade after the address has been selected.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/mmap.c | 95 +++++++++++++++++----------------------------
 1 file changed, 35 insertions(+), 60 deletions(-)

diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index eed233ce59dd..b017daed6887 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -89,32 +89,43 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
+	int rc;
 
-	if (len > mm->context.asce_limit - mmap_min_addr)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
-		return addr;
+		goto check_asce_limit;
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->context.asce_limit - len >= addr &&
-		    addr >= mmap_min_addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
-			return addr;
+			goto check_asce_limit;
 	}
 
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = mm->context.asce_limit;
+	info.high_limit = TASK_SIZE;
 	if (filp || (flags & MAP_SHARED))
 		info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
 	else
 		info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
-	return vm_unmapped_area(&info);
+	addr = vm_unmapped_area(&info);
+	if (addr & ~PAGE_MASK)
+		return addr;
+
+check_asce_limit:
+	if (addr + len > current->mm->context.asce_limit) {
+		rc = crst_table_upgrade(mm);
+		if (rc)
+			return (unsigned long) rc;
+	}
+
+	return addr;
 }
 
 unsigned long
@@ -126,22 +137,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct mm_struct *mm = current->mm;
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
+	int rc;
 
 	/* requested length too big for entire address space */
-	if (len > mm->context.asce_limit - mmap_min_addr)
+	if (len > TASK_SIZE - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
-		return addr;
+		goto check_asce_limit;
 
 	/* requesting a specific address */
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->context.asce_limit - len >= addr &&
-		    addr >= mmap_min_addr &&
+		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
-			return addr;
+			goto check_asce_limit;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
@@ -165,58 +176,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = mm->context.asce_limit;
+		info.high_limit = TASK_SIZE;
 		addr = vm_unmapped_area(&info);
+		if (addr & ~PAGE_MASK)
+			return addr;
+	}
+
+check_asce_limit:
+	if (addr + len > current->mm->context.asce_limit) {
+		rc = crst_table_upgrade(mm);
+		if (rc)
+			return (unsigned long) rc;
 	}
 
 	return addr;
 }
 
-static unsigned long
-s390_get_unmapped_area(struct file *filp, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long area;
-	int rc;
-
-	area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
-	if (!(area & ~PAGE_MASK))
-		return area;
-	if (area == -ENOMEM && !is_compat_task() &&
-	    current->mm->context.asce_limit < TASK_SIZE_MAX) {
-		/* Upgrade the page table to 4 levels and retry. */
-		rc = crst_table_upgrade(mm);
-		if (rc)
-			return (unsigned long) rc;
-		area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
-	}
-	return area;
-}
-
-static unsigned long
-s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
-			  const unsigned long len, const unsigned long pgoff,
-			  const unsigned long flags)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long area;
-	int rc;
-
-	area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
-	if (!(area & ~PAGE_MASK))
-		return area;
-	if (area == -ENOMEM && !is_compat_task() &&
-	    current->mm->context.asce_limit < TASK_SIZE_MAX) {
-		/* Upgrade the page table to 4 levels and retry. */
-		rc = crst_table_upgrade(mm);
-		if (rc)
-			return (unsigned long) rc;
-		area = arch_get_unmapped_area_topdown(filp, addr, len,
-						      pgoff, flags);
-	}
-	return area;
-}
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
@@ -234,9 +209,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	 */
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mmap_base_legacy(random_factor);
-		mm->get_unmapped_area = s390_get_unmapped_area;
+		mm->get_unmapped_area = arch_get_unmapped_area;
 	} else {
 		mm->mmap_base = mmap_base(random_factor);
-		mm->get_unmapped_area = s390_get_unmapped_area_topdown;
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 	}
 }

From 1366def38b6ab574dd0dbf95e308c7d115e73796 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 24 Apr 2017 15:27:35 +0200
Subject: [PATCH 67/74] s390/pageattr: avoid unnecessary page table splitting

The kernel page table splitting code will split page tables even for
features the CPU does not support. E.g. a CPU may not support the NX
feature.
In order to avoid this, remove those bits from the flags parameter
that correlate with unsupported CPU features within __set_memory(). In
addition add an early exit if the flags parameter does not have any
bits set afterwards.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/pageattr.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index fc5dc33bb141..fc321c5ec30e 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -94,7 +94,7 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			new = pte_wrprotect(new);
 		else if (flags & SET_MEMORY_RW)
 			new = pte_mkwrite(pte_mkdirty(new));
-		if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX)
+		if (flags & SET_MEMORY_NX)
 			pte_val(new) |= _PAGE_NOEXEC;
 		else if (flags & SET_MEMORY_X)
 			pte_val(new) &= ~_PAGE_NOEXEC;
@@ -144,7 +144,7 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
 		new = pmd_wrprotect(new);
 	else if (flags & SET_MEMORY_RW)
 		new = pmd_mkwrite(pmd_mkdirty(new));
-	if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX)
+	if (flags & SET_MEMORY_NX)
 		pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
 	else if (flags & SET_MEMORY_X)
 		pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
@@ -221,7 +221,7 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
 		new = pud_wrprotect(new);
 	else if (flags & SET_MEMORY_RW)
 		new = pud_mkwrite(pud_mkdirty(new));
-	if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX)
+	if (flags & SET_MEMORY_NX)
 		pud_val(new) |= _REGION_ENTRY_NOEXEC;
 	else if (flags & SET_MEMORY_X)
 		pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
@@ -288,6 +288,10 @@ static int change_page_attr(unsigned long addr, unsigned long end,
 
 int __set_memory(unsigned long addr, int numpages, unsigned long flags)
 {
+	if (!MACHINE_HAS_NX)
+		flags &= ~(SET_MEMORY_NX | SET_MEMORY_X);
+	if (!flags)
+		return 0;
 	addr &= PAGE_MASK;
 	return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags);
 }

From 985a9d20daa67e1983910fcf6e4f348ce8eed086 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 10:11:54 +0100
Subject: [PATCH 68/74] s390/crypto: Renaming PPNO to PRNO.

The PPNO (Perform Pseudorandom Number Operation) instruction
has been renamed to PRNO (Perform Random Number Operation).
To avoid confusion and conflicts with future extensions with
this instruction (like e.g. provide a true random number
generator) this patch renames all occurences in cpacf.h and
adjusts the only exploiter code which is the prng device
driver and one line in the s390 kvm feature check.

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/crypto/prng.c       | 42 +++++++++++++++++------------------
 arch/s390/include/asm/cpacf.h | 20 ++++++++---------
 arch/s390/kvm/kvm-s390.c      |  2 +-
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index 5a3ec04a7082..3e47c4a0f18b 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -81,7 +81,7 @@ struct prng_ws_s {
 	u64 byte_counter;
 };
 
-struct ppno_ws_s {
+struct prno_ws_s {
 	u32 res;
 	u32 reseed_counter;
 	u64 stream_bytes;
@@ -93,7 +93,7 @@ struct prng_data_s {
 	struct mutex mutex;
 	union {
 		struct prng_ws_s prngws;
-		struct ppno_ws_s ppnows;
+		struct prno_ws_s prnows;
 	};
 	u8 *buf;
 	u32 rest;
@@ -306,12 +306,12 @@ static int __init prng_sha512_selftest(void)
 		0x36, 0x8c, 0x5a, 0x9f, 0x7a, 0x4b, 0x3e, 0xe2 };
 
 	u8 buf[sizeof(random)];
-	struct ppno_ws_s ws;
+	struct prno_ws_s ws;
 
 	memset(&ws, 0, sizeof(ws));
 
 	/* initial seed */
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED,
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED,
 		   &ws, NULL, 0, seed, sizeof(seed));
 
 	/* check working states V and C */
@@ -324,9 +324,9 @@ static int __init prng_sha512_selftest(void)
 	}
 
 	/* generate random bytes */
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN,
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
 		   &ws, buf, sizeof(buf), NULL, 0);
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN,
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
 		   &ws, buf, sizeof(buf), NULL, 0);
 
 	/* check against expected data */
@@ -374,16 +374,16 @@ static int __init prng_sha512_instantiate(void)
 	/* followed by 16 bytes of unique nonce */
 	get_tod_clock_ext(seed + 64 + 32);
 
-	/* initial seed of the ppno drng */
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED,
-		   &prng_data->ppnows, NULL, 0, seed, sizeof(seed));
+	/* initial seed of the prno drng */
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED,
+		   &prng_data->prnows, NULL, 0, seed, sizeof(seed));
 
 	/* if fips mode is enabled, generate a first block of random
 	   bytes for the FIPS 140-2 Conditional Self Test */
 	if (fips_enabled) {
 		prng_data->prev = prng_data->buf + prng_chunk_size;
-		cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN,
-			   &prng_data->ppnows,
+		cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
+			   &prng_data->prnows,
 			   prng_data->prev, prng_chunk_size, NULL, 0);
 	}
 
@@ -412,9 +412,9 @@ static int prng_sha512_reseed(void)
 	if (ret != sizeof(seed))
 		return ret;
 
-	/* do a reseed of the ppno drng with this bytestring */
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED,
-		   &prng_data->ppnows, NULL, 0, seed, sizeof(seed));
+	/* do a reseed of the prno drng with this bytestring */
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED,
+		   &prng_data->prnows, NULL, 0, seed, sizeof(seed));
 
 	return 0;
 }
@@ -425,15 +425,15 @@ static int prng_sha512_generate(u8 *buf, size_t nbytes)
 	int ret;
 
 	/* reseed needed ? */
-	if (prng_data->ppnows.reseed_counter > prng_reseed_limit) {
+	if (prng_data->prnows.reseed_counter > prng_reseed_limit) {
 		ret = prng_sha512_reseed();
 		if (ret)
 			return ret;
 	}
 
-	/* PPNO generate */
-	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN,
-		   &prng_data->ppnows, buf, nbytes, NULL, 0);
+	/* PRNO generate */
+	cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
+		   &prng_data->prnows, buf, nbytes, NULL, 0);
 
 	/* FIPS 140-2 Conditional Self Test */
 	if (fips_enabled) {
@@ -653,7 +653,7 @@ static ssize_t prng_counter_show(struct device *dev,
 	if (mutex_lock_interruptible(&prng_data->mutex))
 		return -ERESTARTSYS;
 	if (prng_mode == PRNG_MODE_SHA512)
-		counter = prng_data->ppnows.stream_bytes;
+		counter = prng_data->prnows.stream_bytes;
 	else
 		counter = prng_data->prngws.byte_counter;
 	mutex_unlock(&prng_data->mutex);
@@ -774,8 +774,8 @@ static int __init prng_init(void)
 
 	/* choose prng mode */
 	if (prng_mode != PRNG_MODE_TDES) {
-		/* check for MSA5 support for PPNO operations */
-		if (!cpacf_query_func(CPACF_PPNO, CPACF_PPNO_SHA512_DRNG_GEN)) {
+		/* check for MSA5 support for PRNO operations */
+		if (!cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) {
 			if (prng_mode == PRNG_MODE_SHA512) {
 				pr_err("The prng module cannot "
 				       "start in SHA-512 mode\n");
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index e2dfbf280d12..ba4aecc2dac7 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -25,7 +25,7 @@
 #define CPACF_KMO		0xb92b		/* MSA4 */
 #define CPACF_PCC		0xb92c		/* MSA4 */
 #define CPACF_KMCTR		0xb92d		/* MSA4 */
-#define CPACF_PPNO		0xb93c		/* MSA5 */
+#define CPACF_PRNO		0xb93c		/* MSA5 */
 
 /*
  * En/decryption modifier bits
@@ -123,12 +123,12 @@
 #define CPACF_PCKMO_ENC_AES_256_KEY	0x14
 
 /*
- * Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
+ * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION)
  * instruction
  */
-#define CPACF_PPNO_QUERY		0x00
-#define CPACF_PPNO_SHA512_DRNG_GEN	0x03
-#define CPACF_PPNO_SHA512_DRNG_SEED	0x83
+#define CPACF_PRNO_QUERY		0x00
+#define CPACF_PRNO_SHA512_DRNG_GEN	0x03
+#define CPACF_PRNO_SHA512_DRNG_SEED	0x83
 
 typedef struct { unsigned char bytes[16]; } cpacf_mask_t;
 
@@ -173,7 +173,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode)
 	case CPACF_PCC:
 	case CPACF_KMCTR:
 		return test_facility(77);	/* check for MSA4 */
-	case CPACF_PPNO:
+	case CPACF_PRNO:
 		return test_facility(57);	/* check for MSA5 */
 	default:
 		BUG();
@@ -373,16 +373,16 @@ static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest,
 }
 
 /**
- * cpacf_ppno() - executes the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
+ * cpacf_prno() - executes the PRNO (PERFORM RANDOM NUMBER OPERATION)
  *		  instruction
- * @func: the function code passed to PPNO; see CPACF_PPNO_xxx defines
+ * @func: the function code passed to PRNO; see CPACF_PRNO_xxx defines
  * @param: address of parameter block; see POP for details on each func
  * @dest: address of destination memory area
  * @dest_len: size of destination memory area in bytes
  * @seed: address of seed data
  * @seed_len: size of seed data in bytes
  */
-static inline void cpacf_ppno(unsigned long func, void *param,
+static inline void cpacf_prno(unsigned long func, void *param,
 			      u8 *dest, long dest_len,
 			      const u8 *seed, long seed_len)
 {
@@ -398,7 +398,7 @@ static inline void cpacf_ppno(unsigned long func, void *param,
 		"	brc	1,0b\n"	  /* handle partial completion */
 		: [dst] "+a" (r2), [dlen] "+d" (r3)
 		: [fc] "d" (r0), [pba] "a" (r1),
-		  [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PPNO)
+		  [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO)
 		: "cc", "memory");
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 28983836c0f7..d5c5c911821a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -273,7 +273,7 @@ static void kvm_s390_cpu_feat_init(void)
 			      kvm_s390_available_subfunc.pcc);
 	}
 	if (test_facility(57)) /* MSA5 */
-		__cpacf_query(CPACF_PPNO, (cpacf_mask_t *)
+		__cpacf_query(CPACF_PRNO, (cpacf_mask_t *)
 			      kvm_s390_available_subfunc.ppno);
 
 	if (MACHINE_HAS_ESOP)

From f75fa65d70e2423a03c1b7391bfca410d554b1c5 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Tue, 28 Feb 2017 08:59:22 +0100
Subject: [PATCH 69/74] s390/crypto: Add new subfunctions to the cpacf PRNO
 function.

There is a new TRNG extension in the subcodes for the cpacf
PRNO function. This patch introduces new defines and a new
cpacf_trng inline function to provide these new features for
other kernel code parts.

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/cpacf.h | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index ba4aecc2dac7..7707a35176c4 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -129,6 +129,8 @@
 #define CPACF_PRNO_QUERY		0x00
 #define CPACF_PRNO_SHA512_DRNG_GEN	0x03
 #define CPACF_PRNO_SHA512_DRNG_SEED	0x83
+#define CPACF_PRNO_TRNG_Q_R2C_RATIO	0x70
+#define CPACF_PRNO_TRNG			0x72
 
 typedef struct { unsigned char bytes[16]; } cpacf_mask_t;
 
@@ -383,8 +385,8 @@ static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest,
  * @seed_len: size of seed data in bytes
  */
 static inline void cpacf_prno(unsigned long func, void *param,
-			      u8 *dest, long dest_len,
-			      const u8 *seed, long seed_len)
+			      u8 *dest, unsigned long dest_len,
+			      const u8 *seed, unsigned long seed_len)
 {
 	register unsigned long r0 asm("0") = (unsigned long) func;
 	register unsigned long r1 asm("1") = (unsigned long) param;
@@ -402,6 +404,31 @@ static inline void cpacf_prno(unsigned long func, void *param,
 		: "cc", "memory");
 }
 
+/**
+ * cpacf_trng() - executes the TRNG subfunction of the PRNO instruction
+ * @ucbuf: buffer for unconditioned data
+ * @ucbuf_len: amount of unconditioned data to fetch in bytes
+ * @cbuf: buffer for conditioned data
+ * @cbuf_len: amount of conditioned data to fetch in bytes
+ */
+static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
+			      u8 *cbuf, unsigned long cbuf_len)
+{
+	register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG;
+	register unsigned long r2 asm("2") = (unsigned long) ucbuf;
+	register unsigned long r3 asm("3") = (unsigned long) ucbuf_len;
+	register unsigned long r4 asm("4") = (unsigned long) cbuf;
+	register unsigned long r5 asm("5") = (unsigned long) cbuf_len;
+
+	asm volatile (
+		"0:	.insn	rre,%[opc] << 16,%[ucbuf],%[cbuf]\n"
+		"	brc	1,0b\n"	  /* handle partial completion */
+		: [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3),
+		  [cbuf] "+a" (r4), [cbuflen] "+d" (r5)
+		: [fc] "d" (r0), [opc] "i" (CPACF_PRNO)
+		: "cc", "memory");
+}
+
 /**
  * cpacf_pcc() - executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION)
  *		 instruction

From 4c637cd8de43416c1b1eef6113e7aa06abacf18d Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Fri, 17 Mar 2017 10:46:31 +0100
Subject: [PATCH 70/74] s390/crypto: Provide s390 specific arch random
 functionality.

This patch introduces s390 specific arch random functionality.
There exists a generic kernel API for arch specific random
number implementation (see include/linux/random.h). Here
comes the header file and a very small static code part
implementing the arch_random_* API based on the TRNG
subfunction coming with the reworked PRNG instruction.

The arch random implementation hooks into the kernel
initialization and checks for availability of the TRNG
function. In accordance to the arch random API all functions
return false if the TRNG is not available. Otherwise the new
high quality entropy source provides fresh random on each
invocation.

The s390 arch random feature build is controlled via
CONFIG_ARCH_RANDOM. This config option located in
arch/s390/Kconfig is enabled by default and appears
as entry "s390 architectural random number generation API"
in the submenu "Processor type and features" for s390 builds.

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kbuild                   |  2 +-
 arch/s390/Kconfig                  | 15 +++++++
 arch/s390/crypto/Makefile          |  1 +
 arch/s390/crypto/arch_random.c     | 31 ++++++++++++++
 arch/s390/include/asm/archrandom.h | 69 ++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/crypto/arch_random.c
 create mode 100644 arch/s390/include/asm/archrandom.h

diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index e256592eb66e..eae2c64cf69d 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -1,7 +1,7 @@
 obj-y				+= kernel/
 obj-y				+= mm/
 obj-$(CONFIG_KVM)		+= kvm/
-obj-$(CONFIG_CRYPTO_HW)		+= crypto/
+obj-y				+= crypto/
 obj-$(CONFIG_S390_HYPFS_FS)	+= hypfs/
 obj-$(CONFIG_APPLDATA_BASE)	+= appldata/
 obj-y				+= net/
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 249c2771be0e..ebc0bf7c84d0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -507,6 +507,21 @@ source kernel/Kconfig.preempt
 
 source kernel/Kconfig.hz
 
+config ARCH_RANDOM
+	def_bool y
+	prompt "s390 architectural random number generation API"
+	help
+	  Enable the s390 architectural random number generation API
+	  to provide random data for all consumers within the Linux
+	  kernel.
+
+	  When enabled the arch_random_* functions declared in linux/random.h
+	  are implemented. The implementation is based on the s390 CPACF
+	  instruction subfunction TRNG which provides a real true random
+	  number generator.
+
+	  If unsure, say Y.
+
 endmenu
 
 menu "Memory setup"
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 402c530c6da5..678d9863e3f0 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -10,5 +10,6 @@ obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
+obj-$(CONFIG_ARCH_RANDOM) += arch_random.o
 
 crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c
new file mode 100644
index 000000000000..9317b3e645e2
--- /dev/null
+++ b/arch/s390/crypto/arch_random.c
@@ -0,0 +1,31 @@
+/*
+ * s390 arch random implementation.
+ *
+ * Copyright IBM Corp. 2017
+ * Author(s): Harald Freudenberger <freude@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/static_key.h>
+#include <asm/cpacf.h>
+
+DEFINE_STATIC_KEY_FALSE(s390_arch_random_available);
+
+atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0);
+EXPORT_SYMBOL(s390_arch_random_counter);
+
+static int __init s390_arch_random_init(void)
+{
+	/* check if subfunction CPACF_PRNO_TRNG is available */
+	if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG))
+		static_branch_enable(&s390_arch_random_available);
+
+	return 0;
+}
+arch_initcall(s390_arch_random_init);
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
new file mode 100644
index 000000000000..6033901a40b2
--- /dev/null
+++ b/arch/s390/include/asm/archrandom.h
@@ -0,0 +1,69 @@
+/*
+ * Kernel interface for the s390 arch_random_* functions
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _ASM_S390_ARCHRANDOM_H
+#define _ASM_S390_ARCHRANDOM_H
+
+#ifdef CONFIG_ARCH_RANDOM
+
+#include <linux/static_key.h>
+#include <linux/atomic.h>
+#include <asm/cpacf.h>
+
+DECLARE_STATIC_KEY_FALSE(s390_arch_random_available);
+extern atomic64_t s390_arch_random_counter;
+
+static void s390_arch_random_generate(u8 *buf, unsigned int nbytes)
+{
+	cpacf_trng(NULL, 0, buf, nbytes);
+	atomic64_add(nbytes, &s390_arch_random_counter);
+}
+
+static inline bool arch_has_random(void)
+{
+	if (static_branch_likely(&s390_arch_random_available))
+		return true;
+	return false;
+}
+
+static inline bool arch_has_random_seed(void)
+{
+	return arch_has_random();
+}
+
+static inline bool arch_get_random_long(unsigned long *v)
+{
+	if (static_branch_likely(&s390_arch_random_available)) {
+		s390_arch_random_generate((u8 *)v, sizeof(*v));
+		return true;
+	}
+	return false;
+}
+
+static inline bool arch_get_random_int(unsigned int *v)
+{
+	if (static_branch_likely(&s390_arch_random_available)) {
+		s390_arch_random_generate((u8 *)v, sizeof(*v));
+		return true;
+	}
+	return false;
+}
+
+static inline bool arch_get_random_seed_long(unsigned long *v)
+{
+	return arch_get_random_long(v);
+}
+
+static inline bool arch_get_random_seed_int(unsigned int *v)
+{
+	return arch_get_random_int(v);
+}
+
+#endif /* CONFIG_ARCH_RANDOM */
+#endif /* _ASM_S390_ARCHRANDOM_H */

From bbcb478e3fefa5d9acff6a5311073fae0809a9c3 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Fri, 24 Feb 2017 09:10:05 +0100
Subject: [PATCH 71/74] s390/trng: Introduce s390 TRNG device driver.

This patch introduces a new device driver s390-trng for the
s390 platform which exploits the new PRNO TRNG cpacf
subfunction. The true-random-number-generator is accessible
from userspace, by default visible as /dev/trng. The driver
also registers at the kernel build-in hwrng API to feed the
hwrng with fresh entropy data. This generic device driver
for hardware random data is visible from userspace as
/dev/hwrng.

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/char/hw_random/Kconfig     |  14 ++
 drivers/char/hw_random/Makefile    |   1 +
 drivers/char/hw_random/s390-trng.c | 268 +++++++++++++++++++++++++++++
 3 files changed, 283 insertions(+)
 create mode 100644 drivers/char/hw_random/s390-trng.c

diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 0cafe08919c9..b9918fb9587d 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -423,6 +423,20 @@ config HW_RANDOM_CAVIUM
 
          If unsure, say Y.
 
+config HW_RANDOM_S390
+	tristate "S390 True Random Number Generator support"
+	depends on S390
+	default HW_RANDOM
+	---help---
+	  This driver provides kernel-side support for the True
+	  Random Number Generator available as CPACF extension
+	  on modern s390 hardware platforms.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called s390-trng.
+
+	  If unsure, say Y.
+
 endif # HW_RANDOM
 
 config UML_RANDOM
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index 5f52b1e4e7be..dd1765246255 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_HW_RANDOM_STM32) += stm32-rng.o
 obj-$(CONFIG_HW_RANDOM_PIC32) += pic32-rng.o
 obj-$(CONFIG_HW_RANDOM_MESON) += meson-rng.o
 obj-$(CONFIG_HW_RANDOM_CAVIUM) += cavium-rng.o cavium-rng-vf.o
+obj-$(CONFIG_HW_RANDOM_S390) += s390-trng.o
diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c
new file mode 100644
index 000000000000..aca48e893fca
--- /dev/null
+++ b/drivers/char/hw_random/s390-trng.c
@@ -0,0 +1,268 @@
+/*
+ * s390 TRNG device driver
+ *
+ * Driver for the TRNG (true random number generation) command
+ * available via CPACF extension MSA 7 on the s390 arch.
+
+ * Copyright IBM Corp. 2017
+ * Author(s): Harald Freudenberger <freude@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "trng"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/hw_random.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
+#include <linux/atomic.h>
+#include <linux/random.h>
+#include <linux/sched/signal.h>
+#include <asm/debug.h>
+#include <asm/cpacf.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("s390 CPACF TRNG device driver");
+
+
+/* trng related debug feature things */
+
+static debug_info_t *debug_info;
+
+#define DEBUG_DBG(...)	debug_sprintf_event(debug_info, 6, ##__VA_ARGS__)
+#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__)
+#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__)
+#define DEBUG_ERR(...)	debug_sprintf_event(debug_info, 3, ##__VA_ARGS__)
+
+
+/* trng helpers */
+
+static atomic64_t trng_dev_counter = ATOMIC64_INIT(0);
+static atomic64_t trng_hwrng_counter = ATOMIC64_INIT(0);
+
+
+/* file io functions */
+
+static int trng_open(struct inode *inode, struct file *file)
+{
+	return nonseekable_open(inode, file);
+}
+
+static ssize_t trng_read(struct file *file, char __user *ubuf,
+			 size_t nbytes, loff_t *ppos)
+{
+	u8 buf[32];
+	u8 *p = buf;
+	unsigned int n;
+	ssize_t ret = 0;
+
+	/*
+	 * use buf for requests <= sizeof(buf),
+	 * otherwise allocate one page and fetch
+	 * pagewise.
+	 */
+
+	if (nbytes > sizeof(buf)) {
+		p = (u8 *) __get_free_page(GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+	}
+
+	while (nbytes) {
+		if (need_resched()) {
+			if (signal_pending(current)) {
+				if (ret == 0)
+					ret = -ERESTARTSYS;
+				break;
+			}
+			schedule();
+		}
+		n = nbytes > PAGE_SIZE ? PAGE_SIZE : nbytes;
+		cpacf_trng(NULL, 0, p, n);
+		atomic64_add(n, &trng_dev_counter);
+		if (copy_to_user(ubuf, p, n)) {
+			ret = -EFAULT;
+			break;
+		}
+		nbytes -= n;
+		ubuf += n;
+		ret += n;
+	}
+
+	if (p != buf)
+		free_page((unsigned long) p);
+
+	DEBUG_DBG("trng_read()=%zd\n", ret);
+	return ret;
+}
+
+
+/* sysfs */
+
+static ssize_t trng_counter_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	u64 dev_counter = atomic64_read(&trng_dev_counter);
+	u64 hwrng_counter = atomic64_read(&trng_hwrng_counter);
+#if IS_ENABLED(CONFIG_ARCH_RANDOM)
+	u64 arch_counter = atomic64_read(&s390_arch_random_counter);
+
+	return snprintf(buf, PAGE_SIZE,
+			"trng:  %llu\n"
+			"hwrng: %llu\n"
+			"arch:  %llu\n"
+			"total: %llu\n",
+			dev_counter, hwrng_counter, arch_counter,
+			dev_counter + hwrng_counter + arch_counter);
+#else
+	return snprintf(buf, PAGE_SIZE,
+			"trng:  %llu\n"
+			"hwrng: %llu\n"
+			"total: %llu\n",
+			dev_counter, hwrng_counter,
+			dev_counter + hwrng_counter);
+#endif
+}
+static DEVICE_ATTR(byte_counter, 0444, trng_counter_show, NULL);
+
+static struct attribute *trng_dev_attrs[] = {
+	&dev_attr_byte_counter.attr,
+	NULL
+};
+
+static const struct attribute_group trng_dev_attr_group = {
+	.attrs = trng_dev_attrs
+};
+
+static const struct attribute_group *trng_dev_attr_groups[] = {
+	&trng_dev_attr_group,
+	NULL
+};
+
+static const struct file_operations trng_fops = {
+	.owner		= THIS_MODULE,
+	.open		= &trng_open,
+	.release	= NULL,
+	.read		= &trng_read,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice trng_dev = {
+	.name	= "trng",
+	.minor	= MISC_DYNAMIC_MINOR,
+	.mode	= 0444,
+	.fops	= &trng_fops,
+	.groups = trng_dev_attr_groups,
+};
+
+
+/* hwrng_register */
+
+static inline void _trng_hwrng_read(u8 *buf, size_t len)
+{
+	cpacf_trng(NULL, 0, buf, len);
+	atomic64_add(len, &trng_hwrng_counter);
+}
+
+static int trng_hwrng_data_read(struct hwrng *rng, u32 *data)
+{
+	size_t len = sizeof(*data);
+
+	_trng_hwrng_read((u8 *) data, len);
+
+	DEBUG_DBG("trng_hwrng_data_read()=%zu\n", len);
+
+	return len;
+}
+
+static int trng_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait)
+{
+	size_t len = max <= PAGE_SIZE ? max : PAGE_SIZE;
+
+	_trng_hwrng_read((u8 *) data, len);
+
+	DEBUG_DBG("trng_hwrng_read()=%zu\n", len);
+
+	return len;
+}
+
+/*
+ * hwrng register struct
+ * The trng is suppost to have 100% entropy, and thus
+ * we register with a very high quality value.
+ */
+static struct hwrng trng_hwrng_dev = {
+	.name		= "s390-trng",
+	.data_read	= trng_hwrng_data_read,
+	.read		= trng_hwrng_read,
+	.quality	= 999,
+};
+
+
+/* init and exit */
+
+static void __init trng_debug_init(void)
+{
+	debug_info = debug_register("trng", 1, 1, 4 * sizeof(long));
+	debug_register_view(debug_info, &debug_sprintf_view);
+	debug_set_level(debug_info, 3);
+}
+
+static void trng_debug_exit(void)
+{
+	debug_unregister(debug_info);
+}
+
+static int __init trng_init(void)
+{
+	int ret;
+
+	trng_debug_init();
+
+	/* check if subfunction CPACF_PRNO_TRNG is available */
+	if (!cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) {
+		DEBUG_INFO("trng_init CPACF_PRNO_TRNG not available\n");
+		ret = -ENODEV;
+		goto out_dbg;
+	}
+
+	ret = misc_register(&trng_dev);
+	if (ret) {
+		DEBUG_WARN("trng_init misc_register() failed rc=%d\n", ret);
+		goto out_dbg;
+	}
+
+	ret = hwrng_register(&trng_hwrng_dev);
+	if (ret) {
+		DEBUG_WARN("trng_init hwrng_register() failed rc=%d\n", ret);
+		goto out_misc;
+	}
+
+	DEBUG_DBG("trng_init successful\n");
+
+	return 0;
+
+out_misc:
+	misc_deregister(&trng_dev);
+out_dbg:
+	trng_debug_exit();
+	return ret;
+}
+
+static void __exit trng_exit(void)
+{
+	hwrng_unregister(&trng_hwrng_dev);
+	misc_deregister(&trng_dev);
+	trng_debug_exit();
+}
+
+module_cpu_feature_match(MSA, trng_init);
+module_exit(trng_exit);

From b112a2df28e6b5238ae4e68821ddd706a476d28c Mon Sep 17 00:00:00 2001
From: "Jason J. Herne" <jjherne@linux.vnet.ibm.com>
Date: Tue, 21 Feb 2017 09:00:54 -0500
Subject: [PATCH 72/74] s390/cpacf: query instructions use unique parameters
 for compatibility with KMA

The new KMA instruction requires unique parameters. Update __cpacf_query to
generate a compatible assembler instruction.

Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Acked-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/cpacf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index e2dfbf280d12..175a5d773306 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -149,8 +149,8 @@ static inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
 
 	asm volatile(
 		"	spm 0\n" /* pckmo doesn't change the cc */
-		/* Parameter registers are ignored, but may not be 0 */
-		"0:	.insn	rrf,%[opc] << 16,2,2,2,0\n"
+		/* Parameter regs are ignored, but must be nonzero and unique */
+		"0:	.insn	rrf,%[opc] << 16,2,4,6,0\n"
 		"	brc	1,0b\n"	/* handle partial completion */
 		: "=m" (*mask)
 		: [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode)

From 152c1c8d60ebedce8cc912c12f9be9ceca6c6671 Mon Sep 17 00:00:00 2001
From: "Jason J. Herne" <jjherne@linux.vnet.ibm.com>
Date: Thu, 2 Feb 2017 08:25:19 -0500
Subject: [PATCH 73/74] s390/cpacf: Introduce kma instruction

Provide a kma instruction definition for use by callers of __cpacf_query.

Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/cpacf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 175a5d773306..31cac7d17b48 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -26,6 +26,7 @@
 #define CPACF_PCC		0xb92c		/* MSA4 */
 #define CPACF_KMCTR		0xb92d		/* MSA4 */
 #define CPACF_PPNO		0xb93c		/* MSA5 */
+#define CPACF_KMA		0xb929		/* MSA8 */
 
 /*
  * En/decryption modifier bits

From 284c43e6af415515865d2900d18d1382979c777c Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.vnet.ibm.com>
Date: Wed, 26 Apr 2017 13:56:41 +0200
Subject: [PATCH 74/74] s390/crypt: use the correct module alias for paes_s390.

For automatic module loading (e.g. as it is used with cryptsetup)
an alias "paes" for the paes_s390 kernel module is needed.
Correct the paes_s390 module alias from "aes-all" to "paes".

Signed-off-by: Harald Freudenberger <freude@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/crypto/paes_s390.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 716b17238599..a4e903ed7e21 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -616,7 +616,7 @@ out_err:
 module_init(paes_s390_init);
 module_exit(paes_s390_fini);
 
-MODULE_ALIAS_CRYPTO("aes-all");
+MODULE_ALIAS_CRYPTO("paes");
 
 MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys");
 MODULE_LICENSE("GPL");