From ce1ce2f31224c18e84d859ccd5675cbb2728f57a Mon Sep 17 00:00:00 2001 From: Ingo Tuchscherer Date: Tue, 18 Mar 2014 16:30:21 +0100 Subject: [PATCH 01/13] s390/zcrypt: add length check for aligned data to avoid overflow in msg-type 6 Signed-off-by: Ingo Tuchscherer Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_api.c | 4 ++-- drivers/s390/crypto/zcrypt_msgtype6.c | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 4b824b15194f..5222ebe15705 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -626,8 +626,8 @@ static long zcrypt_send_ep11_cprb(struct ep11_urb *xcrb) return -ENOMEM; if (copy_from_user(ep11_dev_list.targets, - (struct ep11_target_dev *)xcrb->targets, - xcrb->targets_num * + (struct ep11_target_dev __force __user *) + xcrb->targets, xcrb->targets_num * sizeof(struct ep11_target_dev))) return -EFAULT; } diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index 0bc91e46395a..46b324ce6c7a 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -315,6 +315,10 @@ static int XCRB_msg_to_type6CPRB_msgX(struct zcrypt_device *zdev, char *req_data = ap_msg->message + sizeof(struct type6_hdr) + rcblen; char *function_code; + if (CEIL4(xcRB->request_control_blk_length) < + xcRB->request_control_blk_length) + return -EINVAL; /* overflow after alignment*/ + /* length checks */ ap_msg->length = sizeof(struct type6_hdr) + CEIL4(xcRB->request_control_blk_length) + @@ -333,6 +337,10 @@ static int XCRB_msg_to_type6CPRB_msgX(struct zcrypt_device *zdev, return -EINVAL; } + if (CEIL4(xcRB->reply_control_blk_length) < + xcRB->reply_control_blk_length) + return -EINVAL; /* overflow after alignment*/ + replylen = sizeof(struct type86_fmt2_msg) + CEIL4(xcRB->reply_control_blk_length) + xcRB->reply_data_length; @@ -415,12 +423,18 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct zcrypt_device *zdev, unsigned int dom_val; /* domain id */ } __packed * payload_hdr; + if (CEIL4(xcRB->req_len) < xcRB->req_len) + return -EINVAL; /* overflow after alignment*/ + /* length checks */ ap_msg->length = sizeof(struct type6_hdr) + xcRB->req_len; if (CEIL4(xcRB->req_len) > MSGTYPE06_MAX_MSG_SIZE - (sizeof(struct type6_hdr))) return -EINVAL; + if (CEIL4(xcRB->resp_len) < xcRB->resp_len) + return -EINVAL; /* overflow after alignment*/ + if (CEIL4(xcRB->resp_len) > MSGTYPE06_MAX_MSG_SIZE - (sizeof(struct type86_fmt2_msg))) return -EINVAL; @@ -432,7 +446,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct zcrypt_device *zdev, /* Import CPRB data from the ioctl input parameter */ if (copy_from_user(&(msg->cprbx.cprb_len), - (char *)xcRB->req, xcRB->req_len)) { + (char __force __user *)xcRB->req, xcRB->req_len)) { return -EFAULT; } @@ -645,7 +659,7 @@ static int convert_type86_ep11_xcrb(struct zcrypt_device *zdev, return -EINVAL; /* Copy response CPRB to user */ - if (copy_to_user((char *)xcRB->resp, + if (copy_to_user((char __force __user *)xcRB->resp, data + msg->fmt2.offset1, msg->fmt2.count1)) return -EFAULT; xcRB->resp_len = msg->fmt2.count1; From 0ccc8b7ac86053388e793bad20bd26bd777752eb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 20 Mar 2014 08:55:00 +0100 Subject: [PATCH 02/13] s390/bitops,atomic: add missing memory barriers When reworking the bitops and atomic ops I missed that those instructions that got atomic behaviour only perform a "specific-operand-serialization" instead of a full "serialization". The compare-and-swap instruction used before performs a full serialization before and after the instruction is executed, which means it has full memory barrier semantics. In order to give the new bitops and atomic ops functions also full memory barrier semantics add a "bcr 14,0" before and after each of those new instructions which performs full serialization as well. This restores memory barrier semantics for bitops and atomic ops functions which return values, like e.g. atomic_add_return(), but not for functions which do not return a value, like e.g. atomic_add(). This is consistent to other architectures and what common code requires. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/atomic.h | 70 ++++++++++++++++++++-------------- arch/s390/include/asm/bitops.h | 41 +++++++++++--------- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index fa9aaf7144b7..1d4706114a45 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,23 +15,29 @@ #include #include +#include #include #define ATOMIC_INIT(i) { (i) } +#define __ATOMIC_NO_BARRIER "\n" + #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC_OR "lao" #define __ATOMIC_AND "lan" #define __ATOMIC_ADD "laa" +#define __ATOMIC_BARRIER "bcr 14,0\n" -#define __ATOMIC_LOOP(ptr, op_val, op_string) \ +#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ ({ \ int old_val; \ \ typecheck(atomic_t *, ptr); \ asm volatile( \ + __barrier \ op_string " %0,%2,%1\n" \ + __barrier \ : "=d" (old_val), "+Q" ((ptr)->counter) \ : "d" (op_val) \ : "cc", "memory"); \ @@ -43,8 +49,9 @@ #define __ATOMIC_OR "or" #define __ATOMIC_AND "nr" #define __ATOMIC_ADD "ar" +#define __ATOMIC_BARRIER "\n" -#define __ATOMIC_LOOP(ptr, op_val, op_string) \ +#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ ({ \ int old_val, new_val; \ \ @@ -82,7 +89,7 @@ static inline void atomic_set(atomic_t *v, int i) static inline int atomic_add_return(int i, atomic_t *v) { - return __ATOMIC_LOOP(v, i, __ATOMIC_ADD) + i; + return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i; } static inline void atomic_add(int i, atomic_t *v) @@ -94,12 +101,10 @@ static inline void atomic_add(int i, atomic_t *v) : "+Q" (v->counter) : "i" (i) : "cc", "memory"); - } else { - atomic_add_return(i, v); + return; } -#else - atomic_add_return(i, v); #endif + __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_NO_BARRIER); } #define atomic_add_negative(_i, _v) (atomic_add_return(_i, _v) < 0) @@ -115,12 +120,12 @@ static inline void atomic_add(int i, atomic_t *v) static inline void atomic_clear_mask(unsigned int mask, atomic_t *v) { - __ATOMIC_LOOP(v, ~mask, __ATOMIC_AND); + __ATOMIC_LOOP(v, ~mask, __ATOMIC_AND, __ATOMIC_NO_BARRIER); } static inline void atomic_set_mask(unsigned int mask, atomic_t *v) { - __ATOMIC_LOOP(v, mask, __ATOMIC_OR); + __ATOMIC_LOOP(v, mask, __ATOMIC_OR, __ATOMIC_NO_BARRIER); } #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) @@ -157,19 +162,24 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) #ifdef CONFIG_64BIT +#define __ATOMIC64_NO_BARRIER "\n" + #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC64_OR "laog" #define __ATOMIC64_AND "lang" #define __ATOMIC64_ADD "laag" +#define __ATOMIC64_BARRIER "bcr 14,0\n" -#define __ATOMIC64_LOOP(ptr, op_val, op_string) \ +#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ ({ \ long long old_val; \ \ typecheck(atomic64_t *, ptr); \ asm volatile( \ + __barrier \ op_string " %0,%2,%1\n" \ + __barrier \ : "=d" (old_val), "+Q" ((ptr)->counter) \ : "d" (op_val) \ : "cc", "memory"); \ @@ -181,8 +191,9 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) #define __ATOMIC64_OR "ogr" #define __ATOMIC64_AND "ngr" #define __ATOMIC64_ADD "agr" +#define __ATOMIC64_BARRIER "\n" -#define __ATOMIC64_LOOP(ptr, op_val, op_string) \ +#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ ({ \ long long old_val, new_val; \ \ @@ -220,17 +231,32 @@ static inline void atomic64_set(atomic64_t *v, long long i) static inline long long atomic64_add_return(long long i, atomic64_t *v) { - return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD) + i; + return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i; +} + +static inline void atomic64_add(long long i, atomic64_t *v) +{ +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { + asm volatile( + "agsi %0,%1\n" + : "+Q" (v->counter) + : "i" (i) + : "cc", "memory"); + return; + } +#endif + __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_NO_BARRIER); } static inline void atomic64_clear_mask(unsigned long mask, atomic64_t *v) { - __ATOMIC64_LOOP(v, ~mask, __ATOMIC64_AND); + __ATOMIC64_LOOP(v, ~mask, __ATOMIC64_AND, __ATOMIC64_NO_BARRIER); } static inline void atomic64_set_mask(unsigned long mask, atomic64_t *v) { - __ATOMIC64_LOOP(v, mask, __ATOMIC64_OR); + __ATOMIC64_LOOP(v, mask, __ATOMIC64_OR, __ATOMIC64_NO_BARRIER); } #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) @@ -334,25 +360,13 @@ static inline void atomic64_clear_mask(unsigned long long mask, atomic64_t *v) } while (atomic64_cmpxchg(v, old, new) != old); } -#endif /* CONFIG_64BIT */ - static inline void atomic64_add(long long i, atomic64_t *v) { -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - asm volatile( - "agsi %0,%1\n" - : "+Q" (v->counter) - : "i" (i) - : "cc", "memory"); - } else { - atomic64_add_return(i, v); - } -#else atomic64_add_return(i, v); -#endif } +#endif /* CONFIG_64BIT */ + static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) { long long c, old; diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index ec5ef891db6b..520542477678 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -47,14 +47,18 @@ #include #include +#include + +#define __BITOPS_NO_BARRIER "\n" #ifndef CONFIG_64BIT #define __BITOPS_OR "or" #define __BITOPS_AND "nr" #define __BITOPS_XOR "xr" +#define __BITOPS_BARRIER "\n" -#define __BITOPS_LOOP(__addr, __val, __op_string) \ +#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ ({ \ unsigned long __old, __new; \ \ @@ -67,7 +71,7 @@ " jl 0b" \ : "=&d" (__old), "=&d" (__new), "+Q" (*(__addr))\ : "d" (__val) \ - : "cc"); \ + : "cc", "memory"); \ __old; \ }) @@ -78,17 +82,20 @@ #define __BITOPS_OR "laog" #define __BITOPS_AND "lang" #define __BITOPS_XOR "laxg" +#define __BITOPS_BARRIER "bcr 14,0\n" -#define __BITOPS_LOOP(__addr, __val, __op_string) \ +#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ ({ \ unsigned long __old; \ \ typecheck(unsigned long *, (__addr)); \ asm volatile( \ + __barrier \ __op_string " %0,%2,%1\n" \ + __barrier \ : "=d" (__old), "+Q" (*(__addr)) \ : "d" (__val) \ - : "cc"); \ + : "cc", "memory"); \ __old; \ }) @@ -97,8 +104,9 @@ #define __BITOPS_OR "ogr" #define __BITOPS_AND "ngr" #define __BITOPS_XOR "xgr" +#define __BITOPS_BARRIER "\n" -#define __BITOPS_LOOP(__addr, __val, __op_string) \ +#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ ({ \ unsigned long __old, __new; \ \ @@ -111,7 +119,7 @@ " jl 0b" \ : "=&d" (__old), "=&d" (__new), "+Q" (*(__addr))\ : "d" (__val) \ - : "cc"); \ + : "cc", "memory"); \ __old; \ }) @@ -149,12 +157,12 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) "oi %0,%b1\n" : "+Q" (*caddr) : "i" (1 << (nr & 7)) - : "cc"); + : "cc", "memory"); return; } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_OR); + __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_NO_BARRIER); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -170,12 +178,12 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) "ni %0,%b1\n" : "+Q" (*caddr) : "i" (~(1 << (nr & 7))) - : "cc"); + : "cc", "memory"); return; } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __BITOPS_LOOP(addr, mask, __BITOPS_AND); + __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_NO_BARRIER); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -191,12 +199,12 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) "xi %0,%b1\n" : "+Q" (*caddr) : "i" (1 << (nr & 7)) - : "cc"); + : "cc", "memory"); return; } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_XOR); + __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_NO_BARRIER); } static inline int @@ -206,8 +214,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_OR); - barrier(); + old = __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_BARRIER); return (old & mask) != 0; } @@ -218,8 +225,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __BITOPS_LOOP(addr, mask, __BITOPS_AND); - barrier(); + old = __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_BARRIER); return (old & ~mask) != 0; } @@ -230,8 +236,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR); - barrier(); + old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_BARRIER); return (old & mask) != 0; } From bd1cb5de140d844f63389bf21b336c194a8c83a1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 21 Mar 2014 15:24:27 +0100 Subject: [PATCH 03/13] s390/3270: fix crash with multiple reset device requests If the 3270 device is detached the initial reset device request will stays pending until the device is operational. A second reset device call will reuse the same request structure which will cause an oops. Add a check to see if the reset device request is already pending and do nothing in this case. Signed-off-by: Martin Schwidefsky --- drivers/s390/char/raw3270.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c index 9f849df4381e..15b3459f8656 100644 --- a/drivers/s390/char/raw3270.c +++ b/drivers/s390/char/raw3270.c @@ -632,6 +632,8 @@ raw3270_reset_device_cb(struct raw3270_request *rq, void *data) raw3270_size_device_done(rp); } else raw3270_writesf_readpart(rp); + memset(&rp->init_reset, 0, sizeof(rp->init_reset)); + memset(&rp->init_data, 0, sizeof(rp->init_data)); } static int @@ -639,9 +641,10 @@ __raw3270_reset_device(struct raw3270 *rp) { int rc; + /* Check if reset is already pending */ + if (rp->init_reset.view) + return -EBUSY; /* Store reset data stream to init_data/init_reset */ - memset(&rp->init_reset, 0, sizeof(rp->init_reset)); - memset(&rp->init_data, 0, sizeof(rp->init_data)); rp->init_data[0] = TW_KR; rp->init_reset.ccw.cmd_code = TC_EWRITEA; rp->init_reset.ccw.flags = CCW_FLAG_SLI; @@ -850,7 +853,7 @@ raw3270_create_device(struct ccw_device *cdev) char *ascebc; int rc; - rp = kmalloc(sizeof(struct raw3270), GFP_KERNEL | GFP_DMA); + rp = kzalloc(sizeof(struct raw3270), GFP_KERNEL | GFP_DMA); if (!rp) return ERR_PTR(-ENOMEM); ascebc = kmalloc(256, GFP_KERNEL); From 02ba8d211d16f3b385147dd50d7198744e9af59c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 15:09:24 +0000 Subject: [PATCH 04/13] s390/tape: Use del_timer_sync() del_timer() does not wait for a possible running callback to complete. So the call side might free request and the associated objects while on another cpu the timer handler runs. Signed-off-by: Thomas Gleixner Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/tape_std.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 981a99fd8d42..cf577a2b13ac 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -78,7 +78,7 @@ tape_std_assign(struct tape_device *device) rc = tape_do_io_interruptible(device, request); - del_timer(&timeout); + del_timer_sync(&timeout); if (rc != 0) { DBF_EVENT(3, "%08x: assign failed - device might be busy\n", From 50f0e3bc93d86d0df6805eb55c58302d27a5cfca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 15:09:25 +0000 Subject: [PATCH 05/13] s390/tape: Add missing destroy_timer_on_stack() Otherwise we leak a tracking object when DEBUG_OBJECTS is enabled. Signed-off-by: Thomas Gleixner Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/tape_std.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index cf577a2b13ac..3478e19ae194 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -79,6 +79,7 @@ tape_std_assign(struct tape_device *device) rc = tape_do_io_interruptible(device, request); del_timer_sync(&timeout); + destroy_timer_on_stack(&timeout); if (rc != 0) { DBF_EVENT(3, "%08x: assign failed - device might be busy\n", From 12797e8e9da93dca9040f8a1a018e27635f29f04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 15:09:25 +0000 Subject: [PATCH 06/13] lcs: Add missing destroy_timer_on_stack() Otherwise we leak a tracking object when DEBUG_OBJECTS is enabled. Signed-off-by: Thomas Gleixner Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/net/lcs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index f404f55b3191..c461f2aac610 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -899,6 +899,7 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, add_timer(&timer); wait_event(reply->wait_q, reply->received); del_timer_sync(&timer); + destroy_timer_on_stack(&timer); LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc); rc = reply->rc; lcs_put_reply(reply); From 1b6a19b34d54d3d56b90423e4a7c60b971d70197 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 31 Mar 2014 09:44:30 +0200 Subject: [PATCH 07/13] kvm/s390: also set guest pages back to stable on kexec/kdump We need to reset the usage state of the pages on kexec/kdump, which use subcode 0 and 1. We will only do the cmma reset in the kernel, everything else is done in userspace as before. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kvm/diag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 6f9cfa500372..1facd9277f38 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -85,6 +85,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); switch (subcode) { + case 0: + case 1: + page_table_reset_pgste(current->mm, 0, TASK_SIZE); + return -EOPNOTSUPP; case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; page_table_reset_pgste(current->mm, 0, TASK_SIZE); From 9f0128f9e7bf7b2929540d395daf8d823dc30c9f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 31 Mar 2014 16:18:29 +0200 Subject: [PATCH 08/13] s390/sclp: add timeout for queued requests This patch adds a timeout option for queued requests and introduces sclp_sync_request_timeout() to use this timer. With this, blocking the system too long, e.g. during an SE reboot, can be avoided in critical situations like CPU and memory hotplug. Since there is no way to cancel a running request, this timeout only applies to queued requests that have not yet been started. Reviewed-by: Peter Oberparleiter Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp.c | 82 ++++++++++++++++++++++++++++++++++++ drivers/s390/char/sclp.h | 9 ++++ drivers/s390/char/sclp_cmd.c | 17 ++++++-- 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 1fe264379e0d..e9ede5980994 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -91,6 +91,9 @@ static struct sclp_req sclp_suspend_req; /* Timer for request retries. */ static struct timer_list sclp_request_timer; +/* Timer for queued requests. */ +static struct timer_list sclp_queue_timer; + /* Internal state: is the driver initialized? */ static volatile enum sclp_init_state_t { sclp_init_state_uninitialized, @@ -215,6 +218,76 @@ sclp_request_timeout(unsigned long data) sclp_process_queue(); } +/* + * Returns the expire value in jiffies of the next pending request timeout, + * if any. Needs to be called with sclp_lock. + */ +static unsigned long __sclp_req_queue_find_next_timeout(void) +{ + unsigned long expires_next = 0; + struct sclp_req *req; + + list_for_each_entry(req, &sclp_req_queue, list) { + if (!req->queue_expires) + continue; + if (!expires_next || + (time_before(req->queue_expires, expires_next))) + expires_next = req->queue_expires; + } + return expires_next; +} + +/* + * Returns expired request, if any, and removes it from the list. + */ +static struct sclp_req *__sclp_req_queue_remove_expired_req(void) +{ + unsigned long flags, now; + struct sclp_req *req; + + spin_lock_irqsave(&sclp_lock, flags); + now = jiffies; + /* Don't need list_for_each_safe because we break out after list_del */ + list_for_each_entry(req, &sclp_req_queue, list) { + if (!req->queue_expires) + continue; + if (time_before_eq(req->queue_expires, now)) { + if (req->status == SCLP_REQ_QUEUED) { + req->status = SCLP_REQ_QUEUED_TIMEOUT; + list_del(&req->list); + goto out; + } + } + } + req = NULL; +out: + spin_unlock_irqrestore(&sclp_lock, flags); + return req; +} + +/* + * Timeout handler for queued requests. Removes request from list and + * invokes callback. This timer can be set per request in situations where + * waiting too long would be harmful to the system, e.g. during SE reboot. + */ +static void sclp_req_queue_timeout(unsigned long data) +{ + unsigned long flags, expires_next; + struct sclp_req *req; + + do { + req = __sclp_req_queue_remove_expired_req(); + if (req && req->callback) + req->callback(req, req->callback_data); + } while (req); + + spin_lock_irqsave(&sclp_lock, flags); + expires_next = __sclp_req_queue_find_next_timeout(); + if (expires_next) + mod_timer(&sclp_queue_timer, expires_next); + spin_unlock_irqrestore(&sclp_lock, flags); +} + /* Try to start a request. Return zero if the request was successfully * started or if it will be started at a later time. Return non-zero otherwise. * Called while sclp_lock is locked. */ @@ -317,6 +390,13 @@ sclp_add_request(struct sclp_req *req) req->start_count = 0; list_add_tail(&req->list, &sclp_req_queue); rc = 0; + if (req->queue_timeout) { + req->queue_expires = jiffies + req->queue_timeout * HZ; + if (!timer_pending(&sclp_queue_timer) || + time_after(sclp_queue_timer.expires, req->queue_expires)) + mod_timer(&sclp_queue_timer, req->queue_expires); + } else + req->queue_expires = 0; /* Start if request is first in list */ if (sclp_running_state == sclp_running_state_idle && req->list.prev == &sclp_req_queue) { @@ -1113,6 +1193,8 @@ sclp_init(void) INIT_LIST_HEAD(&sclp_reg_list); list_add(&sclp_state_change_event.list, &sclp_reg_list); init_timer(&sclp_request_timer); + init_timer(&sclp_queue_timer); + sclp_queue_timer.function = sclp_req_queue_timeout; /* Check interface */ spin_unlock_irqrestore(&sclp_lock, flags); rc = sclp_check_interface(); diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index fea76aed9eea..a68b5ec7d042 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -133,6 +133,11 @@ struct sclp_req { /* Callback that is called after reaching final status. */ void (*callback)(struct sclp_req *, void *data); void *callback_data; + int queue_timeout; /* request queue timeout (sec), set by + caller of sclp_add_request(), if + needed */ + /* Internal fields */ + unsigned long queue_expires; /* request queue timeout (jiffies) */ }; #define SCLP_REQ_FILLED 0x00 /* request is ready to be processed */ @@ -140,6 +145,9 @@ struct sclp_req { #define SCLP_REQ_RUNNING 0x02 /* request is currently running */ #define SCLP_REQ_DONE 0x03 /* request is completed successfully */ #define SCLP_REQ_FAILED 0x05 /* request is finally failed */ +#define SCLP_REQ_QUEUED_TIMEOUT 0x06 /* request on queue timed out */ + +#define SCLP_QUEUE_INTERVAL 5 /* timeout interval for request queue */ /* function pointers that a high level driver has to use for registration */ /* of some routines it wants to be called from the low level driver */ @@ -173,6 +181,7 @@ int sclp_deactivate(void); int sclp_reactivate(void); int sclp_service_call(sclp_cmdw_t command, void *sccb); int sclp_sync_request(sclp_cmdw_t command, void *sccb); +int sclp_sync_request_timeout(sclp_cmdw_t command, void *sccb, int timeout); int sclp_sdias_init(void); void sclp_sdias_exit(void); diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 49af8eeb90ea..6e8f90f84e49 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -36,6 +36,11 @@ static void sclp_sync_callback(struct sclp_req *req, void *data) } int sclp_sync_request(sclp_cmdw_t cmd, void *sccb) +{ + return sclp_sync_request_timeout(cmd, sccb, 0); +} + +int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) { struct completion completion; struct sclp_req *request; @@ -44,6 +49,8 @@ int sclp_sync_request(sclp_cmdw_t cmd, void *sccb) request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return -ENOMEM; + if (timeout) + request->queue_timeout = timeout; request->command = cmd; request->sccb = sccb; request->status = SCLP_REQ_FILLED; @@ -110,7 +117,8 @@ int sclp_get_cpu_info(struct sclp_cpu_info *info) if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); - rc = sclp_sync_request(SCLP_CMDW_READ_CPU_INFO, sccb); + rc = sclp_sync_request_timeout(SCLP_CMDW_READ_CPU_INFO, sccb, + SCLP_QUEUE_INTERVAL); if (rc) goto out; if (sccb->header.response_code != 0x0010) { @@ -144,7 +152,7 @@ static int do_cpu_configure(sclp_cmdw_t cmd) if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); - rc = sclp_sync_request(cmd, sccb); + rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); if (rc) goto out; switch (sccb->header.response_code) { @@ -214,7 +222,7 @@ static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) return -ENOMEM; sccb->header.length = PAGE_SIZE; sccb->rn = rn; - rc = sclp_sync_request(cmd, sccb); + rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); if (rc) goto out; switch (sccb->header.response_code) { @@ -269,7 +277,8 @@ static int sclp_attach_storage(u8 id) if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; - rc = sclp_sync_request(0x00080001 | id << 8, sccb); + rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, + SCLP_QUEUE_INTERVAL); if (rc) goto out; switch (sccb->header.response_code) { From 072c2790294210419db287995628406e270bf027 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 2 Apr 2014 14:06:10 +0200 Subject: [PATCH 09/13] s390/irq: Add defines for external interruption codes Introduce defines for external interruption codes so that we can get rid of some "magic" numbers in the s390 source code. Signed-off-by: Thomas Huth Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/irq.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 5f8bcc5fe423..58c1ea639452 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -16,6 +16,19 @@ /* This number is used when no interrupt has been assigned */ #define NO_IRQ 0 +/* External interruption codes */ +#define EXT_IRQ_INTERRUPT_KEY 0x0040 +#define EXT_IRQ_CLK_COMP 0x1004 +#define EXT_IRQ_CPU_TIMER 0x1005 +#define EXT_IRQ_WARNING_TRACK 0x1007 +#define EXT_IRQ_MALFUNC_ALERT 0x1200 +#define EXT_IRQ_EMERGENCY_SIG 0x1201 +#define EXT_IRQ_EXTERNAL_CALL 0x1202 +#define EXT_IRQ_TIMING_ALERT 0x1406 +#define EXT_IRQ_MEASURE_ALERT 0x1407 +#define EXT_IRQ_SERVICE_SIG 0x2401 +#define EXT_IRQ_IUCV 0x4000 + #ifndef __ASSEMBLY__ #include From 1dad093b66fdd4fd5d7d2692169dc1bafd794628 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 31 Mar 2014 15:24:08 +0200 Subject: [PATCH 10/13] s390/irq: Use defines for external interruption codes Use the new defines for external interruption codes to get rid of "magic" numbers in the s390 source code. And while we're at it, also rename the (un-)register_external_interrupt function to something shorter so that this patch does not exceed the 80 columns all over the place. Signed-off-by: Thomas Huth Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/irq.h | 5 +++-- arch/s390/kernel/irq.c | 10 +++++----- arch/s390/kernel/perf_cpum_cf.c | 6 ++++-- arch/s390/kernel/perf_cpum_sf.c | 6 ++++-- arch/s390/kernel/runtime_instr.c | 3 ++- arch/s390/kernel/sclp.S | 5 +++-- arch/s390/kernel/smp.c | 4 ++-- arch/s390/kernel/time.c | 6 +++--- arch/s390/mm/fault.c | 4 ++-- arch/s390/oprofile/hwsampler.c | 4 ++-- drivers/s390/block/dasd_diag.c | 4 ++-- drivers/s390/char/sclp.c | 6 +++--- drivers/s390/kvm/kvm_virtio.c | 2 +- net/iucv/iucv.c | 6 +++--- 14 files changed, 39 insertions(+), 32 deletions(-) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 58c1ea639452..763ccdcb70ba 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -27,6 +27,7 @@ #define EXT_IRQ_TIMING_ALERT 0x1406 #define EXT_IRQ_MEASURE_ALERT 0x1407 #define EXT_IRQ_SERVICE_SIG 0x2401 +#define EXT_IRQ_CP_SERVICE 0x2603 #define EXT_IRQ_IUCV 0x4000 #ifndef __ASSEMBLY__ @@ -89,8 +90,8 @@ struct ext_code { typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long); -int register_external_interrupt(u16 code, ext_int_handler_t handler); -int unregister_external_interrupt(u16 code, ext_int_handler_t handler); +int register_external_irq(u16 code, ext_int_handler_t handler); +int unregister_external_irq(u16 code, ext_int_handler_t handler); enum irq_subclass { IRQ_SUBCLASS_MEASUREMENT_ALERT = 5, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index bb27a262c44a..083617d739d8 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -205,7 +205,7 @@ static inline int ext_hash(u16 code) return (code + (code >> 9)) & (ARRAY_SIZE(ext_int_hash) - 1); } -int register_external_interrupt(u16 code, ext_int_handler_t handler) +int register_external_irq(u16 code, ext_int_handler_t handler) { struct ext_int_info *p; unsigned long flags; @@ -223,9 +223,9 @@ int register_external_interrupt(u16 code, ext_int_handler_t handler) spin_unlock_irqrestore(&ext_int_hash_lock, flags); return 0; } -EXPORT_SYMBOL(register_external_interrupt); +EXPORT_SYMBOL(register_external_irq); -int unregister_external_interrupt(u16 code, ext_int_handler_t handler) +int unregister_external_irq(u16 code, ext_int_handler_t handler) { struct ext_int_info *p; unsigned long flags; @@ -241,7 +241,7 @@ int unregister_external_interrupt(u16 code, ext_int_handler_t handler) spin_unlock_irqrestore(&ext_int_hash_lock, flags); return 0; } -EXPORT_SYMBOL(unregister_external_interrupt); +EXPORT_SYMBOL(unregister_external_irq); static irqreturn_t do_ext_interrupt(int irq, void *dummy) { @@ -251,7 +251,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) int index; ext_code = *(struct ext_code *) ®s->int_code; - if (ext_code.code != 0x1004) + if (ext_code.code != EXT_IRQ_CLK_COMP) __get_cpu_var(s390_idle).nohz_delay = 1; index = ext_hash(ext_code.code); diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f51214c04858..ea75d011a6fc 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -673,7 +673,8 @@ static int __init cpumf_pmu_init(void) ctl_clear_bit(0, 48); /* register handler for measurement-alert interruptions */ - rc = register_external_interrupt(0x1407, cpumf_measurement_alert); + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); if (rc) { pr_err("Registering for CPU-measurement alerts " "failed with rc=%i\n", rc); @@ -684,7 +685,8 @@ static int __init cpumf_pmu_init(void) rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", PERF_TYPE_RAW); if (rc) { pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); - unregister_external_interrupt(0x1407, cpumf_measurement_alert); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); goto out; } perf_cpu_notifier(cpumf_pmu_notifier); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 6c0d29827cb6..ea0c7b2ef030 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1621,7 +1621,8 @@ static int __init init_cpum_sampling_pmu(void) pr_err("Registering for s390dbf failed\n"); debug_register_view(sfdbg, &debug_sprintf_view); - err = register_external_interrupt(0x1407, cpumf_measurement_alert); + err = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); if (err) { pr_cpumsf_err(RS_INIT_FAILURE_ALRT); goto out; @@ -1630,7 +1631,8 @@ static int __init init_cpum_sampling_pmu(void) err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW); if (err) { pr_cpumsf_err(RS_INIT_FAILURE_PERF); - unregister_external_interrupt(0x1407, cpumf_measurement_alert); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); goto out; } perf_cpu_notifier(cpumf_pmu_notifier); diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index d817cce7e72d..26b4ae96fdd7 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -138,7 +138,8 @@ static int __init runtime_instr_init(void) return 0; irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - rc = register_external_interrupt(0x1407, runtime_instr_int_handler); + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + runtime_instr_int_handler); if (rc) irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); else diff --git a/arch/s390/kernel/sclp.S b/arch/s390/kernel/sclp.S index 29bd7bec4176..a41f2c99dcc8 100644 --- a/arch/s390/kernel/sclp.S +++ b/arch/s390/kernel/sclp.S @@ -9,6 +9,7 @@ */ #include +#include LC_EXT_NEW_PSW = 0x58 # addr of ext int handler LC_EXT_NEW_PSW_64 = 0x1b0 # addr of ext int handler 64 bit @@ -73,9 +74,9 @@ _sclp_wait_int: lpsw .LwaitpswS1-.LbaseS1(%r13) # wait until interrupt .LwaitS1: lh %r7,LC_EXT_INT_CODE - chi %r7,0x1004 # timeout? + chi %r7,EXT_IRQ_CLK_COMP # timeout? je .LtimeoutS1 - chi %r7,0x2401 # service int? + chi %r7,EXT_IRQ_SERVICE_SIG # service int? jne .LloopS1 sr %r2,%r2 l %r3,LC_EXT_INT_PARAM diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 8827883310dd..366d14460c2b 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -785,10 +785,10 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { /* request the 0x1201 emergency signal external interrupt */ - if (register_external_interrupt(0x1201, do_ext_call_interrupt) != 0) + if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); /* request the 0x1202 external call external interrupt */ - if (register_external_interrupt(0x1202, do_ext_call_interrupt) != 0) + if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); smp_detect_cpus(); } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index dd95f1631621..386d37a228bb 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -262,11 +262,11 @@ void __init time_init(void) stp_reset(); /* request the clock comparator external interrupt */ - if (register_external_interrupt(0x1004, clock_comparator_interrupt)) - panic("Couldn't request external interrupt 0x1004"); + if (register_external_irq(EXT_IRQ_CLK_COMP, clock_comparator_interrupt)) + panic("Couldn't request external interrupt 0x1004"); /* request the timing alert external interrupt */ - if (register_external_interrupt(0x1406, timing_alert_interrupt)) + if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt)) panic("Couldn't request external interrupt 0x1406"); if (clocksource_register(&clocksource_tod) != 0) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d95265b2719f..750565f72e06 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -627,7 +627,7 @@ static int __init pfault_irq_init(void) { int rc; - rc = register_external_interrupt(0x2603, pfault_interrupt); + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); if (rc) goto out_extint; rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; @@ -638,7 +638,7 @@ static int __init pfault_irq_init(void) return 0; out_pfault: - unregister_external_interrupt(0x2603, pfault_interrupt); + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); out_extint: pfault_disable = 1; return rc; diff --git a/arch/s390/oprofile/hwsampler.c b/arch/s390/oprofile/hwsampler.c index a32c96761eab..276f2e26c761 100644 --- a/arch/s390/oprofile/hwsampler.c +++ b/arch/s390/oprofile/hwsampler.c @@ -1033,7 +1033,7 @@ int hwsampler_setup(void) max_sampler_rate = cb->qsi.max_sampl_rate; } } - register_external_interrupt(0x1407, hws_ext_handler); + register_external_irq(EXT_IRQ_MEASURE_ALERT, hws_ext_handler); hws_state = HWS_DEALLOCATED; rc = 0; @@ -1068,7 +1068,7 @@ int hwsampler_shutdown(void) hws_wq = NULL; } - unregister_external_interrupt(0x1407, hws_ext_handler); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, hws_ext_handler); hws_state = HWS_INIT; rc = 0; } diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 9cbc567698ce..c062f1620c58 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -646,7 +646,7 @@ dasd_diag_init(void) ASCEBC(dasd_diag_discipline.ebcname, 4); irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - register_external_interrupt(0x2603, dasd_ext_handler); + register_external_irq(EXT_IRQ_CP_SERVICE, dasd_ext_handler); dasd_diag_discipline_pointer = &dasd_diag_discipline; return 0; } @@ -654,7 +654,7 @@ dasd_diag_init(void) static void __exit dasd_diag_cleanup(void) { - unregister_external_interrupt(0x2603, dasd_ext_handler); + unregister_external_irq(EXT_IRQ_CP_SERVICE, dasd_ext_handler); irq_subclass_unregister(IRQ_SUBCLASS_SERVICE_SIGNAL); dasd_diag_discipline_pointer = NULL; } diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index e9ede5980994..1990285296c6 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -972,7 +972,7 @@ sclp_check_interface(void) spin_lock_irqsave(&sclp_lock, flags); /* Prepare init mask command */ - rc = register_external_interrupt(0x2401, sclp_check_handler); + rc = register_external_irq(EXT_IRQ_SERVICE_SIG, sclp_check_handler); if (rc) { spin_unlock_irqrestore(&sclp_lock, flags); return rc; @@ -1005,7 +1005,7 @@ sclp_check_interface(void) } else rc = -EBUSY; } - unregister_external_interrupt(0x2401, sclp_check_handler); + unregister_external_irq(EXT_IRQ_SERVICE_SIG, sclp_check_handler); spin_unlock_irqrestore(&sclp_lock, flags); return rc; } @@ -1206,7 +1206,7 @@ sclp_init(void) if (rc) goto fail_init_state_uninitialized; /* Register interrupt handler */ - rc = register_external_interrupt(0x2401, sclp_interrupt_handler); + rc = register_external_irq(EXT_IRQ_SERVICE_SIG, sclp_interrupt_handler); if (rc) goto fail_unregister_reboot_notifier; sclp_init_state = sclp_init_state_initialized; diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 1abd0db29915..a1349653c6d9 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -477,7 +477,7 @@ static int __init kvm_devices_init(void) INIT_WORK(&hotplug_work, hotplug_devices); irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - register_external_interrupt(0x2603, kvm_extint_handler); + register_external_irq(EXT_IRQ_CP_SERVICE, kvm_extint_handler); scan_devices(); return 0; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index cd5b8ec9be04..12afba10a61f 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -2016,7 +2016,7 @@ static int __init iucv_init(void) rc = iucv_query_maxconn(); if (rc) goto out_ctl; - rc = register_external_interrupt(0x4000, iucv_external_interrupt); + rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); if (rc) goto out_ctl; iucv_root = root_device_register("iucv"); @@ -2081,7 +2081,7 @@ out_free: } root_device_unregister(iucv_root); out_int: - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); out_ctl: ctl_clear_bit(0, 1); out: @@ -2116,7 +2116,7 @@ static void __exit iucv_exit(void) } root_device_unregister(iucv_root); bus_unregister(&iucv_bus); - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); } subsys_initcall(iucv_init); From 02a8f3abb708919149cb657a5202f4603f0c38e2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 3 Apr 2014 13:54:59 +0200 Subject: [PATCH 11/13] s390/mm,tlb: safeguard against speculative TLB creation The principles of operations states that the CPU is allowed to create TLB entries for an address space anytime while an ASCE is loaded to the control register. This is true even if the CPU is running in the kernel and the user address space is not (actively) accessed. In theory this can affect two aspects of the TLB flush logic. For full-mm flushes the ASCE of the dying process is still attached. The approach to flush first with IDTE and then just free all page tables can in theory lead to stale TLB entries. Use the batched free of page tables for the full-mm flushes as well. For operations that can have a stale ASCE in the control register, e.g. a delayed update_user_asce in switch_mm, load the kernel ASCE to prevent invalid TLBs from being created. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 17 +++++++++++++---- arch/s390/include/asm/tlb.h | 14 +++----------- arch/s390/mm/pgtable.c | 8 +++++--- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 38149b63dc44..7abf318b1522 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -35,7 +35,7 @@ static inline int init_new_context(struct task_struct *tsk, #define LCTL_OPCODE "lctlg" #endif -static inline void update_mm(struct mm_struct *mm, struct task_struct *tsk) +static inline void update_user_asce(struct mm_struct *mm) { pgd_t *pgd = mm->pgd; @@ -45,6 +45,13 @@ static inline void update_mm(struct mm_struct *mm, struct task_struct *tsk) set_fs(current->thread.mm_segment); } +static inline void clear_user_asce(struct mm_struct *mm) +{ + S390_lowcore.user_asce = S390_lowcore.kernel_asce; + asm volatile(LCTL_OPCODE" 1,1,%0\n" : : "m" (S390_lowcore.user_asce)); + asm volatile(LCTL_OPCODE" 7,7,%0\n" : : "m" (S390_lowcore.user_asce)); +} + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -53,11 +60,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev == next) return; if (atomic_inc_return(&next->context.attach_count) >> 16) { - /* Delay update_mm until all TLB flushes are done. */ + /* Delay update_user_asce until all TLB flushes are done. */ set_tsk_thread_flag(tsk, TIF_TLB_WAIT); + /* Clear old ASCE by loading the kernel ASCE. */ + clear_user_asce(next); } else { cpumask_set_cpu(cpu, mm_cpumask(next)); - update_mm(next, tsk); + update_user_asce(next); if (next->context.flush_mm) /* Flush pending TLBs */ __tlb_flush_mm(next); @@ -80,7 +89,7 @@ static inline void finish_arch_post_lock_switch(void) cpu_relax(); cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - update_mm(mm, tsk); + update_user_asce(mm); if (mm->context.flush_mm) __tlb_flush_mm(mm); preempt_enable(); diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 2cb846c4b37f..c544b6f05d95 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -57,8 +57,6 @@ static inline void tlb_gather_mmu(struct mmu_gather *tlb, tlb->end = end; tlb->fullmm = !(start | (end+1)); tlb->batch = NULL; - if (tlb->fullmm) - __tlb_flush_mm(mm); } static inline void tlb_flush_mmu(struct mmu_gather *tlb) @@ -96,9 +94,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long address) { - if (!tlb->fullmm) - return page_table_free_rcu(tlb, (unsigned long *) pte); - page_table_free(tlb->mm, (unsigned long *) pte); + page_table_free_rcu(tlb, (unsigned long *) pte); } /* @@ -114,9 +110,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, #ifdef CONFIG_64BIT if (tlb->mm->context.asce_limit <= (1UL << 31)) return; - if (!tlb->fullmm) - return tlb_remove_table(tlb, pmd); - crst_table_free(tlb->mm, (unsigned long *) pmd); + tlb_remove_table(tlb, pmd); #endif } @@ -133,9 +127,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #ifdef CONFIG_64BIT if (tlb->mm->context.asce_limit <= (1UL << 42)) return; - if (!tlb->fullmm) - return tlb_remove_table(tlb, pud); - crst_table_free(tlb->mm, (unsigned long *) pud); + tlb_remove_table(tlb, pud); #endif } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 796c9320c709..24c62900b532 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -54,7 +54,7 @@ static void __crst_table_upgrade(void *arg) struct mm_struct *mm = arg; if (current->active_mm == mm) - update_mm(mm, current); + update_user_asce(mm); __tlb_flush_local(); } @@ -107,8 +107,10 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) { pgd_t *pgd; - if (current->active_mm == mm) + if (current->active_mm == mm) { + clear_user_asce(mm); __tlb_flush_mm(mm); + } while (mm->context.asce_limit > limit) { pgd = mm->pgd; switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { @@ -132,7 +134,7 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) crst_table_free(mm, (unsigned long *) pgd); } if (current->active_mm == mm) - update_mm(mm, current); + update_user_asce(mm); } #endif From 1b948d6caec4f28e3524244ca0f77c6ae8ddceef Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 3 Apr 2014 13:55:01 +0200 Subject: [PATCH 12/13] s390/mm,tlb: optimize TLB flushing for zEC12 The zEC12 machines introduced the local-clearing control for the IDTE and IPTE instruction. If the control is set only the TLB of the local CPU is cleared of entries, either all entries of a single address space for IDTE, or the entry for a single page-table entry for IPTE. Without the local-clearing control the TLB flush is broadcasted to all CPUs in the configuration, which is expensive. The reset of the bit mask of the CPUs that need flushing after a non-local IDTE is tricky. As TLB entries for an address space remain in the TLB even if the address space is detached a new bit field is required to keep track of attached CPUs vs. CPUs in the need of a flush. After a non-local flush with IDTE the bit-field of attached CPUs is copied to the bit-field of CPUs in need of a flush. The ordering of operations on cpu_attach_mask, attach_count and mm_cpumask(mm) is such that an underindication in mm_cpumask(mm) is prevented but an overindication in mm_cpumask(mm) is possible. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu.h | 2 + arch/s390/include/asm/mmu_context.h | 5 ++ arch/s390/include/asm/pgtable.h | 128 ++++++++++++++++++++-------- arch/s390/include/asm/setup.h | 3 + arch/s390/include/asm/tlbflush.h | 119 +++++++++++++++++++++----- arch/s390/kernel/early.c | 2 + arch/s390/kernel/smp.c | 6 ++ arch/s390/mm/hugetlbpage.c | 5 +- arch/s390/mm/init.c | 7 +- arch/s390/mm/pgtable.c | 4 +- arch/s390/mm/vmem.c | 2 - 11 files changed, 215 insertions(+), 68 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index ff132ac64ddd..f77695a82f64 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -1,9 +1,11 @@ #ifndef __MMU_H #define __MMU_H +#include #include typedef struct { + cpumask_t cpu_attach_mask; atomic_t attach_count; unsigned int flush_mm; spinlock_t list_lock; diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 7abf318b1522..71a258839039 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,6 +15,7 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.attach_count, 0); mm->context.flush_mm = 0; mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; @@ -59,6 +60,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev == next) return; + if (MACHINE_HAS_TLB_LC) + cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); if (atomic_inc_return(&next->context.attach_count) >> 16) { /* Delay update_user_asce until all TLB flushes are done. */ set_tsk_thread_flag(tsk, TIF_TLB_WAIT); @@ -73,6 +76,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, } atomic_dec(&prev->context.attach_count); WARN_ON(atomic_read(&prev->context.attach_count) < 0); + if (MACHINE_HAS_TLB_LC) + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } #define finish_arch_post_lock_switch finish_arch_post_lock_switch diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1ab75eaacbd4..66d51834f2cb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1068,12 +1068,35 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep) : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address)); } +static inline void __ptep_ipte_local(unsigned long address, pte_t *ptep) +{ + unsigned long pto = (unsigned long) ptep; + +#ifndef CONFIG_64BIT + /* pto in ESA mode must point to the start of the segment table */ + pto &= 0x7ffffc00; +#endif + /* Invalidation + local TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%2,%3,0,1" + : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address)); +} + static inline void ptep_flush_direct(struct mm_struct *mm, unsigned long address, pte_t *ptep) { + int active, count; + if (pte_val(*ptep) & _PAGE_INVALID) return; - __ptep_ipte(address, ptep); + active = (mm == current->active_mm) ? 1 : 0; + count = atomic_add_return(0x10000, &mm->context.attach_count); + if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_ipte_local(address, ptep); + else + __ptep_ipte(address, ptep); + atomic_sub(0x10000, &mm->context.attach_count); } static inline void ptep_flush_lazy(struct mm_struct *mm, @@ -1382,35 +1405,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -static inline void __pmd_idte(unsigned long address, pmd_t *pmdp) -{ - unsigned long sto = (unsigned long) pmdp - - pmd_index(address) * sizeof(pmd_t); - - if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) { - asm volatile( - " .insn rrf,0xb98e0000,%2,%3,0,0" - : "=m" (*pmdp) - : "m" (*pmdp), "a" (sto), - "a" ((address & HPAGE_MASK)) - : "cc" - ); - } -} - -static inline void __pmd_csp(pmd_t *pmdp) -{ - register unsigned long reg2 asm("2") = pmd_val(*pmdp); - register unsigned long reg3 asm("3") = pmd_val(*pmdp) | - _SEGMENT_ENTRY_INVALID; - register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5; - - asm volatile( - " csp %1,%3" - : "=m" (*pmdp) - : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); -} - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) { @@ -1479,18 +1473,80 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ +static inline void __pmdp_csp(pmd_t *pmdp) +{ + register unsigned long reg2 asm("2") = pmd_val(*pmdp); + register unsigned long reg3 asm("3") = pmd_val(*pmdp) | + _SEGMENT_ENTRY_INVALID; + register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5; + + asm volatile( + " csp %1,%3" + : "=m" (*pmdp) + : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); +} + +static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp) +{ + unsigned long sto; + + sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); + asm volatile( + " .insn rrf,0xb98e0000,%2,%3,0,0" + : "=m" (*pmdp) + : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK)) + : "cc" ); +} + +static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp) +{ + unsigned long sto; + + sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); + asm volatile( + " .insn rrf,0xb98e0000,%2,%3,0,1" + : "=m" (*pmdp) + : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK)) + : "cc" ); +} + +static inline void pmdp_flush_direct(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + int active, count; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return; + if (!MACHINE_HAS_IDTE) { + __pmdp_csp(pmdp); + return; + } + active = (mm == current->active_mm) ? 1 : 0; + count = atomic_add_return(0x10000, &mm->context.attach_count); + if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __pmdp_idte_local(address, pmdp); + else + __pmdp_idte(address, pmdp); + atomic_sub(0x10000, &mm->context.attach_count); +} + static inline void pmdp_flush_lazy(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { int active, count; + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return; active = (mm == current->active_mm) ? 1 : 0; count = atomic_add_return(0x10000, &mm->context.attach_count); if ((count & 0xffff) <= active) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else - __pmd_idte(address, pmdp); + } else if (MACHINE_HAS_IDTE) + __pmdp_idte(address, pmdp); + else + __pmdp_csp(pmdp); atomic_sub(0x10000, &mm->context.attach_count); } @@ -1543,7 +1599,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, pmd_t pmd; pmd = *pmdp; - __pmd_idte(address, pmdp); + pmdp_flush_direct(vma->vm_mm, address, pmdp); *pmdp = pmd_mkold(pmd); return pmd_young(pmd); } @@ -1554,7 +1610,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, { pmd_t pmd = *pmdp; - __pmd_idte(address, pmdp); + pmdp_flush_direct(mm, address, pmdp); pmd_clear(pmdp); return pmd; } @@ -1570,7 +1626,7 @@ static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma, static inline void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - __pmd_idte(address, pmdp); + pmdp_flush_direct(vma->vm_mm, address, pmdp); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -1580,7 +1636,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, pmd_t pmd = *pmdp; if (pmd_write(pmd)) { - __pmd_idte(address, pmdp); + pmdp_flush_direct(mm, address, pmdp); set_pmd_at(mm, address, pmdp, pmd_wrprotect(pmd)); } } diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 406f3a1e63ef..b31b22dba948 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -68,6 +68,7 @@ void create_mem_hole(struct mem_chunk mem_chunk[], unsigned long addr, #define MACHINE_FLAG_TOPOLOGY (1UL << 14) #define MACHINE_FLAG_TE (1UL << 15) #define MACHINE_FLAG_RRBM (1UL << 16) +#define MACHINE_FLAG_TLB_LC (1UL << 17) #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -90,6 +91,7 @@ void create_mem_hole(struct mem_chunk mem_chunk[], unsigned long addr, #define MACHINE_HAS_TOPOLOGY (0) #define MACHINE_HAS_TE (0) #define MACHINE_HAS_RRBM (0) +#define MACHINE_HAS_TLB_LC (0) #else /* CONFIG_64BIT */ #define MACHINE_HAS_IEEE (1) #define MACHINE_HAS_CSP (1) @@ -102,6 +104,7 @@ void create_mem_hole(struct mem_chunk mem_chunk[], unsigned long addr, #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_RRBM (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM) +#define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #endif /* CONFIG_64BIT */ /* diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index f9fef0425fee..16c9c88658c8 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -7,19 +7,41 @@ #include /* - * Flush all tlb entries on the local cpu. + * Flush all TLB entries on the local CPU. */ static inline void __tlb_flush_local(void) { asm volatile("ptlb" : : : "memory"); } -#ifdef CONFIG_SMP /* - * Flush all tlb entries on all cpus. + * Flush TLB entries for a specific ASCE on all CPUs */ +static inline void __tlb_flush_idte(unsigned long asce) +{ + /* Global TLB flush for the mm */ + asm volatile( + " .insn rrf,0xb98e0000,0,%0,%1,0" + : : "a" (2048), "a" (asce) : "cc"); +} + +/* + * Flush TLB entries for a specific ASCE on the local CPU + */ +static inline void __tlb_flush_idte_local(unsigned long asce) +{ + /* Local TLB flush for the mm */ + asm volatile( + " .insn rrf,0xb98e0000,0,%0,%1,1" + : : "a" (2048), "a" (asce) : "cc"); +} + +#ifdef CONFIG_SMP void smp_ptlb_all(void); +/* + * Flush all TLB entries on all CPUs. + */ static inline void __tlb_flush_global(void) { register unsigned long reg2 asm("2"); @@ -42,36 +64,89 @@ static inline void __tlb_flush_global(void) : : "d" (reg2), "d" (reg3), "d" (reg4), "m" (dummy) : "cc" ); } +/* + * Flush TLB entries for a specific mm on all CPUs (in case gmap is used + * this implicates multiple ASCEs!). + */ static inline void __tlb_flush_full(struct mm_struct *mm) { - cpumask_t local_cpumask; - preempt_disable(); - /* - * If the process only ran on the local cpu, do a local flush. - */ - cpumask_copy(&local_cpumask, cpumask_of(smp_processor_id())); - if (cpumask_equal(mm_cpumask(mm), &local_cpumask)) + atomic_add(0x10000, &mm->context.attach_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + /* Local TLB flush */ __tlb_flush_local(); - else + } else { + /* Global TLB flush */ __tlb_flush_global(); + /* Reset TLB flush mask */ + if (MACHINE_HAS_TLB_LC) + cpumask_copy(mm_cpumask(mm), + &mm->context.cpu_attach_mask); + } + atomic_sub(0x10000, &mm->context.attach_count); preempt_enable(); } -#else -#define __tlb_flush_full(mm) __tlb_flush_local() -#define __tlb_flush_global() __tlb_flush_local() -#endif /* - * Flush all tlb entries of a page table on all cpus. + * Flush TLB entries for a specific ASCE on all CPUs. */ -static inline void __tlb_flush_idte(unsigned long asce) +static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) { - asm volatile( - " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc" ); + int active, count; + + preempt_disable(); + active = (mm == current->active_mm) ? 1 : 0; + count = atomic_add_return(0x10000, &mm->context.attach_count); + if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + __tlb_flush_idte_local(asce); + } else { + if (MACHINE_HAS_IDTE) + __tlb_flush_idte(asce); + else + __tlb_flush_global(); + /* Reset TLB flush mask */ + if (MACHINE_HAS_TLB_LC) + cpumask_copy(mm_cpumask(mm), + &mm->context.cpu_attach_mask); + } + atomic_sub(0x10000, &mm->context.attach_count); + preempt_enable(); } +static inline void __tlb_flush_kernel(void) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_idte((unsigned long) init_mm.pgd | + init_mm.context.asce_bits); + else + __tlb_flush_global(); +} +#else +#define __tlb_flush_global() __tlb_flush_local() +#define __tlb_flush_full(mm) __tlb_flush_local() + +/* + * Flush TLB entries for a specific ASCE on all CPUs. + */ +static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) +{ + if (MACHINE_HAS_TLB_LC) + __tlb_flush_idte_local(asce); + else + __tlb_flush_local(); +} + +static inline void __tlb_flush_kernel(void) +{ + if (MACHINE_HAS_TLB_LC) + __tlb_flush_idte_local((unsigned long) init_mm.pgd | + init_mm.context.asce_bits); + else + __tlb_flush_local(); +} +#endif + static inline void __tlb_flush_mm(struct mm_struct * mm) { /* @@ -80,7 +155,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm) * only ran on the local cpu. */ if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list)) - __tlb_flush_idte((unsigned long) mm->pgd | + __tlb_flush_asce(mm, (unsigned long) mm->pgd | mm->context.asce_bits); else __tlb_flush_full(mm); @@ -130,7 +205,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - __tlb_flush_mm(&init_mm); + __tlb_flush_kernel(); } #endif /* _S390_TLBFLUSH_H */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 6b594439cca5..a734f3585ceb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -386,6 +386,8 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_TE; if (test_facility(66)) S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM; + if (test_facility(51)) + S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; #endif } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 366d14460c2b..42a501d13a3b 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -236,6 +236,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { struct _lowcore *lc = pcpu->lowcore; + if (MACHINE_HAS_TLB_LC) + cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); atomic_inc(&init_mm.context.attach_count); lc->cpu_nr = cpu; lc->percpu_offset = __per_cpu_offset[cpu]; @@ -760,6 +763,9 @@ void __cpu_die(unsigned int cpu) cpu_relax(); pcpu_free_lowcore(pcpu); atomic_dec(&init_mm.context.attach_count); + cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); + if (MACHINE_HAS_TLB_LC) + cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); } void __noreturn cpu_die(void) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d261c62e40a6..0727a55d87d9 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -123,10 +123,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, pmd_t *pmdp = (pmd_t *) ptep; pte_t pte = huge_ptep_get(ptep); - if (MACHINE_HAS_IDTE) - __pmd_idte(addr, pmdp); - else - __pmd_csp(pmdp); + pmdp_flush_direct(mm, addr, pmdp); pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; return pte; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ad446b0c55b6..0c1073ed1e84 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -124,8 +124,6 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 13, 13); arch_local_irq_restore(4UL << (BITS_PER_LONG - 8)); - atomic_set(&init_mm.context.attach_count, 1); - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); @@ -136,6 +134,11 @@ void __init paging_init(void) void __init mem_init(void) { + if (MACHINE_HAS_TLB_LC) + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, mm_cpumask(&init_mm)); + atomic_set(&init_mm.context.attach_count, 1); + max_mapnr = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 24c62900b532..c57c63380184 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -200,7 +200,7 @@ static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) static void gmap_flush_tlb(struct gmap *gmap) { if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | + __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | _ASCE_TYPE_REGION1); else __tlb_flush_global(); @@ -219,7 +219,7 @@ void gmap_free(struct gmap *gmap) /* Flush tlb. */ if (MACHINE_HAS_IDTE) - __tlb_flush_idte((unsigned long) gmap->table | + __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | _ASCE_TYPE_REGION1); else __tlb_flush_global(); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index bcfb70b60be6..72b04de18283 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -138,7 +138,6 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) } ret = 0; out: - flush_tlb_kernel_range(start, end); return ret; } @@ -265,7 +264,6 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) memset((void *)start, 0, end - start); ret = 0; out: - flush_tlb_kernel_range(start, end); return ret; } From 457f2180951cdcbfb4657ddcc83b486e93497f56 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 21 Mar 2014 10:42:25 +0100 Subject: [PATCH 13/13] s390/uaccess: rework uaccess code - fix locking issues The current uaccess code uses a page table walk in some circumstances, e.g. in case of the in atomic futex operations or if running on old hardware which doesn't support the mvcos instruction. However it turned out that the page table walk code does not correctly lock page tables when accessing page table entries. In other words: a different cpu may invalidate a page table entry while the current cpu inspects the pte. This may lead to random data corruption. Adding correct locking however isn't trivial for all uaccess operations. Especially copy_in_user() is problematic since that requires to hold at least two locks, but must be protected against ABBA deadlock when a different cpu also performs a copy_in_user() operation. So the solution is a different approach where we change address spaces: User space runs in primary address mode, or access register mode within vdso code, like it currently already does. The kernel usually also runs in home space mode, however when accessing user space the kernel switches to primary or secondary address mode if the mvcos instruction is not available or if a compare-and-swap (futex) instruction on a user space address is performed. KVM however is special, since that requires the kernel to run in home address space while implicitly accessing user space with the sie instruction. So we end up with: User space: - runs in primary or access register mode - cr1 contains the user asce - cr7 contains the user asce - cr13 contains the kernel asce Kernel space: - runs in home space mode - cr1 contains the user or kernel asce -> the kernel asce is loaded when a uaccess requires primary or secondary address mode - cr7 contains the user or kernel asce, (changed with set_fs()) - cr13 contains the kernel asce In case of uaccess the kernel changes to: - primary space mode in case of a uaccess (copy_to_user) and uses e.g. the mvcp instruction to access user space. However the kernel will stay in home space mode if the mvcos instruction is available - secondary space mode in case of futex atomic operations, so that the instructions come from primary address space and data from secondary space In case of kvm the kernel runs in home space mode, but cr1 gets switched to contain the gmap asce before the sie instruction gets executed. When the sie instruction is finished cr1 will be switched back to contain the user asce. A context switch between two processes will always load the kernel asce for the next process in cr1. So the first exit to user space is a bit more expensive (one extra load control register instruction) than before, however keeps the code rather simple. In sum this means there is no need to perform any error prone page table walks anymore when accessing user space. The patch seems to be rather large, however it mainly removes the the page table walk code and restores the previously deleted "standard" uaccess code, with a couple of changes. The uaccess without mvcos mode can be enforced with the "uaccess_primary" kernel parameter. Reported-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/futex.h | 66 +++- arch/s390/include/asm/mmu_context.h | 37 ++- arch/s390/include/asm/switch_to.h | 1 + arch/s390/include/asm/thread_info.h | 2 + arch/s390/include/asm/uaccess.h | 2 - arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/entry.S | 24 +- arch/s390/kernel/entry64.S | 24 +- arch/s390/lib/Makefile | 2 +- arch/s390/lib/uaccess.c | 407 ++++++++++++++++++++++++ arch/s390/lib/uaccess.h | 16 - arch/s390/lib/uaccess_mvcos.c | 263 ---------------- arch/s390/lib/uaccess_pt.c | 471 ---------------------------- arch/s390/mm/fault.c | 49 +-- arch/s390/mm/pgtable.c | 6 +- 15 files changed, 556 insertions(+), 815 deletions(-) create mode 100644 arch/s390/lib/uaccess.c delete mode 100644 arch/s390/lib/uaccess.h delete mode 100644 arch/s390/lib/uaccess_mvcos.c delete mode 100644 arch/s390/lib/uaccess_pt.c diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index fda46bd38c99..69cf5b5eddc9 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -1,12 +1,25 @@ #ifndef _ASM_S390_FUTEX_H #define _ASM_S390_FUTEX_H -#include #include +#include +#include #include -int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval); -int __futex_atomic_op_inuser(int op, u32 __user *uaddr, int oparg, int *old); +#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ + asm volatile( \ + " sacf 256\n" \ + "0: l %1,0(%6)\n" \ + "1:"insn \ + "2: cs %1,%2,0(%6)\n" \ + "3: jl 1b\n" \ + " lhi %0,0\n" \ + "4: sacf 768\n" \ + EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ + "=m" (*uaddr) \ + : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ + "m" (*uaddr) : "cc"); static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { @@ -14,13 +27,37 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; - int oldval, ret; + int oldval = 0, newval, ret; + update_primary_asce(current); if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; pagefault_disable(); - ret = __futex_atomic_op_inuser(op, uaddr, oparg, &oldval); + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op("lr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op("lr %2,%1\nar %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op("lr %2,%1\nor %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op("lr %2,%1\nnr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op("lr %2,%1\nxr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } pagefault_enable(); if (!ret) { @@ -37,4 +74,23 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) return ret; } +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + int ret; + + update_primary_asce(current); + asm volatile( + " sacf 256\n" + "0: cs %1,%4,0(%5)\n" + "1: la %0,0\n" + "2: sacf 768\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "=d" (ret), "+d" (oldval), "=m" (*uaddr) + : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) + : "cc", "memory"); + *uval = oldval; + return ret; +} + #endif /* _ASM_S390_FUTEX_H */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 71a258839039..71be346d0e3c 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -30,27 +30,33 @@ static inline int init_new_context(struct task_struct *tsk, #define destroy_context(mm) do { } while (0) -#ifndef CONFIG_64BIT -#define LCTL_OPCODE "lctl" -#else -#define LCTL_OPCODE "lctlg" -#endif - -static inline void update_user_asce(struct mm_struct *mm) +static inline void update_user_asce(struct mm_struct *mm, int load_primary) { pgd_t *pgd = mm->pgd; S390_lowcore.user_asce = mm->context.asce_bits | __pa(pgd); - /* Load primary space page table origin. */ - asm volatile(LCTL_OPCODE" 1,1,%0\n" : : "m" (S390_lowcore.user_asce)); + if (load_primary) + __ctl_load(S390_lowcore.user_asce, 1, 1); set_fs(current->thread.mm_segment); } -static inline void clear_user_asce(struct mm_struct *mm) +static inline void clear_user_asce(struct mm_struct *mm, int load_primary) { S390_lowcore.user_asce = S390_lowcore.kernel_asce; - asm volatile(LCTL_OPCODE" 1,1,%0\n" : : "m" (S390_lowcore.user_asce)); - asm volatile(LCTL_OPCODE" 7,7,%0\n" : : "m" (S390_lowcore.user_asce)); + + if (load_primary) + __ctl_load(S390_lowcore.user_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); +} + +static inline void update_primary_asce(struct task_struct *tsk) +{ + unsigned long asce; + + __ctl_store(asce, 1, 1); + if (asce != S390_lowcore.kernel_asce) + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + set_tsk_thread_flag(tsk, TIF_ASCE); } static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, @@ -58,6 +64,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, { int cpu = smp_processor_id(); + update_primary_asce(tsk); if (prev == next) return; if (MACHINE_HAS_TLB_LC) @@ -66,10 +73,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Delay update_user_asce until all TLB flushes are done. */ set_tsk_thread_flag(tsk, TIF_TLB_WAIT); /* Clear old ASCE by loading the kernel ASCE. */ - clear_user_asce(next); + clear_user_asce(next, 0); } else { cpumask_set_cpu(cpu, mm_cpumask(next)); - update_user_asce(next); + update_user_asce(next, 0); if (next->context.flush_mm) /* Flush pending TLBs */ __tlb_flush_mm(next); @@ -94,7 +101,7 @@ static inline void finish_arch_post_lock_switch(void) cpu_relax(); cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - update_user_asce(mm); + update_user_asce(mm, 0); if (mm->context.flush_mm) __tlb_flush_mm(mm); preempt_enable(); diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index 29c81f82705e..e759181357fc 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -132,6 +132,7 @@ static inline void restore_access_regs(unsigned int *acrs) update_cr_regs(next); \ } \ prev = __switch_to(prev,next); \ + update_primary_asce(current); \ } while (0) #define finish_arch_switch(prev) do { \ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 3ccd71b90345..50630e6a35de 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -82,6 +82,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_TLB_WAIT 4 /* wait for TLB flush completion */ +#define TIF_ASCE 5 /* primary asce needs fixup / uaccess */ #define TIF_PER_TRAP 6 /* deliver sigtrap on return to user */ #define TIF_MCCK_PENDING 7 /* machine check handling is pending */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ @@ -99,6 +100,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1< +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_64BIT +#define AHI "ahi" +#define ALR "alr" +#define CLR "clr" +#define LHI "lhi" +#define SLR "slr" +#else +#define AHI "aghi" +#define ALR "algr" +#define CLR "clgr" +#define LHI "lghi" +#define SLR "slgr" +#endif + +static struct static_key have_mvcos = STATIC_KEY_INIT_FALSE; + +static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, + unsigned long size) +{ + register unsigned long reg0 asm("0") = 0x81UL; + unsigned long tmp1, tmp2; + + tmp1 = -4096UL; + asm volatile( + "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" + "9: jz 7f\n" + "1:"ALR" %0,%3\n" + " "SLR" %1,%3\n" + " "SLR" %2,%3\n" + " j 0b\n" + "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ + " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ + " "SLR" %4,%1\n" + " "CLR" %0,%4\n" /* copy crosses next page boundary? */ + " jnh 4f\n" + "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n" + "10:"SLR" %0,%4\n" + " "ALR" %2,%4\n" + "4:"LHI" %4,-1\n" + " "ALR" %4,%0\n" /* copy remaining size, subtract 1 */ + " bras %3,6f\n" /* memset loop */ + " xc 0(1,%2),0(%2)\n" + "5: xc 0(256,%2),0(%2)\n" + " la %2,256(%2)\n" + "6:"AHI" %4,-256\n" + " jnm 5b\n" + " ex %4,0(%3)\n" + " j 8f\n" + "7:"SLR" %0,%0\n" + "8:\n" + EX_TABLE(0b,2b) EX_TABLE(3b,4b) EX_TABLE(9b,2b) EX_TABLE(10b,4b) + : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) + : "d" (reg0) : "cc", "memory"); + return size; +} + +static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, + unsigned long size) +{ + unsigned long tmp1, tmp2; + + update_primary_asce(current); + tmp1 = -256UL; + asm volatile( + " sacf 0\n" + "0: mvcp 0(%0,%2),0(%1),%3\n" + "10:jz 8f\n" + "1:"ALR" %0,%3\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + "2: mvcp 0(%0,%2),0(%1),%3\n" + "11:jnz 1b\n" + " j 8f\n" + "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ + " "LHI" %3,-4096\n" + " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ + " "SLR" %4,%1\n" + " "CLR" %0,%4\n" /* copy crosses next page boundary? */ + " jnh 5f\n" + "4: mvcp 0(%4,%2),0(%1),%3\n" + "12:"SLR" %0,%4\n" + " "ALR" %2,%4\n" + "5:"LHI" %4,-1\n" + " "ALR" %4,%0\n" /* copy remaining size, subtract 1 */ + " bras %3,7f\n" /* memset loop */ + " xc 0(1,%2),0(%2)\n" + "6: xc 0(256,%2),0(%2)\n" + " la %2,256(%2)\n" + "7:"AHI" %4,-256\n" + " jnm 6b\n" + " ex %4,0(%3)\n" + " j 9f\n" + "8:"SLR" %0,%0\n" + "9: sacf 768\n" + EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,5b) + EX_TABLE(10b,3b) EX_TABLE(11b,3b) EX_TABLE(12b,5b) + : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) + : : "cc", "memory"); + return size; +} + +unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + if (static_key_false(&have_mvcos)) + return copy_from_user_mvcos(to, from, n); + return copy_from_user_mvcp(to, from, n); +} +EXPORT_SYMBOL(__copy_from_user); + +static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, + unsigned long size) +{ + register unsigned long reg0 asm("0") = 0x810000UL; + unsigned long tmp1, tmp2; + + tmp1 = -4096UL; + asm volatile( + "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" + "6: jz 4f\n" + "1:"ALR" %0,%3\n" + " "SLR" %1,%3\n" + " "SLR" %2,%3\n" + " j 0b\n" + "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ + " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ + " "SLR" %4,%1\n" + " "CLR" %0,%4\n" /* copy crosses next page boundary? */ + " jnh 5f\n" + "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n" + "7:"SLR" %0,%4\n" + " j 5f\n" + "4:"SLR" %0,%0\n" + "5:\n" + EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) + : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) + : "d" (reg0) : "cc", "memory"); + return size; +} + +static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, + unsigned long size) +{ + unsigned long tmp1, tmp2; + + update_primary_asce(current); + tmp1 = -256UL; + asm volatile( + " sacf 0\n" + "0: mvcs 0(%0,%1),0(%2),%3\n" + "7: jz 5f\n" + "1:"ALR" %0,%3\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + "2: mvcs 0(%0,%1),0(%2),%3\n" + "8: jnz 1b\n" + " j 5f\n" + "3: la %4,255(%1)\n" /* %4 = ptr + 255 */ + " "LHI" %3,-4096\n" + " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */ + " "SLR" %4,%1\n" + " "CLR" %0,%4\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "4: mvcs 0(%4,%1),0(%2),%3\n" + "9:"SLR" %0,%4\n" + " j 6f\n" + "5:"SLR" %0,%0\n" + "6: sacf 768\n" + EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b) + EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) + : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) + : : "cc", "memory"); + return size; +} + +unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (static_key_false(&have_mvcos)) + return copy_to_user_mvcos(to, from, n); + return copy_to_user_mvcs(to, from, n); +} +EXPORT_SYMBOL(__copy_to_user); + +static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, + unsigned long size) +{ + register unsigned long reg0 asm("0") = 0x810081UL; + unsigned long tmp1, tmp2; + + tmp1 = -4096UL; + /* FIXME: copy with reduced length. */ + asm volatile( + "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" + " jz 2f\n" + "1:"ALR" %0,%3\n" + " "SLR" %1,%3\n" + " "SLR" %2,%3\n" + " j 0b\n" + "2:"SLR" %0,%0\n" + "3: \n" + EX_TABLE(0b,3b) + : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) + : "d" (reg0) : "cc", "memory"); + return size; +} + +static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, + unsigned long size) +{ + unsigned long tmp1; + + update_primary_asce(current); + asm volatile( + " sacf 256\n" + " "AHI" %0,-1\n" + " jo 5f\n" + " bras %3,3f\n" + "0:"AHI" %0,257\n" + "1: mvc 0(1,%1),0(%2)\n" + " la %1,1(%1)\n" + " la %2,1(%2)\n" + " "AHI" %0,-1\n" + " jnz 1b\n" + " j 5f\n" + "2: mvc 0(256,%1),0(%2)\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + "3:"AHI" %0,-256\n" + " jnm 2b\n" + "4: ex %0,1b-0b(%3)\n" + "5: "SLR" %0,%0\n" + "6: sacf 768\n" + EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) + : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) + : : "cc", "memory"); + return size; +} + +unsigned long __copy_in_user(void __user *to, const void __user *from, unsigned long n) +{ + if (static_key_false(&have_mvcos)) + return copy_in_user_mvcos(to, from, n); + return copy_in_user_mvc(to, from, n); +} +EXPORT_SYMBOL(__copy_in_user); + +static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) +{ + register unsigned long reg0 asm("0") = 0x810000UL; + unsigned long tmp1, tmp2; + + tmp1 = -4096UL; + asm volatile( + "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" + " jz 4f\n" + "1:"ALR" %0,%2\n" + " "SLR" %1,%2\n" + " j 0b\n" + "2: la %3,4095(%1)\n"/* %4 = to + 4095 */ + " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */ + " "SLR" %3,%1\n" + " "CLR" %0,%3\n" /* copy crosses next page boundary? */ + " jnh 5f\n" + "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n" + " "SLR" %0,%3\n" + " j 5f\n" + "4:"SLR" %0,%0\n" + "5:\n" + EX_TABLE(0b,2b) EX_TABLE(3b,5b) + : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) + : "a" (empty_zero_page), "d" (reg0) : "cc", "memory"); + return size; +} + +static inline unsigned long clear_user_xc(void __user *to, unsigned long size) +{ + unsigned long tmp1, tmp2; + + update_primary_asce(current); + asm volatile( + " sacf 256\n" + " "AHI" %0,-1\n" + " jo 5f\n" + " bras %3,3f\n" + " xc 0(1,%1),0(%1)\n" + "0:"AHI" %0,257\n" + " la %2,255(%1)\n" /* %2 = ptr + 255 */ + " srl %2,12\n" + " sll %2,12\n" /* %2 = (ptr + 255) & -4096 */ + " "SLR" %2,%1\n" + " "CLR" %0,%2\n" /* clear crosses next page boundary? */ + " jnh 5f\n" + " "AHI" %2,-1\n" + "1: ex %2,0(%3)\n" + " "AHI" %2,1\n" + " "SLR" %0,%2\n" + " j 5f\n" + "2: xc 0(256,%1),0(%1)\n" + " la %1,256(%1)\n" + "3:"AHI" %0,-256\n" + " jnm 2b\n" + "4: ex %0,0(%3)\n" + "5: "SLR" %0,%0\n" + "6: sacf 768\n" + EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) + : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2) + : : "cc", "memory"); + return size; +} + +unsigned long __clear_user(void __user *to, unsigned long size) +{ + if (static_key_false(&have_mvcos)) + return clear_user_mvcos(to, size); + return clear_user_xc(to, size); +} +EXPORT_SYMBOL(__clear_user); + +static inline unsigned long strnlen_user_srst(const char __user *src, + unsigned long size) +{ + register unsigned long reg0 asm("0") = 0; + unsigned long tmp1, tmp2; + + if (unlikely(!size)) + return 0; + update_primary_asce(current); + asm volatile( + " la %2,0(%1)\n" + " la %3,0(%0,%1)\n" + " "SLR" %0,%0\n" + " sacf 256\n" + "0: srst %3,%2\n" + " jo 0b\n" + " la %0,1(%3)\n" /* strnlen_user results includes \0 */ + " "SLR" %0,%1\n" + "1: sacf 768\n" + EX_TABLE(0b,1b) + : "+a" (size), "+a" (src), "=a" (tmp1), "=a" (tmp2) + : "d" (reg0) : "cc", "memory"); + return size; +} + +unsigned long __strnlen_user(const char __user *src, unsigned long size) +{ + update_primary_asce(current); + return strnlen_user_srst(src, size); +} +EXPORT_SYMBOL(__strnlen_user); + +long __strncpy_from_user(char *dst, const char __user *src, long size) +{ + size_t done, len, offset, len_str; + + if (unlikely(size <= 0)) + return 0; + done = 0; + do { + offset = (size_t)src & ~PAGE_MASK; + len = min(size - done, PAGE_SIZE - offset); + if (copy_from_user(dst, src, len)) + return -EFAULT; + len_str = strnlen(dst, len); + done += len_str; + src += len_str; + dst += len_str; + } while ((len_str == len) && (done < size)); + return done; +} +EXPORT_SYMBOL(__strncpy_from_user); + +/* + * The "old" uaccess variant without mvcos can be enforced with the + * uaccess_primary kernel parameter. This is mainly for debugging purposes. + */ +static int uaccess_primary __initdata; + +static int __init parse_uaccess_pt(char *__unused) +{ + uaccess_primary = 1; + return 0; +} +early_param("uaccess_primary", parse_uaccess_pt); + +static int __init uaccess_init(void) +{ + if (IS_ENABLED(CONFIG_64BIT) && !uaccess_primary && test_facility(27)) + static_key_slow_inc(&have_mvcos); + return 0; +} +early_initcall(uaccess_init); diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h deleted file mode 100644 index c7e0e81f4b4e..000000000000 --- a/arch/s390/lib/uaccess.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright IBM Corp. 2007 - * - */ - -#ifndef __ARCH_S390_LIB_UACCESS_H -#define __ARCH_S390_LIB_UACCESS_H - -unsigned long copy_from_user_pt(void *to, const void __user *from, unsigned long n); -unsigned long copy_to_user_pt(void __user *to, const void *from, unsigned long n); -unsigned long copy_in_user_pt(void __user *to, const void __user *from, unsigned long n); -unsigned long clear_user_pt(void __user *to, unsigned long n); -unsigned long strnlen_user_pt(const char __user *src, unsigned long count); -long strncpy_from_user_pt(char *dst, const char __user *src, long count); - -#endif /* __ARCH_S390_LIB_UACCESS_H */ diff --git a/arch/s390/lib/uaccess_mvcos.c b/arch/s390/lib/uaccess_mvcos.c deleted file mode 100644 index ae97b8df11aa..000000000000 --- a/arch/s390/lib/uaccess_mvcos.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Optimized user space space access functions based on mvcos. - * - * Copyright IBM Corp. 2006 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Gerald Schaefer (gerald.schaefer@de.ibm.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include "uaccess.h" - -#ifndef CONFIG_64BIT -#define AHI "ahi" -#define ALR "alr" -#define CLR "clr" -#define LHI "lhi" -#define SLR "slr" -#else -#define AHI "aghi" -#define ALR "algr" -#define CLR "clgr" -#define LHI "lghi" -#define SLR "slgr" -#endif - -static struct static_key have_mvcos = STATIC_KEY_INIT_TRUE; - -static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x81UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" - "9: jz 7f\n" - "1:"ALR" %0,%3\n" - " "SLR" %1,%3\n" - " "SLR" %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " "SLR" %4,%1\n" - " "CLR" %0,%4\n" /* copy crosses next page boundary? */ - " jnh 4f\n" - "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n" - "10:"SLR" %0,%4\n" - " "ALR" %2,%4\n" - "4:"LHI" %4,-1\n" - " "ALR" %4,%0\n" /* copy remaining size, subtract 1 */ - " bras %3,6f\n" /* memset loop */ - " xc 0(1,%2),0(%2)\n" - "5: xc 0(256,%2),0(%2)\n" - " la %2,256(%2)\n" - "6:"AHI" %4,-256\n" - " jnm 5b\n" - " ex %4,0(%3)\n" - " j 8f\n" - "7:"SLR" %0,%0\n" - "8: \n" - EX_TABLE(0b,2b) EX_TABLE(3b,4b) EX_TABLE(9b,2b) EX_TABLE(10b,4b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) -{ - if (static_key_true(&have_mvcos)) - return copy_from_user_mvcos(to, from, n); - return copy_from_user_pt(to, from, n); -} -EXPORT_SYMBOL(__copy_from_user); - -static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x810000UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - "6: jz 4f\n" - "1:"ALR" %0,%3\n" - " "SLR" %1,%3\n" - " "SLR" %2,%3\n" - " j 0b\n" - "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */ - " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */ - " "SLR" %4,%1\n" - " "CLR" %0,%4\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n" - "7:"SLR" %0,%4\n" - " j 5f\n" - "4:"SLR" %0,%0\n" - "5: \n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) -{ - if (static_key_true(&have_mvcos)) - return copy_to_user_mvcos(to, from, n); - return copy_to_user_pt(to, from, n); -} -EXPORT_SYMBOL(__copy_to_user); - -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x810081UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. */ - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1:"ALR" %0,%3\n" - " "SLR" %1,%3\n" - " "SLR" %2,%3\n" - " j 0b\n" - "2:"SLR" %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return size; -} - -unsigned long __copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (static_key_true(&have_mvcos)) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_pt(to, from, n); -} -EXPORT_SYMBOL(__copy_in_user); - -static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) -{ - register unsigned long reg0 asm("0") = 0x810000UL; - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - asm volatile( - "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" - " jz 4f\n" - "1:"ALR" %0,%2\n" - " "SLR" %1,%2\n" - " j 0b\n" - "2: la %3,4095(%1)\n"/* %4 = to + 4095 */ - " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */ - " "SLR" %3,%1\n" - " "CLR" %0,%3\n" /* copy crosses next page boundary? */ - " jnh 5f\n" - "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n" - " "SLR" %0,%3\n" - " j 5f\n" - "4:"SLR" %0,%0\n" - "5: \n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), "d" (reg0) : "cc", "memory"); - return size; -} - -unsigned long __clear_user(void __user *to, unsigned long size) -{ - if (static_key_true(&have_mvcos)) - return clear_user_mvcos(to, size); - return clear_user_pt(to, size); -} -EXPORT_SYMBOL(__clear_user); - -static inline unsigned long strnlen_user_mvcos(const char __user *src, - unsigned long count) -{ - unsigned long done, len, offset, len_str; - char buf[256]; - - done = 0; - do { - offset = (unsigned long)src & ~PAGE_MASK; - len = min(256UL, PAGE_SIZE - offset); - len = min(count - done, len); - if (copy_from_user_mvcos(buf, src, len)) - return 0; - len_str = strnlen(buf, len); - done += len_str; - src += len_str; - } while ((len_str == len) && (done < count)); - return done + 1; -} - -unsigned long __strnlen_user(const char __user *src, unsigned long count) -{ - if (static_key_true(&have_mvcos)) - return strnlen_user_mvcos(src, count); - return strnlen_user_pt(src, count); -} -EXPORT_SYMBOL(__strnlen_user); - -static inline long strncpy_from_user_mvcos(char *dst, const char __user *src, - long count) -{ - unsigned long done, len, offset, len_str; - - if (unlikely(count <= 0)) - return 0; - done = 0; - do { - offset = (unsigned long)src & ~PAGE_MASK; - len = min(count - done, PAGE_SIZE - offset); - if (copy_from_user_mvcos(dst, src, len)) - return -EFAULT; - len_str = strnlen(dst, len); - done += len_str; - src += len_str; - dst += len_str; - } while ((len_str == len) && (done < count)); - return done; -} - -long __strncpy_from_user(char *dst, const char __user *src, long count) -{ - if (static_key_true(&have_mvcos)) - return strncpy_from_user_mvcos(dst, src, count); - return strncpy_from_user_pt(dst, src, count); -} -EXPORT_SYMBOL(__strncpy_from_user); - -/* - * The uaccess page tabe walk variant can be enforced with the "uaccesspt" - * kernel parameter. This is mainly for debugging purposes. - */ -static int force_uaccess_pt __initdata; - -static int __init parse_uaccess_pt(char *__unused) -{ - force_uaccess_pt = 1; - return 0; -} -early_param("uaccesspt", parse_uaccess_pt); - -static int __init uaccess_init(void) -{ - if (IS_ENABLED(CONFIG_32BIT) || force_uaccess_pt || !test_facility(27)) - static_key_slow_dec(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c deleted file mode 100644 index 8d39760bae68..000000000000 --- a/arch/s390/lib/uaccess_pt.c +++ /dev/null @@ -1,471 +0,0 @@ -/* - * User access functions based on page table walks for enhanced - * system layout without hardware support. - * - * Copyright IBM Corp. 2006, 2012 - * Author(s): Gerald Schaefer (gerald.schaefer@de.ibm.com) - */ - -#include -#include -#include -#include -#include -#include -#include "uaccess.h" - -#ifndef CONFIG_64BIT -#define AHI "ahi" -#define SLR "slr" -#else -#define AHI "aghi" -#define SLR "slgr" -#endif - -static unsigned long strnlen_kernel(const char __user *src, unsigned long count) -{ - register unsigned long reg0 asm("0") = 0UL; - unsigned long tmp1, tmp2; - - asm volatile( - " la %2,0(%1)\n" - " la %3,0(%0,%1)\n" - " "SLR" %0,%0\n" - "0: srst %3,%2\n" - " jo 0b\n" - " la %0,1(%3)\n" /* strnlen_kernel results includes \0 */ - " "SLR" %0,%1\n" - "1:\n" - EX_TABLE(0b,1b) - : "+a" (count), "+a" (src), "=a" (tmp1), "=a" (tmp2) - : "d" (reg0) : "cc", "memory"); - return count; -} - -static unsigned long copy_in_kernel(void __user *to, const void __user *from, - unsigned long count) -{ - unsigned long tmp1; - - asm volatile( - " "AHI" %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0:"AHI" %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " "AHI" %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3:"AHI" %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5:"SLR" %0,%0\n" - "6:\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (count), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - return count; -} - -/* - * Returns kernel address for user virtual address. If the returned address is - * >= -4095 (IS_ERR_VALUE(x) returns true), a fault has occurred and the - * address contains the (negative) exception code. - */ -#ifdef CONFIG_64BIT - -static unsigned long follow_table(struct mm_struct *mm, - unsigned long address, int write) -{ - unsigned long *table = (unsigned long *)__pa(mm->pgd); - - if (unlikely(address > mm->context.asce_limit - 1)) - return -0x38UL; - switch (mm->context.asce_bits & _ASCE_TYPE_MASK) { - case _ASCE_TYPE_REGION1: - table = table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return -0x39UL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ - case _ASCE_TYPE_REGION2: - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return -0x3aUL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ - case _ASCE_TYPE_REGION3: - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return -0x3bUL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ - case _ASCE_TYPE_SEGMENT: - table = table + ((address >> 20) & 0x7ff); - if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) - return -0x10UL; - if (unlikely(*table & _SEGMENT_ENTRY_LARGE)) { - if (write && (*table & _SEGMENT_ENTRY_PROTECT)) - return -0x04UL; - return (*table & _SEGMENT_ENTRY_ORIGIN_LARGE) + - (address & ~_SEGMENT_ENTRY_ORIGIN_LARGE); - } - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - } - table = table + ((address >> 12) & 0xff); - if (unlikely(*table & _PAGE_INVALID)) - return -0x11UL; - if (write && (*table & _PAGE_PROTECT)) - return -0x04UL; - return (*table & PAGE_MASK) + (address & ~PAGE_MASK); -} - -#else /* CONFIG_64BIT */ - -static unsigned long follow_table(struct mm_struct *mm, - unsigned long address, int write) -{ - unsigned long *table = (unsigned long *)__pa(mm->pgd); - - table = table + ((address >> 20) & 0x7ff); - if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) - return -0x10UL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table = table + ((address >> 12) & 0xff); - if (unlikely(*table & _PAGE_INVALID)) - return -0x11UL; - if (write && (*table & _PAGE_PROTECT)) - return -0x04UL; - return (*table & PAGE_MASK) + (address & ~PAGE_MASK); -} - -#endif /* CONFIG_64BIT */ - -static inline unsigned long __user_copy_pt(unsigned long uaddr, void *kptr, - unsigned long n, int write_user) -{ - struct mm_struct *mm = current->mm; - unsigned long offset, done, size, kaddr; - void *from, *to; - - if (!mm) - return n; - done = 0; -retry: - spin_lock(&mm->page_table_lock); - do { - kaddr = follow_table(mm, uaddr, write_user); - if (IS_ERR_VALUE(kaddr)) - goto fault; - - offset = uaddr & ~PAGE_MASK; - size = min(n - done, PAGE_SIZE - offset); - if (write_user) { - to = (void *) kaddr; - from = kptr + done; - } else { - from = (void *) kaddr; - to = kptr + done; - } - memcpy(to, from, size); - done += size; - uaddr += size; - } while (done < n); - spin_unlock(&mm->page_table_lock); - return n - done; -fault: - spin_unlock(&mm->page_table_lock); - if (__handle_fault(uaddr, -kaddr, write_user)) - return n - done; - goto retry; -} - -/* - * Do DAT for user address by page table walk, return kernel address. - * This function needs to be called with current->mm->page_table_lock held. - */ -static inline unsigned long __dat_user_addr(unsigned long uaddr, int write) -{ - struct mm_struct *mm = current->mm; - unsigned long kaddr; - int rc; - -retry: - kaddr = follow_table(mm, uaddr, write); - if (IS_ERR_VALUE(kaddr)) - goto fault; - - return kaddr; -fault: - spin_unlock(&mm->page_table_lock); - rc = __handle_fault(uaddr, -kaddr, write); - spin_lock(&mm->page_table_lock); - if (!rc) - goto retry; - return 0; -} - -unsigned long copy_from_user_pt(void *to, const void __user *from, unsigned long n) -{ - unsigned long rc; - - if (segment_eq(get_fs(), KERNEL_DS)) - return copy_in_kernel((void __user *) to, from, n); - rc = __user_copy_pt((unsigned long) from, to, n, 0); - if (unlikely(rc)) - memset(to + n - rc, 0, rc); - return rc; -} - -unsigned long copy_to_user_pt(void __user *to, const void *from, unsigned long n) -{ - if (segment_eq(get_fs(), KERNEL_DS)) - return copy_in_kernel(to, (void __user *) from, n); - return __user_copy_pt((unsigned long) to, (void *) from, n, 1); -} - -unsigned long clear_user_pt(void __user *to, unsigned long n) -{ - void *zpage = (void *) empty_zero_page; - unsigned long done, size, ret; - - done = 0; - do { - if (n - done > PAGE_SIZE) - size = PAGE_SIZE; - else - size = n - done; - if (segment_eq(get_fs(), KERNEL_DS)) - ret = copy_in_kernel(to, (void __user *) zpage, n); - else - ret = __user_copy_pt((unsigned long) to, zpage, size, 1); - done += size; - to += size; - if (ret) - return ret + n - done; - } while (done < n); - return 0; -} - -unsigned long strnlen_user_pt(const char __user *src, unsigned long count) -{ - unsigned long uaddr = (unsigned long) src; - struct mm_struct *mm = current->mm; - unsigned long offset, done, len, kaddr; - unsigned long len_str; - - if (unlikely(!count)) - return 0; - if (segment_eq(get_fs(), KERNEL_DS)) - return strnlen_kernel(src, count); - if (!mm) - return 0; - done = 0; -retry: - spin_lock(&mm->page_table_lock); - do { - kaddr = follow_table(mm, uaddr, 0); - if (IS_ERR_VALUE(kaddr)) - goto fault; - - offset = uaddr & ~PAGE_MASK; - len = min(count - done, PAGE_SIZE - offset); - len_str = strnlen((char *) kaddr, len); - done += len_str; - uaddr += len_str; - } while ((len_str == len) && (done < count)); - spin_unlock(&mm->page_table_lock); - return done + 1; -fault: - spin_unlock(&mm->page_table_lock); - if (__handle_fault(uaddr, -kaddr, 0)) - return 0; - goto retry; -} - -long strncpy_from_user_pt(char *dst, const char __user *src, long count) -{ - unsigned long done, len, offset, len_str; - - if (unlikely(count <= 0)) - return 0; - done = 0; - do { - offset = (unsigned long)src & ~PAGE_MASK; - len = min(count - done, PAGE_SIZE - offset); - if (segment_eq(get_fs(), KERNEL_DS)) { - if (copy_in_kernel((void __user *) dst, src, len)) - return -EFAULT; - } else { - if (__user_copy_pt((unsigned long) src, dst, len, 0)) - return -EFAULT; - } - len_str = strnlen(dst, len); - done += len_str; - src += len_str; - dst += len_str; - } while ((len_str == len) && (done < count)); - return done; -} - -unsigned long copy_in_user_pt(void __user *to, const void __user *from, - unsigned long n) -{ - struct mm_struct *mm = current->mm; - unsigned long offset_max, uaddr, done, size, error_code; - unsigned long uaddr_from = (unsigned long) from; - unsigned long uaddr_to = (unsigned long) to; - unsigned long kaddr_to, kaddr_from; - int write_user; - - if (segment_eq(get_fs(), KERNEL_DS)) - return copy_in_kernel(to, from, n); - if (!mm) - return n; - done = 0; -retry: - spin_lock(&mm->page_table_lock); - do { - write_user = 0; - uaddr = uaddr_from; - kaddr_from = follow_table(mm, uaddr_from, 0); - error_code = kaddr_from; - if (IS_ERR_VALUE(error_code)) - goto fault; - - write_user = 1; - uaddr = uaddr_to; - kaddr_to = follow_table(mm, uaddr_to, 1); - error_code = (unsigned long) kaddr_to; - if (IS_ERR_VALUE(error_code)) - goto fault; - - offset_max = max(uaddr_from & ~PAGE_MASK, - uaddr_to & ~PAGE_MASK); - size = min(n - done, PAGE_SIZE - offset_max); - - memcpy((void *) kaddr_to, (void *) kaddr_from, size); - done += size; - uaddr_from += size; - uaddr_to += size; - } while (done < n); - spin_unlock(&mm->page_table_lock); - return n - done; -fault: - spin_unlock(&mm->page_table_lock); - if (__handle_fault(uaddr, -error_code, write_user)) - return n - done; - goto retry; -} - -#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ - asm volatile("0: l %1,0(%6)\n" \ - "1: " insn \ - "2: cs %1,%2,0(%6)\n" \ - "3: jl 1b\n" \ - " lhi %0,0\n" \ - "4:\n" \ - EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ - : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ - "=m" (*uaddr) \ - : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ - "m" (*uaddr) : "cc" ); - -static int __futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old) -{ - int oldval = 0, newval, ret; - - switch (op) { - case FUTEX_OP_SET: - __futex_atomic_op("lr %2,%5\n", - ret, oldval, newval, uaddr, oparg); - break; - case FUTEX_OP_ADD: - __futex_atomic_op("lr %2,%1\nar %2,%5\n", - ret, oldval, newval, uaddr, oparg); - break; - case FUTEX_OP_OR: - __futex_atomic_op("lr %2,%1\nor %2,%5\n", - ret, oldval, newval, uaddr, oparg); - break; - case FUTEX_OP_ANDN: - __futex_atomic_op("lr %2,%1\nnr %2,%5\n", - ret, oldval, newval, uaddr, oparg); - break; - case FUTEX_OP_XOR: - __futex_atomic_op("lr %2,%1\nxr %2,%5\n", - ret, oldval, newval, uaddr, oparg); - break; - default: - ret = -ENOSYS; - } - if (ret == 0) - *old = oldval; - return ret; -} - -int __futex_atomic_op_inuser(int op, u32 __user *uaddr, int oparg, int *old) -{ - int ret; - - if (segment_eq(get_fs(), KERNEL_DS)) - return __futex_atomic_op_pt(op, uaddr, oparg, old); - if (unlikely(!current->mm)) - return -EFAULT; - spin_lock(¤t->mm->page_table_lock); - uaddr = (u32 __force __user *) - __dat_user_addr((__force unsigned long) uaddr, 1); - if (!uaddr) { - spin_unlock(¤t->mm->page_table_lock); - return -EFAULT; - } - get_page(virt_to_page(uaddr)); - spin_unlock(¤t->mm->page_table_lock); - ret = __futex_atomic_op_pt(op, uaddr, oparg, old); - put_page(virt_to_page(uaddr)); - return ret; -} - -static int __futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr, - u32 oldval, u32 newval) -{ - int ret; - - asm volatile("0: cs %1,%4,0(%5)\n" - "1: la %0,0\n" - "2:\n" - EX_TABLE(0b,2b) EX_TABLE(1b,2b) - : "=d" (ret), "+d" (oldval), "=m" (*uaddr) - : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) - : "cc", "memory" ); - *uval = oldval; - return ret; -} - -int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, - u32 oldval, u32 newval) -{ - int ret; - - if (segment_eq(get_fs(), KERNEL_DS)) - return __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); - if (unlikely(!current->mm)) - return -EFAULT; - spin_lock(¤t->mm->page_table_lock); - uaddr = (u32 __force __user *) - __dat_user_addr((__force unsigned long) uaddr, 1); - if (!uaddr) { - spin_unlock(¤t->mm->page_table_lock); - return -EFAULT; - } - get_page(virt_to_page(uaddr)); - spin_unlock(¤t->mm->page_table_lock); - ret = __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); - put_page(virt_to_page(uaddr)); - return ret; -} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 750565f72e06..f93e6c2d4ba5 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -105,21 +105,24 @@ void bust_spinlocks(int yes) * Returns the address space associated with the fault. * Returns 0 for kernel space and 1 for user space. */ -static inline int user_space_fault(unsigned long trans_exc_code) +static inline int user_space_fault(struct pt_regs *regs) { + unsigned long trans_exc_code; + /* * The lowest two bits of the translation exception * identification indicate which paging table was used. */ - trans_exc_code &= 3; - if (trans_exc_code == 2) - /* Access via secondary space, set_fs setting decides */ + trans_exc_code = regs->int_parm_long & 3; + if (trans_exc_code == 3) /* home space -> kernel */ + return 0; + if (user_mode(regs)) + return 1; + if (trans_exc_code == 2) /* secondary space -> set_fs */ return current->thread.mm_segment.ar4; - /* - * Access via primary space or access register is from user space - * and access via home space is from the kernel. - */ - return trans_exc_code != 3; + if (current->flags & PF_VCPU) + return 1; + return 0; } static inline void report_user_fault(struct pt_regs *regs, long signr) @@ -171,7 +174,7 @@ static noinline void do_no_context(struct pt_regs *regs) * terminate things with extreme prejudice. */ address = regs->int_parm_long & __FAIL_ADDR_MASK; - if (!user_space_fault(regs->int_parm_long)) + if (!user_space_fault(regs)) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else @@ -291,7 +294,7 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + if (unlikely(!user_space_fault(regs) || in_atomic() || !mm)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; @@ -423,30 +426,6 @@ void __kprobes do_dat_exception(struct pt_regs *regs) do_fault_error(regs, fault); } -int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) -{ - struct pt_regs regs; - int access, fault; - - /* Emulate a uaccess fault from kernel mode. */ - regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK; - if (!irqs_disabled()) - regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; - regs.psw.addr = (unsigned long) __builtin_return_address(0); - regs.psw.addr |= PSW_ADDR_AMODE; - regs.int_code = pgm_int_code; - regs.int_parm_long = (uaddr & PAGE_MASK) | 2; - access = write ? VM_WRITE : VM_READ; - fault = do_exception(®s, access); - /* - * Since the fault happened in kernel mode while performing a uaccess - * all we need to do now is emulating a fixup in case "fault" is not - * zero. - * For the calling uaccess functions this results always in -EFAULT. - */ - return fault ? -EFAULT : 0; -} - #ifdef CONFIG_PFAULT /* * 'pfault' pseudo page faults routines. diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index c57c63380184..b5745dc9c6b5 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -54,7 +54,7 @@ static void __crst_table_upgrade(void *arg) struct mm_struct *mm = arg; if (current->active_mm == mm) - update_user_asce(mm); + update_user_asce(mm, 1); __tlb_flush_local(); } @@ -108,7 +108,7 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) pgd_t *pgd; if (current->active_mm == mm) { - clear_user_asce(mm); + clear_user_asce(mm, 1); __tlb_flush_mm(mm); } while (mm->context.asce_limit > limit) { @@ -134,7 +134,7 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) crst_table_free(mm, (unsigned long *) pgd); } if (current->active_mm == mm) - update_user_asce(mm); + update_user_asce(mm, 1); } #endif