diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index a6870ea6ea8b..62943af36ac6 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -133,8 +133,9 @@ struct lowcore { __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ __u64 gmap; /* 0x03b8 */ __u32 spinlock_lockval; /* 0x03c0 */ - __u32 fpu_flags; /* 0x03c4 */ - __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ + __u32 spinlock_index; /* 0x03c4 */ + __u32 fpu_flags; /* 0x03c8 */ + __u8 pad_0x03cc[0x0400-0x03cc]; /* 0x03cc */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 6727cc30d59b..2da4a6d13f54 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -36,15 +36,11 @@ bool arch_vcpu_is_preempted(int cpu); */ void arch_lock_relax(int cpu); +void arch_spin_relax(arch_spinlock_t *lock); void arch_spin_lock_wait(arch_spinlock_t *); int arch_spin_trylock_retry(arch_spinlock_t *); -void arch_spin_lock_wait_flags(arch_spinlock_t *, unsigned long flags); - -static inline void arch_spin_relax(arch_spinlock_t *lock) -{ - arch_lock_relax(lock->lock); -} +void arch_spin_lock_setup(int cpu); static inline u32 arch_spin_lockval(int cpu) { @@ -64,8 +60,7 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lp) static inline int arch_spin_trylock_once(arch_spinlock_t *lp) { barrier(); - return likely(arch_spin_value_unlocked(*lp) && - __atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); + return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); } static inline void arch_spin_lock(arch_spinlock_t *lp) @@ -78,7 +73,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lp, unsigned long flags) { if (!arch_spin_trylock_once(lp)) - arch_spin_lock_wait_flags(lp, flags); + arch_spin_lock_wait(lp); } static inline int arch_spin_trylock(arch_spinlock_t *lp) @@ -95,8 +90,9 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) #ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES " .long 0xb2fa0070\n" /* NIAI 7 */ #endif - " st %1,%0\n" - : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); + " sth %1,%0\n" + : "=Q" (((unsigned short *) &lp->lock)[1]) + : "d" (0) : "cc", "memory"); } /* diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 164a1e16b53e..b2c9af9b88d5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -380,6 +380,8 @@ static void __init setup_lowcore(void) #ifdef CONFIG_SMP lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; + arch_spin_lock_setup(0); #endif set_prefix((u32)(unsigned long) lc); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index b9fc9b00d845..cc04b74fbb84 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -226,6 +226,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->mcesad = mcesa_origin | mcesa_bits; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; if (vdso_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; @@ -273,6 +274,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; lc->kernel_asce = S390_lowcore.kernel_asce; lc->machine_flags = S390_lowcore.machine_flags; @@ -281,6 +283,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) save_access_regs((unsigned int *) lc->access_regs_save_area); memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); + arch_spin_lock_setup(cpu); } static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) @@ -967,6 +970,7 @@ void __init smp_setup_processor_id(void) pcpu_devices[0].address = stap(); S390_lowcore.cpu_nr = 0; S390_lowcore.spinlock_lockval = arch_spin_lockval(0); + S390_lowcore.spinlock_index = 0; } /* diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ee73bcca7e6f..6747134227cd 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #include int spin_retry = -1; @@ -32,6 +34,40 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +struct spin_wait { + struct spin_wait *next, *prev; + int node_id; +} __aligned(32); + +static DEFINE_PER_CPU_ALIGNED(struct spin_wait, spin_wait[4]); + +#define _Q_LOCK_CPU_OFFSET 0 +#define _Q_LOCK_STEAL_OFFSET 16 +#define _Q_TAIL_IDX_OFFSET 18 +#define _Q_TAIL_CPU_OFFSET 20 + +#define _Q_LOCK_CPU_MASK 0x0000ffff +#define _Q_LOCK_STEAL_ADD 0x00010000 +#define _Q_LOCK_STEAL_MASK 0x00030000 +#define _Q_TAIL_IDX_MASK 0x000c0000 +#define _Q_TAIL_CPU_MASK 0xfff00000 + +#define _Q_LOCK_MASK (_Q_LOCK_CPU_MASK | _Q_LOCK_STEAL_MASK) +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +void arch_spin_lock_setup(int cpu) +{ + struct spin_wait *node; + int ix; + + node = per_cpu_ptr(&spin_wait[0], cpu); + for (ix = 0; ix < 4; ix++, node++) { + memset(node, 0, sizeof(*node)); + node->node_id = ((cpu + 1) << _Q_TAIL_CPU_OFFSET) + + (ix << _Q_TAIL_IDX_OFFSET); + } +} + static inline int arch_load_niai4(int *lock) { int owner; @@ -60,76 +96,161 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) return expected == old; } +static inline struct spin_wait *arch_spin_decode_tail(int lock) +{ + int ix, cpu; + + ix = (lock & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + cpu = (lock & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET; + return per_cpu_ptr(&spin_wait[ix], cpu - 1); +} + +static inline int arch_spin_yield_target(int lock, struct spin_wait *node) +{ + if (lock & _Q_LOCK_CPU_MASK) + return lock & _Q_LOCK_CPU_MASK; + if (node == NULL || node->prev == NULL) + return 0; /* 0 -> no target cpu */ + while (node->prev) + node = node->prev; + return node->node_id >> _Q_TAIL_CPU_OFFSET; +} + +static inline void arch_spin_lock_queued(arch_spinlock_t *lp) +{ + struct spin_wait *node, *next; + int lockval, ix, node_id, tail_id, old, new, owner, count; + + ix = S390_lowcore.spinlock_index++; + barrier(); + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + node = this_cpu_ptr(&spin_wait[ix]); + node->prev = node->next = NULL; + node_id = node->node_id; + + /* Enqueue the node for this CPU in the spinlock wait queue */ + while (1) { + old = READ_ONCE(lp->lock); + if ((old & _Q_LOCK_CPU_MASK) == 0 && + (old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) { + /* + * The lock is free but there may be waiters. + * With no waiters simply take the lock, if there + * are waiters try to steal the lock. The lock may + * be stolen three times before the next queued + * waiter will get the lock. + */ + new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + goto out; + /* lock passing in progress */ + continue; + } + /* Make the node of this CPU the new tail. */ + new = node_id | (old & _Q_LOCK_MASK); + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + break; + } + /* Set the 'next' pointer of the tail node in the queue */ + tail_id = old & _Q_TAIL_MASK; + if (tail_id != 0) { + node->prev = arch_spin_decode_tail(tail_id); + WRITE_ONCE(node->prev->next, node); + } + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + /* Spin on the CPU local node->prev pointer */ + if (tail_id != 0) { + count = spin_retry; + while (READ_ONCE(node->prev) != NULL) { + if (count-- >= 0) + continue; + count = spin_retry; + /* Query running state of lock holder again. */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + } + + /* Spin on the lock value in the spinlock_t */ + count = spin_retry; + while (1) { + old = READ_ONCE(lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + if (!owner) { + tail_id = old & _Q_TAIL_MASK; + new = ((tail_id != node_id) ? tail_id : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + break; + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + + /* Pass lock_spin job to next CPU in the queue */ + if (node_id && tail_id != node_id) { + /* Wait until the next CPU has set up the 'next' pointer */ + while ((next = READ_ONCE(node->next)) == NULL) + ; + next->prev = NULL; + } + + out: + S390_lowcore.spinlock_index--; +} + +static inline void arch_spin_lock_classic(arch_spinlock_t *lp) +{ + int lockval, old, new, owner, count; + + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(ACCESS_ONCE(lp->lock), NULL); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + count = spin_retry; + while (1) { + old = arch_load_niai4(&lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + /* Try to get the lock if it is free. */ + if (!owner) { + new = (old & _Q_TAIL_MASK) | lockval; + if (arch_cmpxchg_niai8(&lp->lock, old, new)) + /* Got the lock */ + return; + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { - int cpu = SPINLOCK_LOCKVAL; - int owner, count; - - /* Pass the virtual CPU to the lock holder if it is not running */ - owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - - count = spin_retry; - while (1) { - owner = arch_load_niai4(&lp->lock); - /* Try to get the lock if it is free. */ - if (!owner) { - if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) - return; - continue; - } - if (count-- >= 0) - continue; - count = spin_retry; - /* - * For multiple layers of hypervisors, e.g. z/VM + LPAR - * yield the CPU unconditionally. For LPAR rely on the - * sense running status. - */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - } + /* Use classic spinlocks + niai if the steal time is >= 10% */ + if (test_cpu_flag(CIF_DEDICATED_CPU)) + arch_spin_lock_queued(lp); + else + arch_spin_lock_classic(lp); } EXPORT_SYMBOL(arch_spin_lock_wait); -void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) -{ - int cpu = SPINLOCK_LOCKVAL; - int owner, count; - - local_irq_restore(flags); - - /* Pass the virtual CPU to the lock holder if it is not running */ - owner = arch_load_niai4(&lp->lock); - if (owner && arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - - count = spin_retry; - while (1) { - owner = arch_load_niai4(&lp->lock); - /* Try to get the lock if it is free. */ - if (!owner) { - local_irq_disable(); - if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) - return; - local_irq_restore(flags); - continue; - } - if (count-- >= 0) - continue; - count = spin_retry; - /* - * For multiple layers of hypervisors, e.g. z/VM + LPAR - * yield the CPU unconditionally. For LPAR rely on the - * sense running status. - */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) - smp_yield_cpu(owner - 1); - } -} -EXPORT_SYMBOL(arch_spin_lock_wait_flags); - int arch_spin_trylock_retry(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; @@ -270,3 +391,16 @@ void arch_lock_relax(int cpu) smp_yield_cpu(cpu - 1); } EXPORT_SYMBOL(arch_lock_relax); + +void arch_spin_relax(arch_spinlock_t *lp) +{ + int cpu; + + cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK; + if (!cpu) + return; + if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) + return; + smp_yield_cpu(cpu - 1); +} +EXPORT_SYMBOL(arch_spin_relax);