diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ed9d96708f93..2ea42999d2d8 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -431,10 +432,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_load_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); +locked: /* * claim the lock: * @@ -446,7 +459,8 @@ queue: * to grab the lock. */ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } @@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index aaeeefb791f8..ace60a451b4f 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -40,6 +40,89 @@ struct pv_node { u8 state; }; +/* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + /* * Include queued spinlock statistics code */ @@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * If pv_kick_node() changed us to vcpu_hashed, retain that - * value so that pv_wait_head() knows to not also try to hash - * this lock. + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; @@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) lp = (struct qspinlock **)1; for (;; waitcnt++) { + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. + */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); - if (!READ_ONCE(l->locked)) - return; /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ - qstat_inc(qstat_pv_spurious_wakeup, true); + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* @@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b1553adec2e7..94d4533fe984 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick * pv_wait_head - # of vCPU wait's at the queue head @@ -43,6 +44,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_head, @@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", @@ -273,6 +276,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ + int ret = pv_queued_spin_steal_lock(lock); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} +#undef queued_spin_trylock +#define queued_spin_trylock(l) qstat_spin_steal_lock(l) + #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { }