futex: Change locking rules

Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.

The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.

Rework and document the locking so rt_mutex::wait_lock is held across all
operations which modify the user space value and the pi state.

This allows invoking rt_mutex_unlock() (including deboost) without holding
hb->lock as a next step.

Nothing yet relies on the new locking rules.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
This commit is contained in:
Peter Zijlstra 2017-03-22 11:35:52 +01:00 committed by Thomas Gleixner
parent 5293c2efda
commit 734009e96d
1 changed file with 132 additions and 33 deletions

View File

@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_struct *curr)
* *
* [10] There is no transient state which leaves owner and user space * [10] There is no transient state which leaves owner and user space
* TID out of sync. * TID out of sync.
*
*
* Serialization and lifetime rules:
*
* hb->lock:
*
* hb -> futex_q, relation
* futex_q -> pi_state, relation
*
* (cannot be raw because hb can contain arbitrary amount
* of futex_q's)
*
* pi_mutex->wait_lock:
*
* {uval, pi_state}
*
* (and pi_mutex 'obviously')
*
* p->pi_lock:
*
* p->pi_state_list -> pi_state->list, relation
*
* pi_state->refcount:
*
* pi_state lifetime
*
*
* Lock order:
*
* hb->lock
* pi_mutex->wait_lock
* p->pi_lock
*
*/ */
/* /*
@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to * the pi_state against the user space value. If correct, attach to
* it. * it.
*/ */
static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
struct futex_pi_state *pi_state,
struct futex_pi_state **ps) struct futex_pi_state **ps)
{ {
pid_t pid = uval & FUTEX_TID_MASK; pid_t pid = uval & FUTEX_TID_MASK;
int ret, uval2;
/* /*
* Userspace might have messed up non-PI and PI futexes [3] * Userspace might have messed up non-PI and PI futexes [3]
@ -991,8 +1026,33 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state)) if (unlikely(!pi_state))
return -EINVAL; return -EINVAL;
/*
* We get here with hb->lock held, and having found a
* futex_top_waiter(). This means that futex_lock_pi() of said futex_q
* has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
* which in turn means that futex_lock_pi() still has a reference on
* our pi_state.
*/
WARN_ON(!atomic_read(&pi_state->refcount)); WARN_ON(!atomic_read(&pi_state->refcount));
/*
* Now that we have a pi_state, we can acquire wait_lock
* and do the state validation.
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Since {uval, pi_state} is serialized by wait_lock, and our current
* uval was read without holding it, it can have changed. Verify it
* still is what we expect it to be, otherwise retry the entire
* operation.
*/
if (get_futex_value_locked(&uval2, uaddr))
goto out_efault;
if (uval != uval2)
goto out_eagain;
/* /*
* Handle the owner died case: * Handle the owner died case:
*/ */
@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5] * is not 0. Inconsistent state. [5]
*/ */
if (pid) if (pid)
return -EINVAL; goto out_einval;
/* /*
* Take a ref on the state and return success. [4] * Take a ref on the state and return success. [4]
*/ */
goto out_state; goto out_attach;
} }
/* /*
@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6] * Take a ref on the state and return success. [6]
*/ */
if (!pid) if (!pid)
goto out_state; goto out_attach;
} else { } else {
/* /*
* If the owner died bit is not set, then the pi_state * If the owner died bit is not set, then the pi_state
* must have an owner. [7] * must have an owner. [7]
*/ */
if (!pi_state->owner) if (!pi_state->owner)
return -EINVAL; goto out_einval;
} }
/* /*
@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10] * user space TID. [9/10]
*/ */
if (pid != task_pid_vnr(pi_state->owner)) if (pid != task_pid_vnr(pi_state->owner))
return -EINVAL; goto out_einval;
out_state:
out_attach:
atomic_inc(&pi_state->refcount); atomic_inc(&pi_state->refcount);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state; *ps = pi_state;
return 0; return 0;
out_einval:
ret = -EINVAL;
goto out_error;
out_eagain:
ret = -EAGAIN;
goto out_error;
out_efault:
ret = -EFAULT;
goto out_error;
out_error:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
} }
/* /*
@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/* /*
* No existing pi state. First waiter. [2] * No existing pi state. First waiter. [2]
*
* This creates pi_state, we have hb->lock held, this means nothing can
* observe this state, wait_lock is irrelevant.
*/ */
pi_state = alloc_pi_state(); pi_state = alloc_pi_state();
@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0; return 0;
} }
static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static int lookup_pi_state(u32 __user *uaddr, u32 uval,
struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps) union futex_key *key, struct futex_pi_state **ps)
{ {
struct futex_q *top_waiter = futex_top_waiter(hb, key); struct futex_q *top_waiter = futex_top_waiter(hb, key);
@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* attach to the pi_state when the validation succeeds. * attach to the pi_state when the validation succeeds.
*/ */
if (top_waiter) if (top_waiter)
return attach_to_pi_state(uval, top_waiter->pi_state, ps); return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/* /*
* We are the first waiter - try to look up the owner based on * We are the first waiter - try to look up the owner based on
@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT; return -EFAULT;
/*If user space value changed, let the caller retry */ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0; return curval != uval ? -EAGAIN : 0;
} }
@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
*/ */
top_waiter = futex_top_waiter(hb, key); top_waiter = futex_top_waiter(hb, key);
if (top_waiter) if (top_waiter)
return attach_to_pi_state(uval, top_waiter->pi_state, ps); return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/* /*
* No waiter and user TID is 0. We are here because the * No waiter and user TID is 0. We are here because the
@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT; ret = -EFAULT;
} else if (curval != uval) { } else if (curval != uval) {
/* /*
* If a unconditional UNLOCK_PI operation (user space did not * If a unconditional UNLOCK_PI operation (user space did not
@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
else else
ret = -EINVAL; ret = -EINVAL;
} }
if (ret) { if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret; return ret;
@ -1823,7 +1907,7 @@ retry_private:
* If that call succeeds then we have pi_state and an * If that call succeeds then we have pi_state and an
* initial refcount on it. * initial refcount on it.
*/ */
ret = lookup_pi_state(ret, hb2, &key2, &pi_state); ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
} }
switch (ret) { switch (ret) {
@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{ {
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state; struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval; u32 uval, uninitialized_var(curval), newval;
struct task_struct *oldowner;
int ret; int ret;
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
/* Owner died? */ /* Owner died? */
if (!pi_state->owner) if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED; newtid |= FUTEX_OWNER_DIED;
@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork * because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow. * that marked all the anonymous memory readonly for cow.
* *
* Modifying pi_state _before_ the user space value would * Modifying pi_state _before_ the user space value would leave the
* leave the pi_state in an inconsistent state when we fault * pi_state in an inconsistent state when we fault here, because we
* here, because we need to drop the hash bucket lock to * need to drop the locks to handle the fault. This might be observed
* handle the fault. This might be observed in the PID check * in the PID check in lookup_pi_state.
* in lookup_pi_state.
*/ */
retry: retry:
if (get_futex_value_locked(&uval, uaddr)) if (get_futex_value_locked(&uval, uaddr))
@ -2166,47 +2252,60 @@ retry:
* itself. * itself.
*/ */
if (pi_state->owner != NULL) { if (pi_state->owner != NULL) {
raw_spin_lock_irq(&pi_state->owner->pi_lock); raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list)); WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list); list_del_init(&pi_state->list);
raw_spin_unlock_irq(&pi_state->owner->pi_lock); raw_spin_unlock(&pi_state->owner->pi_lock);
} }
pi_state->owner = newowner; pi_state->owner = newowner;
raw_spin_lock_irq(&newowner->pi_lock); raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list)); WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list); list_add(&pi_state->list, &newowner->pi_state_list);
raw_spin_unlock_irq(&newowner->pi_lock); raw_spin_unlock(&newowner->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return 0; return 0;
/* /*
* To handle the page fault we need to drop the hash bucket * To handle the page fault we need to drop the locks here. That gives
* lock here. That gives the other task (either the highest priority * the other task (either the highest priority waiter itself or the
* waiter itself or the task which stole the rtmutex) the * task which stole the rtmutex) the chance to try the fixup of the
* chance to try the fixup of the pi_state. So once we are * pi_state. So once we are back from handling the fault we need to
* back from handling the fault we need to check the pi_state * check the pi_state after reacquiring the locks and before trying to
* after reacquiring the hash bucket lock and before trying to * do another fixup. When the fixup has been done already we simply
* do another fixup. When the fixup has been done already we * return.
* simply return. *
* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
* drop hb->lock since the caller owns the hb -> futex_q relation.
* Dropping the pi_mutex->wait_lock requires the state revalidate.
*/ */
handle_fault: handle_fault:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr); spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr); ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr); spin_lock(q->lock_ptr);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/* /*
* Check if someone else fixed it for us: * Check if someone else fixed it for us:
*/ */
if (pi_state->owner != oldowner) if (pi_state->owner != oldowner) {
return 0; ret = 0;
goto out_unlock;
}
if (ret) if (ret)
return ret; goto out_unlock;
goto retry; goto retry;
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
} }
static long futex_wait_restart(struct restart_block *restart); static long futex_wait_restart(struct restart_block *restart);