diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf92b0ce..67e00740531c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -16,6 +16,7 @@ config X86_64
 	def_bool y
 	depends on 64BIT
 	select X86_DEV_DMA_OPS
+	select ARCH_USE_CMPXCHG_LOCKREF
 
 ### Arch settings
 config X86
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index e3ddd7db723f..e0e668422c75 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,11 @@
 # define UNLOCK_LOCK_PREFIX
 #endif
 
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return lock.tickets.head == lock.tickets.tail;
+}
+
 /*
  * Ticket locks are conceptually two parts, one indicating the current head of
  * the queue, and the other indicating the current tail. The lock is acquired
diff --git a/fs/dcache.c b/fs/dcache.c
index b949af850cd6..96655f4f4574 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -611,8 +611,23 @@ static inline void __dget(struct dentry *dentry)
 
 struct dentry *dget_parent(struct dentry *dentry)
 {
+	int gotref;
 	struct dentry *ret;
 
+	/*
+	 * Do optimistic parent lookup without any
+	 * locking.
+	 */
+	rcu_read_lock();
+	ret = ACCESS_ONCE(dentry->d_parent);
+	gotref = lockref_get_not_zero(&ret->d_lockref);
+	rcu_read_unlock();
+	if (likely(gotref)) {
+		if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+			return ret;
+		dput(ret);
+	}
+
 repeat:
 	/*
 	 * Don't need rcu_dereference because we re-check it was correct under
@@ -1771,7 +1786,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * without taking d_lock and checking d_seq sequence count against @seq
  * returned here.
  *
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
  * function.
  *
  * Alternatively, __d_lookup_rcu may be called again to look up the child of
diff --git a/fs/namei.c b/fs/namei.c
index 7720fbd5277b..2c30c84d4ea1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -494,6 +494,50 @@ static inline void unlock_rcu_walk(void)
 	br_read_unlock(&vfsmount_lock);
 }
 
+/*
+ * When we move over from the RCU domain to properly refcounted
+ * long-lived dentries, we need to check the sequence numbers
+ * we got before lookup very carefully.
+ *
+ * We cannot blindly increment a dentry refcount - even if it
+ * is not locked - if it is zero, because it may have gone
+ * through the final d_kill() logic already.
+ *
+ * So for a zero refcount, we need to get the spinlock (which is
+ * safe even for a dead dentry because the de-allocation is
+ * RCU-delayed), and check the sequence count under the lock.
+ *
+ * Once we have checked the sequence count, we know it is live,
+ * and since we hold the spinlock it cannot die from under us.
+ *
+ * In contrast, if the reference count wasn't zero, we can just
+ * increment the lockref without having to take the spinlock.
+ * Even if the sequence number ends up being stale, we haven't
+ * gone through the final dput() and killed the dentry yet.
+ */
+static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
+{
+	int gotref;
+
+	gotref = lockref_get_or_lock(&dentry->d_lockref);
+
+	/* Does the sequence number still match? */
+	if (read_seqcount_retry(validate, seq)) {
+		if (gotref)
+			dput(dentry);
+		else
+			spin_unlock(&dentry->d_lock);
+		return -ECHILD;
+	}
+
+	/* Get the ref now, if we couldn't get it originally */
+	if (!gotref) {
+		dentry->d_lockref.count++;
+		spin_unlock(&dentry->d_lock);
+	}
+	return 0;
+}
+
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -518,29 +562,28 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 		    nd->root.dentry != fs->root.dentry)
 			goto err_root;
 	}
-	spin_lock(&parent->d_lock);
+
+	/*
+	 * For a negative lookup, the lookup sequence point is the parent's
+	 * sequence point, and it only needs to revalidate the parent dentry.
+	 *
+	 * For a positive lookup, we need to move both the parent and the
+	 * dentry from the RCU domain to be properly refcounted. And the
+	 * sequence number in the dentry validates *both* dentry counters,
+	 * since we checked the sequence number of the parent after we got
+	 * the child sequence number. So we know the parent must still
+	 * be valid if the child sequence number is still valid.
+	 */
 	if (!dentry) {
-		if (!__d_rcu_to_refcount(parent, nd->seq))
-			goto err_parent;
+		if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
+			goto err_root;
 		BUG_ON(nd->inode != parent->d_inode);
 	} else {
-		if (dentry->d_parent != parent)
+		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
+			goto err_root;
+		if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
 			goto err_parent;
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		if (!__d_rcu_to_refcount(dentry, nd->seq))
-			goto err_child;
-		/*
-		 * If the sequence check on the child dentry passed, then
-		 * the child has not been removed from its parent. This
-		 * means the parent dentry must be valid and able to take
-		 * a reference at this point.
-		 */
-		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-		BUG_ON(!parent->d_lockref.count);
-		parent->d_lockref.count++;
-		spin_unlock(&dentry->d_lock);
 	}
-	spin_unlock(&parent->d_lock);
 	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
@@ -551,10 +594,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
-err_child:
-	spin_unlock(&dentry->d_lock);
 err_parent:
-	spin_unlock(&parent->d_lock);
+	dput(dentry);
 err_root:
 	if (want_root)
 		spin_unlock(&fs->lock);
@@ -585,14 +626,11 @@ static int complete_walk(struct nameidata *nd)
 		nd->flags &= ~LOOKUP_RCU;
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
-		spin_lock(&dentry->d_lock);
-		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
-			spin_unlock(&dentry->d_lock);
+
+		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
 			unlock_rcu_walk();
 			return -ECHILD;
 		}
-		BUG_ON(nd->inode != dentry->d_inode);
-		spin_unlock(&dentry->d_lock);
 		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index efdc94434c30..9169b91ea2d2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name, unsigned *seq);
 
-/**
- * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
- * @dentry: dentry to take a ref on
- * @seq: seqcount to verify against
- * Returns: 0 on failure, else 1.
- *
- * __d_rcu_to_refcount operates on a dentry,seq pair that was returned
- * by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
- */
-static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
-{
-	int ret = 0;
-
-	assert_spin_locked(&dentry->d_lock);
-	if (!read_seqcount_retry(&dentry->d_seq, seq)) {
-		ret = 1;
-		dentry->d_lockref.count++;
-	}
-
-	return ret;
-}
-
 static inline unsigned d_count(const struct dentry *dentry)
 {
 	return dentry->d_lockref.count;
diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index 01233e01627a..ca07b5028b01 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -17,55 +17,20 @@
 #include <linux/spinlock.h>
 
 struct lockref {
-	spinlock_t lock;
-	unsigned int count;
+	union {
+#ifdef CONFIG_CMPXCHG_LOCKREF
+		aligned_u64 lock_count;
+#endif
+		struct {
+			spinlock_t lock;
+			unsigned int count;
+		};
+	};
 };
 
-/**
- * lockref_get - Increments reference count unconditionally
- * @lockcnt: pointer to lockref structure
- *
- * This operation is only valid if you already hold a reference
- * to the object, so you know the count cannot be zero.
- */
-static inline void lockref_get(struct lockref *lockref)
-{
-	spin_lock(&lockref->lock);
-	lockref->count++;
-	spin_unlock(&lockref->lock);
-}
-
-/**
- * lockref_get_not_zero - Increments count unless the count is 0
- * @lockcnt: pointer to lockref structure
- * Return: 1 if count updated successfully or 0 if count is 0
- */
-static inline int lockref_get_not_zero(struct lockref *lockref)
-{
-	int retval = 0;
-
-	spin_lock(&lockref->lock);
-	if (lockref->count) {
-		lockref->count++;
-		retval = 1;
-	}
-	spin_unlock(&lockref->lock);
-	return retval;
-}
-
-/**
- * lockref_put_or_lock - decrements count unless count <= 1 before decrement
- * @lockcnt: pointer to lockref structure
- * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
- */
-static inline int lockref_put_or_lock(struct lockref *lockref)
-{
-	spin_lock(&lockref->lock);
-	if (lockref->count <= 1)
-		return 0;
-	lockref->count--;
-	spin_unlock(&lockref->lock);
-	return 1;
-}
+extern void lockref_get(struct lockref *);
+extern int lockref_get_not_zero(struct lockref *);
+extern int lockref_get_or_lock(struct lockref *);
+extern int lockref_put_or_lock(struct lockref *);
 
 #endif /* __LINUX_LOCKREF_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 71d9f81f6eed..65561716c16c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -48,6 +48,16 @@ config STMP_DEVICE
 config PERCPU_RWSEM
 	boolean
 
+config ARCH_USE_CMPXCHG_LOCKREF
+	bool
+
+config CMPXCHG_LOCKREF
+	def_bool y if ARCH_USE_CMPXCHG_LOCKREF
+	depends on SMP
+	depends on !GENERIC_LOCKBREAK
+	depends on !DEBUG_SPINLOCK
+	depends on !DEBUG_LOCK_ALLOC
+
 config CRC_CCITT
 	tristate "CRC-CCITT functions"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index 7baccfd8a4e9..f2cb3082697c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,6 +20,7 @@ lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
+obj-y	+= lockref.o
 
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
diff --git a/lib/lockref.c b/lib/lockref.c
new file mode 100644
index 000000000000..7819c2d1d315
--- /dev/null
+++ b/lib/lockref.c
@@ -0,0 +1,127 @@
+#include <linux/export.h>
+#include <linux/lockref.h>
+
+#ifdef CONFIG_CMPXCHG_LOCKREF
+
+/*
+ * Note that the "cmpxchg()" reloads the "old" value for the
+ * failure case.
+ */
+#define CMPXCHG_LOOP(CODE, SUCCESS) do {					\
+	struct lockref old;							\
+	BUILD_BUG_ON(sizeof(old) != 8);						\
+	old.lock_count = ACCESS_ONCE(lockref->lock_count);			\
+	while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {	\
+		struct lockref new = old, prev = old;				\
+		CODE								\
+		old.lock_count = cmpxchg(&lockref->lock_count,			\
+					 old.lock_count, new.lock_count);	\
+		if (likely(old.lock_count == prev.lock_count)) {		\
+			SUCCESS;						\
+		}								\
+	}									\
+} while (0)
+
+#else
+
+#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
+
+#endif
+
+/**
+ * lockref_get - Increments reference count unconditionally
+ * @lockcnt: pointer to lockref structure
+ *
+ * This operation is only valid if you already hold a reference
+ * to the object, so you know the count cannot be zero.
+ */
+void lockref_get(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count++;
+	,
+		return;
+	);
+
+	spin_lock(&lockref->lock);
+	lockref->count++;
+	spin_unlock(&lockref->lock);
+}
+EXPORT_SYMBOL(lockref_get);
+
+/**
+ * lockref_get_not_zero - Increments count unless the count is 0
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count was zero
+ */
+int lockref_get_not_zero(struct lockref *lockref)
+{
+	int retval;
+
+	CMPXCHG_LOOP(
+		new.count++;
+		if (!old.count)
+			return 0;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	retval = 0;
+	if (lockref->count) {
+		lockref->count++;
+		retval = 1;
+	}
+	spin_unlock(&lockref->lock);
+	return retval;
+}
+EXPORT_SYMBOL(lockref_get_not_zero);
+
+/**
+ * lockref_get_or_lock - Increments count unless the count is 0
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count was zero
+ * and we got the lock instead.
+ */
+int lockref_get_or_lock(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count++;
+		if (!old.count)
+			break;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	if (!lockref->count)
+		return 0;
+	lockref->count++;
+	spin_unlock(&lockref->lock);
+	return 1;
+}
+EXPORT_SYMBOL(lockref_get_or_lock);
+
+/**
+ * lockref_put_or_lock - decrements count unless count <= 1 before decrement
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
+ */
+int lockref_put_or_lock(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count--;
+		if (old.count <= 1)
+			break;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	if (lockref->count <= 1)
+		return 0;
+	lockref->count--;
+	spin_unlock(&lockref->lock);
+	return 1;
+}
+EXPORT_SYMBOL(lockref_put_or_lock);
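
The core trick in CMPXCHG_LOOP above is that the spinlock and the reference count share one 64-bit word, so an unlocked lockref can have its count changed with a single compare-and-exchange instead of a lock/unlock round trip. Below is a minimal userspace sketch of that idea using C11 atomics; the names (my_lockref, my_lockref_get_not_zero) and the 32/32 packing are illustrative assumptions, not the kernel's types, and the real code additionally falls back to taking the spinlock when the fast path gives up.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Non-atomic view used to pack/unpack snapshots of the combined word. */
union lockref_view {
	uint64_t lock_count;
	struct {
		uint32_t locked;	/* 0 = "spinlock" free, 1 = held */
		uint32_t count;		/* reference count */
	};
};

struct my_lockref {
	_Atomic uint64_t lock_count;	/* lock and count, CAS-able as one word */
};

/* Bump a non-zero count without touching the embedded lock (fast path only). */
static int my_lockref_get_not_zero(struct my_lockref *ref)
{
	union lockref_view old, new;

	old.lock_count = atomic_load(&ref->lock_count);
	while (old.locked == 0) {		/* only try while the lock looks free */
		if (old.count == 0)
			return 0;		/* object may already be on its way out */
		new = old;
		new.count++;
		/* On failure, old.lock_count is refreshed with the current value. */
		if (atomic_compare_exchange_weak(&ref->lock_count,
						 &old.lock_count,
						 new.lock_count))
			return 1;
	}
	return 0;	/* lock held: the kernel version falls back to spin_lock() */
}

int main(void)
{
	struct my_lockref ref;
	union lockref_view view;

	view.lock_count = 0;	/* "spinlock" free... */
	view.count = 1;		/* ...with one existing reference */
	atomic_init(&ref.lock_count, view.lock_count);

	int got = my_lockref_get_not_zero(&ref);
	view.lock_count = atomic_load(&ref.lock_count);
	printf("got=%d count=%u\n", got, view.count);	/* expect got=1 count=2 */
	return 0;
}

The BUILD_BUG_ON(sizeof(old) != 8) in the macro plays the same role as the fixed uint64_t here: the whole point collapses if the lock and count no longer fit in one cmpxchg-able word.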
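d_rcu_to_refcount() in the fs/namei.c hunk pairs that lockref fast path with a sequence-count check: take the reference (or the lock) first, then verify the sequence number sampled during the RCU walk, and keep the reference only if nothing changed. The following single-threaded sketch mirrors just that control flow; struct obj, get_or_lock() and the bare counters are stand-ins for the dentry, lockref_get_or_lock() and d_seq, not the kernel API.

#include <stdatomic.h>
#include <stdio.h>

struct obj {
	_Atomic unsigned seq;	/* bumped whenever the object changes (d_seq stand-in) */
	unsigned count;		/* reference count */
	int locked;		/* pretend spinlock: 0 = free, 1 = held */
};

/* Stand-in for lockref_get_or_lock(): ref taken (1) or "lock" held (0). */
static int get_or_lock(struct obj *o)
{
	if (o->count) {
		o->count++;
		return 1;
	}
	o->locked = 1;
	return 0;
}

/* Mirror of d_rcu_to_refcount(): keep the ref only if seq still matches. */
static int rcu_to_refcount(struct obj *o, unsigned seq)
{
	int gotref = get_or_lock(o);

	if (atomic_load(&o->seq) != seq) {	/* read_seqcount_retry() stand-in */
		if (gotref)
			o->count--;		/* dput() stand-in */
		else
			o->locked = 0;		/* spin_unlock() stand-in */
		return -1;			/* the patch returns -ECHILD */
	}

	if (!gotref) {				/* count was zero: bump it under the "lock" */
		o->count++;
		o->locked = 0;
	}
	return 0;
}

int main(void)
{
	struct obj o = { .count = 1 };
	unsigned seq = atomic_load(&o.seq);
	int ret;

	ret = rcu_to_refcount(&o, seq);
	printf("unchanged: %d (count=%u)\n", ret, o.count);	/* 0, count 2 */

	atomic_fetch_add(&o.seq, 1);		/* simulate a concurrent rename/unlink */
	ret = rcu_to_refcount(&o, seq);
	printf("changed:   %d (count=%u)\n", ret, o.count);	/* -1, count back to 2 */
	return 0;
}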