ca3d30cc02
fls(N), ffs(N) and fls64(N) can be optimised on x86_64. Currently they use a CMOV instruction after the BSR/BSF to set the destination register to -1 if the value to be scanned was 0 (in which case BSR/BSF set the Z flag). Instead, according to the AMD64 specification, we can make use of the fact that BSR/BSF don't modify their output register if their input is 0. By preloading the output with -1 and incrementing the result, we achieve the desired result without the need for a conditional check. The Intel x86_64 specification, however, says that the result of BSR/BSF in such a case is undefined. That said, when queried, one of the Intel CPU architects said that the behaviour on all Intel CPUs is that: (1) with BSRQ/BSFQ, the 64-bit destination register is written with its original value if the source is 0, thus, in essence, giving the effect we want. And, (2) with BSRL/BSFL, the lower half of the 64-bit destination register is written with its original value if the source is 0, and the upper half is cleared, thus giving us the effect we want (we return a 4-byte int). Further, it was indicated that they (Intel) are unlikely to get away with changing the behaviour. It might be possible to optimise the 32-bit versions of these functions, but there's a lot more variation, and so the effective non-destructive property of BSRL/BSFL cannot be relied on. [ hpa: specifically, some 486 chips are known to NOT have this property. 
] I have benchmarked these functions on my Core2 Duo test machine using the following program: #include <stdlib.h> #include <stdio.h> #ifndef __x86_64__ #error #endif #define PAGE_SHIFT 12 typedef unsigned long long __u64, u64; typedef unsigned int __u32, u32; #define noinline __attribute__((noinline)) static __always_inline int fls64(__u64 x) { long bitpos = -1; asm("bsrq %1,%0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; } static inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) : "rm" (word)); return word; } static __always_inline int old_fls64(__u64 x) { if (x == 0) return 0; return __fls(x) + 1; } static noinline // __attribute__((const)) int old_get_order(unsigned long size) { int order; size = (size - 1) >> (PAGE_SHIFT - 1); order = -1; do { size >>= 1; order++; } while (size); return order; } static inline __attribute__((const)) int get_order_old_fls64(unsigned long size) { int order; size--; size >>= PAGE_SHIFT; order = old_fls64(size); return order; } static inline __attribute__((const)) int get_order(unsigned long size) { int order; size--; size >>= PAGE_SHIFT; order = fls64(size); return order; } unsigned long prevent_optimise_out; static noinline unsigned long test_old_get_order(void) { unsigned long n, total = 0; long rep, loop; for (rep = 1000000; rep > 0; rep--) { for (loop = 0; loop <= 16384; loop += 4) { n = 1UL << loop; total += old_get_order(n); } } return total; } static noinline unsigned long test_get_order_old_fls64(void) { unsigned long n, total = 0; long rep, loop; for (rep = 1000000; rep > 0; rep--) { for (loop = 0; loop <= 16384; loop += 4) { n = 1UL << loop; total += get_order_old_fls64(n); } } return total; } static noinline unsigned long test_get_order(void) { unsigned long n, total = 0; long rep, loop; for (rep = 1000000; rep > 0; rep--) { for (loop = 0; loop <= 16384; loop += 4) { n = 1UL << loop; total += get_order(n); } } return total; } int main(int argc, char **argv) { unsigned long total; switch 
(argc) { case 1: total = test_old_get_order(); break; case 2: total = test_get_order_old_fls64(); break; default: total = test_get_order(); break; } prevent_optimise_out = total; return 0; } This allows me to test the use of the old fls64() implementation and the new fls64() implementation and also to contrast these to the out-of-line loop-based implementation of get_order(). The results were: warthog>time ./get_order real 1m37.191s user 1m36.313s sys 0m0.861s warthog>time ./get_order x real 0m16.892s user 0m16.586s sys 0m0.287s warthog>time ./get_order x x real 0m7.731s user 0m7.727s sys 0m0.002s Using the current upstream fls64() as a basis for an inlined get_order() [the second result above] is much faster than using the current out-of-line loop-based get_order() [the first result above]. Using my optimised inline fls64()-based get_order() [the third result above] is even faster still. [ hpa: changed the selection of 32 vs 64 bits to use CONFIG_X86_64 instead of comparing BITS_PER_LONG, updated comments, rebased manually on top of 83d99df7c4bf x86, bitops: Move fls64.h inside __KERNEL__ ] Signed-off-by: David Howells <dhowells@redhat.com> Link: http://lkml.kernel.org/r/20111213145654.14362.39868.stgit@warthog.procyon.org.uk Cc: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
515 lines
13 KiB
C
515 lines
13 KiB
C
#ifndef _ASM_X86_BITOPS_H
|
|
#define _ASM_X86_BITOPS_H
|
|
|
|
/*
|
|
* Copyright 1992, Linus Torvalds.
|
|
*
|
|
* Note: inlines with more than a single statement should be marked
|
|
* __always_inline to avoid problems with older gcc's inlining heuristics.
|
|
*/
|
|
|
|
#ifndef _LINUX_BITOPS_H
|
|
#error only <linux/bitops.h> can be included directly
|
|
#endif
|
|
|
|
#include <linux/compiler.h>
|
|
#include <asm/alternative.h>
|
|
|
|
/*
|
|
* These have to be done with inline assembly: that way the bit-setting
|
|
* is guaranteed to be atomic. All bit operations return 0 if the bit
|
|
* was cleared before the operation and != 0 if it was not.
|
|
*
|
|
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
|
|
*/
|
|
|
|
/*
 * BITOP_ADDR(x) builds the asm memory operand through which the bit
 * operations in this file access the word at @x.
 *
 * gcc 4.1 and later get a read-write ("+m") operand.  Older compilers get
 * a write-only ("=m") operand instead: technically wrong, but this avoids
 * compilation errors on some gcc versions.
 */
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#else
#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
#endif

/* Shorthand for functions whose pointer argument is named "addr". */
#define ADDR BITOP_ADDR(addr)
|
|
|
|
/*
 * We do the locked ops that don't return the old value as
 * a mask operation on a byte.
 *
 * IS_IMMEDIATE(nr)          - nonzero when @nr is a compile-time constant
 * CONST_MASK_ADDR(nr, addr) - memory operand for the byte holding bit @nr,
 *                             i.e. the byte at offset nr / 8 from @addr
 * CONST_MASK(nr)            - mask selecting bit @nr inside that byte
 */
#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr) (1 << ((nr) & 7))
|
|
|
|
/**
|
|
* set_bit - Atomically set a bit in memory
|
|
* @nr: the bit to set
|
|
* @addr: the address to start counting from
|
|
*
|
|
* This function is atomic and may not be reordered. See __set_bit()
|
|
* if you do not require the atomic guarantees.
|
|
*
|
|
* Note: there are no guarantees that this function will not be reordered
|
|
* on non x86 architectures, so if you are writing portable code,
|
|
* make sure not to rely on its reordering guarantees.
|
|
*
|
|
* Note that @nr may be almost arbitrarily large; this function is not
|
|
* restricted to acting on a single-word quantity.
|
|
*/
|
|
static __always_inline void
|
|
set_bit(unsigned int nr, volatile unsigned long *addr)
|
|
{
|
|
if (IS_IMMEDIATE(nr)) {
|
|
asm volatile(LOCK_PREFIX "orb %1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" ((u8)CONST_MASK(nr))
|
|
: "memory");
|
|
} else {
|
|
asm volatile(LOCK_PREFIX "bts %1,%0"
|
|
: BITOP_ADDR(addr) : "Ir" (nr) : "memory");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* __set_bit - Set a bit in memory
|
|
* @nr: the bit to set
|
|
* @addr: the address to start counting from
|
|
*
|
|
* Unlike set_bit(), this function is non-atomic and may be reordered.
|
|
* If it's called on the same region of memory simultaneously, the effect
|
|
* may be that only one operation succeeds.
|
|
*/
|
|
static inline void __set_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
|
|
}
|
|
|
|
/**
|
|
* clear_bit - Clears a bit in memory
|
|
* @nr: Bit to clear
|
|
* @addr: Address to start counting from
|
|
*
|
|
* clear_bit() is atomic and may not be reordered. However, it does
|
|
* not contain a memory barrier, so if it is used for locking purposes,
|
|
* you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
|
|
* in order to ensure changes are visible on other processors.
|
|
*/
|
|
static __always_inline void
|
|
clear_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
if (IS_IMMEDIATE(nr)) {
|
|
asm volatile(LOCK_PREFIX "andb %1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" ((u8)~CONST_MASK(nr)));
|
|
} else {
|
|
asm volatile(LOCK_PREFIX "btr %1,%0"
|
|
: BITOP_ADDR(addr)
|
|
: "Ir" (nr));
|
|
}
|
|
}
|
|
|
|
/*
 * clear_bit_unlock - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * clear_bit_unlock() is atomic and implies release semantics before the
 * memory operation: a barrier() is issued ahead of the clear_bit().
 * It can be used for an unlock.
 */
static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
{
	/* Order everything written so far before the bit is dropped. */
	barrier();
	clear_bit(nr, addr);
}
|
|
|
|
static inline void __clear_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
|
|
}
|
|
|
|
/*
 * __clear_bit_unlock - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * __clear_bit_unlock() is non-atomic and implies release semantics before
 * the memory operation.  It can be used for an unlock if no other CPUs can
 * concurrently modify other bits in the word.
 *
 * No memory barrier is required here, because x86 cannot reorder stores
 * past older loads; only a barrier() is issued before the store.  Same
 * principle as spin_unlock.
 */
static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
{
	/* Order everything written so far before the bit is dropped. */
	barrier();
	__clear_bit(nr, addr);
}
|
|
|
|
/*
 * Barriers to pair with clear_bit() when it is used for locking; on x86
 * both simply expand to barrier().
 */
#define smp_mb__before_clear_bit() barrier()
#define smp_mb__after_clear_bit() barrier()
|
|
|
|
/**
|
|
* __change_bit - Toggle a bit in memory
|
|
* @nr: the bit to change
|
|
* @addr: the address to start counting from
|
|
*
|
|
* Unlike change_bit(), this function is non-atomic and may be reordered.
|
|
* If it's called on the same region of memory simultaneously, the effect
|
|
* may be that only one operation succeeds.
|
|
*/
|
|
static inline void __change_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
|
|
}
|
|
|
|
/**
|
|
* change_bit - Toggle a bit in memory
|
|
* @nr: Bit to change
|
|
* @addr: Address to start counting from
|
|
*
|
|
* change_bit() is atomic and may not be reordered.
|
|
* Note that @nr may be almost arbitrarily large; this function is not
|
|
* restricted to acting on a single-word quantity.
|
|
*/
|
|
static inline void change_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
if (IS_IMMEDIATE(nr)) {
|
|
asm volatile(LOCK_PREFIX "xorb %1,%0"
|
|
: CONST_MASK_ADDR(nr, addr)
|
|
: "iq" ((u8)CONST_MASK(nr)));
|
|
} else {
|
|
asm volatile(LOCK_PREFIX "btc %1,%0"
|
|
: BITOP_ADDR(addr)
|
|
: "Ir" (nr));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* test_and_set_bit - Set a bit and return its old value
|
|
* @nr: Bit to set
|
|
* @addr: Address to count from
|
|
*
|
|
* This operation is atomic and cannot be reordered.
|
|
* It also implies a memory barrier.
|
|
*/
|
|
static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
|
|
"sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
/**
|
|
* test_and_set_bit_lock - Set a bit and return its old value for lock
|
|
* @nr: Bit to set
|
|
* @addr: Address to count from
|
|
*
|
|
* This is the same as test_and_set_bit on x86.
|
|
*/
|
|
static __always_inline int
|
|
test_and_set_bit_lock(int nr, volatile unsigned long *addr)
|
|
{
|
|
return test_and_set_bit(nr, addr);
|
|
}
|
|
|
|
/**
|
|
* __test_and_set_bit - Set a bit and return its old value
|
|
* @nr: Bit to set
|
|
* @addr: Address to count from
|
|
*
|
|
* This operation is non-atomic and can be reordered.
|
|
* If two examples of this operation race, one can appear to succeed
|
|
* but actually fail. You must protect multiple accesses with a lock.
|
|
*/
|
|
static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm("bts %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit), ADDR
|
|
: "Ir" (nr));
|
|
return oldbit;
|
|
}
|
|
|
|
/**
|
|
* test_and_clear_bit - Clear a bit and return its old value
|
|
* @nr: Bit to clear
|
|
* @addr: Address to count from
|
|
*
|
|
* This operation is atomic and cannot be reordered.
|
|
* It also implies a memory barrier.
|
|
*/
|
|
static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
/**
|
|
* __test_and_clear_bit - Clear a bit and return its old value
|
|
* @nr: Bit to clear
|
|
* @addr: Address to count from
|
|
*
|
|
* This operation is non-atomic and can be reordered.
|
|
* If two examples of this operation race, one can appear to succeed
|
|
* but actually fail. You must protect multiple accesses with a lock.
|
|
*/
|
|
static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile("btr %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit), ADDR
|
|
: "Ir" (nr));
|
|
return oldbit;
|
|
}
|
|
|
|
/* WARNING: non atomic and it can be reordered! */
|
|
static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile("btc %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit), ADDR
|
|
: "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
/**
|
|
* test_and_change_bit - Change a bit and return its old value
|
|
* @nr: Bit to change
|
|
* @addr: Address to count from
|
|
*
|
|
* This operation is atomic and cannot be reordered.
|
|
* It also implies a memory barrier.
|
|
*/
|
|
static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
|
|
{
|
|
return ((1UL << (nr % BITS_PER_LONG)) &
|
|
(addr[nr / BITS_PER_LONG])) != 0;
|
|
}
|
|
|
|
static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
|
|
{
|
|
int oldbit;
|
|
|
|
asm volatile("bt %2,%1\n\t"
|
|
"sbb %0,%0"
|
|
: "=r" (oldbit)
|
|
: "m" (*(unsigned long *)addr), "Ir" (nr));
|
|
|
|
return oldbit;
|
|
}
|
|
|
|
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
 * test_bit - Determine whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 */
static int test_bit(int nr, const volatile unsigned long *addr);
#endif

/*
 * Dispatch on whether @nr is a compile-time constant: constants go to the
 * plain C constant_test_bit(), everything else to the BT-based
 * variable_test_bit().
 */
#define test_bit(nr, addr)			\
	(__builtin_constant_p((nr))		\
	 ? constant_test_bit((nr), (addr))	\
	 : variable_test_bit((nr), (addr)))
|
|
|
|
/**
|
|
* __ffs - find first set bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no bit exists, so code should check against 0 first.
|
|
*/
|
|
static inline unsigned long __ffs(unsigned long word)
|
|
{
|
|
asm("bsf %1,%0"
|
|
: "=r" (word)
|
|
: "rm" (word));
|
|
return word;
|
|
}
|
|
|
|
/**
|
|
* ffz - find first zero bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no zero exists, so code should check against ~0UL first.
|
|
*/
|
|
static inline unsigned long ffz(unsigned long word)
|
|
{
|
|
asm("bsf %1,%0"
|
|
: "=r" (word)
|
|
: "r" (~word));
|
|
return word;
|
|
}
|
|
|
|
/*
|
|
* __fls: find last set bit in word
|
|
* @word: The word to search
|
|
*
|
|
* Undefined if no set bit exists, so code should check against 0 first.
|
|
*/
|
|
static inline unsigned long __fls(unsigned long word)
|
|
{
|
|
asm("bsr %1,%0"
|
|
: "=r" (word)
|
|
: "rm" (word));
|
|
return word;
|
|
}
|
|
|
|
#undef ADDR
|
|
|
|
#ifdef __KERNEL__
|
|
/**
|
|
* ffs - find first set bit in word
|
|
* @x: the word to search
|
|
*
|
|
* This is defined the same way as the libc and compiler builtin ffs
|
|
* routines, therefore differs in spirit from the other bitops.
|
|
*
|
|
* ffs(value) returns 0 if value is 0 or the position of the first
|
|
* set bit if value is nonzero. The first (least significant) bit
|
|
* is at position 1.
|
|
*/
|
|
static inline int ffs(int x)
|
|
{
|
|
int r;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
/*
|
|
* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
|
|
* dest reg is undefined if x==0, but their CPU architect says its
|
|
* value is written to set it to the same as before, except that the
|
|
* top 32 bits will be cleared.
|
|
*
|
|
* We cannot do this on 32 bits because at the very least some
|
|
* 486 CPUs did not behave this way.
|
|
*/
|
|
long tmp = -1;
|
|
asm("bsfl %1,%0"
|
|
: "=r" (r)
|
|
: "rm" (x), "0" (tmp));
|
|
#elif defined(CONFIG_X86_CMOV)
|
|
asm("bsfl %1,%0\n\t"
|
|
"cmovzl %2,%0"
|
|
: "=&r" (r) : "rm" (x), "r" (-1));
|
|
#else
|
|
asm("bsfl %1,%0\n\t"
|
|
"jnz 1f\n\t"
|
|
"movl $-1,%0\n"
|
|
"1:" : "=r" (r) : "rm" (x));
|
|
#endif
|
|
return r + 1;
|
|
}
|
|
|
|
/**
|
|
* fls - find last set bit in word
|
|
* @x: the word to search
|
|
*
|
|
* This is defined in a similar way as the libc and compiler builtin
|
|
* ffs, but returns the position of the most significant set bit.
|
|
*
|
|
* fls(value) returns 0 if value is 0 or the position of the last
|
|
* set bit if value is nonzero. The last (most significant) bit is
|
|
* at position 32.
|
|
*/
|
|
static inline int fls(int x)
|
|
{
|
|
int r;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
/*
|
|
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
|
|
* dest reg is undefined if x==0, but their CPU architect says its
|
|
* value is written to set it to the same as before, except that the
|
|
* top 32 bits will be cleared.
|
|
*
|
|
* We cannot do this on 32 bits because at the very least some
|
|
* 486 CPUs did not behave this way.
|
|
*/
|
|
long tmp = -1;
|
|
asm("bsrl %1,%0"
|
|
: "=r" (r)
|
|
: "rm" (x), "0" (tmp));
|
|
#elif defined(CONFIG_X86_CMOV)
|
|
asm("bsrl %1,%0\n\t"
|
|
"cmovzl %2,%0"
|
|
: "=&r" (r) : "rm" (x), "rm" (-1));
|
|
#else
|
|
asm("bsrl %1,%0\n\t"
|
|
"jnz 1f\n\t"
|
|
"movl $-1,%0\n"
|
|
"1:" : "=r" (r) : "rm" (x));
|
|
#endif
|
|
return r + 1;
|
|
}
|
|
|
|
/**
 * fls64 - find last set bit in a 64-bit word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffsll, but returns the position of the most significant set bit.
 *
 * fls64(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 64.
 */
#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
	/* Preloaded with -1 so that x == 0 ends up returning 0. */
	long msb = -1;

	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%0"
	    : "+r" (msb)
	    : "rm" (x));

	return msb + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif
|
|
|
|
#include <asm-generic/bitops/find.h>
|
|
|
|
#include <asm-generic/bitops/sched.h>
|
|
|
|
#define ARCH_HAS_FAST_MULTIPLIER 1
|
|
|
|
#include <asm/arch_hweight.h>
|
|
|
|
#include <asm-generic/bitops/const_hweight.h>
|
|
|
|
#include <asm-generic/bitops/le.h>
|
|
|
|
#include <asm-generic/bitops/ext2-atomic-setbit.h>
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* _ASM_X86_BITOPS_H */
|