438 lines
20 KiB
C
438 lines
20 KiB
C
/* Macros for atomic functionality for tile.
|
|
Copyright (C) 2011-2022 Free Software Foundation, Inc.
|
|
Contributed by Walter Lee (walt@tilera.com)
|
|
|
|
This file is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published by the
|
|
Free Software Foundation; either version 3, or (at your option) any
|
|
later version.
|
|
|
|
This file is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
Under Section 7 of GPL version 3, you are granted additional
|
|
permissions described in the GCC Runtime Library Exception, version
|
|
3.1, as published by the Free Software Foundation.
|
|
|
|
You should have received a copy of the GNU General Public License and
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
|
|
/* Provides macros for common atomic functionality. */
|
|
|
|
#ifndef _ATOMIC_H_
|
|
#define _ATOMIC_H_
|
|
|
|
#ifdef __tilegx__
|
|
/* Atomic instruction macros
|
|
|
|
The macros provided by atomic.h simplify access to the TILE-Gx
|
|
architecture's atomic instructions. The architecture provides a
|
|
variety of atomic instructions, including "exchange", "compare and
|
|
exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
|
|
"fetch and ADD if greater than or equal to zero".
|
|
|
|
No barrier or fence semantics are implied by any of the atomic
|
|
instructions for manipulating memory; you must specify the barriers
|
|
that you wish explicitly, using the provided macros.
|
|
|
|
Any integral 32- or 64-bit value can be used as the argument
|
|
to these macros, such as "int", "long long", "unsigned long", etc.
|
|
The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
|
|
The "exchange" and "compare and exchange" macros may also take
|
|
pointer values. We use the pseudo-type "VAL" in the documentation
|
|
to indicate the use of an appropriate type. */
|
|
#else
|
|
/* Atomic instruction macros
|
|
|
|
The macros provided by atomic.h simplify access to the Tile
|
|
architecture's atomic instructions. Since the architecture
|
|
supports test-and-set as its only in-silicon atomic operation, many
|
|
of the operations provided by this header are implemented as
|
|
fast-path calls to Linux emulation routines.
|
|
|
|
Using the kernel for atomic operations allows userspace to take
|
|
advantage of the kernel's existing atomic-integer support (managed
|
|
by a distributed array of locks). The kernel provides proper
|
|
ordering among simultaneous atomic operations on different cores,
|
|
and guarantees a process cannot be context-switched part way
|
|
through an atomic operation. By virtue of sharing the kernel
|
|
atomic implementation, the userspace atomic operations
|
|
are compatible with the atomic methods provided by the kernel's
|
|
futex() syscall API. Note that these operations never cause Linux
|
|
kernel scheduling, and are in fact invisible to the kernel; they
|
|
simply act as regular function calls but with an elevated privilege
|
|
level. Note that the kernel's distributed lock array is hashed by
|
|
using only VA bits from the atomic value's address (to avoid the
|
|
performance hit of page table locking and multiple page-table
|
|
lookups to get the PA) and only the VA bits that are below page
|
|
granularity (to properly lock simultaneous accesses to the same
|
|
page mapped at different VAs). As a result, simultaneous atomic
|
|
operations on values whose addresses are at the same offset on a
|
|
page will contend in the kernel for the same lock array element.
|
|
|
|
No barrier or fence semantics are implied by any of the atomic
|
|
instructions for manipulating memory; you must specify the barriers
|
|
that you wish explicitly, using the provided macros.
|
|
|
|
Any integral 32- or 64-bit value can be used as the argument
|
|
to these macros, such as "int", "long long", "unsigned long", etc.
|
|
The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
|
|
The "exchange" and "compare and exchange" macros may also take
|
|
pointer values. We use the pseudo-type "VAL" in the documentation
|
|
to indicate the use of an appropriate type.
|
|
|
|
   The 32-bit routines are implemented using a single kernel fast
   syscall, as is the 64-bit compare-and-exchange.  The other 64-bit
   routines are implemented by looping over the 64-bit
   compare-and-exchange routine, so they may be less efficient.  */
|
|
#endif
|
|
|
|
#ifdef __tilegx__
/* Special-purpose register written (via __insn_mtspr) with the expected
   value before a cmpexch/cmpexch4 instruction is issued.  */
#define SPR_CMPEXCH_VALUE 0x2780
#else
/* Selector values handed to the "swint1" instruction through the "R10"
   asm constraint; one per emulated atomic primitive.  */
#define __NR_FAST_cmpxchg        -1
#define __NR_FAST_atomic_update  -2
#define __NR_FAST_cmpxchg64      -3
#endif
|
|
|
|
|
|
/* 32-bit integer compare-and-exchange. */
|
|
static __inline __attribute__ ((always_inline))
|
|
int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
|
|
int oldval, int newval)
|
|
{
|
|
#ifdef __tilegx__
|
|
__insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
|
|
return __insn_cmpexch4 (mem, newval);
|
|
#else
|
|
int result;
|
|
__asm__ __volatile__ ("swint1":"=R00" (result),
|
|
"=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
|
|
"R01" (oldval), "R02" (newval), "m" (*mem):"r20",
|
|
"r21", "r22", "r23", "r24", "r25", "r26", "r27",
|
|
"r28", "r29", "memory");
|
|
return result;
|
|
#endif
|
|
}
|
|
|
|
/* 64-bit integer compare-and-exchange. */
|
|
static __inline __attribute__ ((always_inline))
|
|
long long arch_atomic_val_compare_and_exchange_8 (volatile long long
|
|
*mem, long long oldval,
|
|
long long newval)
|
|
{
|
|
#ifdef __tilegx__
|
|
__insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
|
|
return __insn_cmpexch (mem, newval);
|
|
#else
|
|
unsigned int result_lo, result_hi;
|
|
unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
|
|
unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
|
|
__asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
|
|
"=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
|
|
"R02" (oldval_lo), "R03" (oldval_hi),
|
|
"R04" (newval_lo), "R05" (newval_hi),
|
|
"m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
|
|
"r26", "r27", "r28", "r29", "memory");
|
|
return ((long long) result_hi) << 32 | result_lo;
|
|
#endif
|
|
}
|
|
|
|
/* Deliberately non-existent symbol.  The size-dispatching macros in
   this header reference it for operand sizes other than 4 and 8, so a
   surviving call indicates a bug in the caller and is diagnosed via
   the "warning" attribute.  */
extern __attribute__ ((warning ("sizeof atomic argument not 4 or 8")))
int __arch_atomic_error_bad_argument_size (void);
|
|
|
|
|
|
/* Type-generic compare-and-exchange.

   Dispatches on sizeof(*MEM) to the 8- or 4-byte routine; any other
   size references __arch_atomic_error_bad_argument_size().  The
   "(x)-(x)" typeof casts convert O and N through an arithmetic type
   first, which lets pointer values be passed as well as integers.
   The result is cast back to the type of *MEM.  */
#define arch_atomic_val_compare_and_exchange(mem, o, n)                 \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      (sizeof(*(mem)) == 8                                              \
       ? arch_atomic_val_compare_and_exchange_8(                        \
           (volatile long long*)(mem),                                  \
           (__typeof((o)-(o)))(o),                                      \
           (__typeof((n)-(n)))(n))                                      \
       : sizeof(*(mem)) == 4                                            \
       ? arch_atomic_val_compare_and_exchange_4(                        \
           (volatile int*)(mem),                                        \
           (__typeof((o)-(o)))(o),                                      \
           (__typeof((n)-(n)))(n))                                      \
       : __arch_atomic_error_bad_argument_size());                      \
  })
|
|
|
|
/* Boolean compare-and-exchange: nonzero when the value returned by
   arch_atomic_val_compare_and_exchange() equals O.  O is evaluated
   once; the match case is hinted as the likely outcome.  */
#define arch_atomic_bool_compare_and_exchange(mem, o, n)                \
  __extension__ ({                                                      \
    __typeof(o) __o = (o);                                              \
    __builtin_expect(                                                   \
      arch_atomic_val_compare_and_exchange((mem), __o, (n)) == __o,     \
      1);                                                               \
  })
|
|
|
|
|
|
/* Read-modify-write built on compare-and-exchange.

   Reads *MEM, then repeatedly attempts to swap in EXPR until the value
   reported by the compare-and-exchange matches the one EXPR was
   computed from.  EXPR is re-evaluated on every attempt and normally
   refers to the locals __old (current guess at the contents) and
   __value (VALUE, evaluated once).  Yields the last value of __old,
   i.e. the contents that the successful exchange replaced.  */
#define __arch_atomic_update_cmpxchg(mem, value, expr)                  \
  __extension__ ({                                                      \
    __typeof(value) __value = (value);                                  \
    __typeof(*(mem)) *__mem = (mem);                                    \
    __typeof(*(mem)) __old = *__mem;                                    \
    __typeof(*(mem)) __guess;                                           \
    do                                                                  \
      {                                                                 \
        __guess = __old;                                                \
        __old = arch_atomic_val_compare_and_exchange(__mem, __old,      \
                                                     (expr));           \
      }                                                                 \
    while (__builtin_expect(__old != __guess, 0));                      \
    __old;                                                              \
  })
|
|
|
|
#ifdef __tilegx__
|
|
|
|
/* Generic atomic op with 8- or 4-byte variant (tilegx flavour).

   OP names the instruction family: __insn_<OP> is used for 8-byte
   operands and __insn_<OP>4 for 4-byte operands; any other size
   references __arch_atomic_error_bad_argument_size().  The _mask,
   _addend, and _expr arguments are ignored on tilegx.  */
#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr)     \
  __extension__ ({                                                      \
    ((__typeof(*(mem)))                                                 \
     (sizeof(*(mem)) == 8                                               \
      ? (__typeof(*(mem)-*(mem)))__insn_##op(                           \
          (volatile void *)(mem),                                       \
          (long long)(__typeof((value)-(value)))(value))                \
      : sizeof(*(mem)) == 4                                             \
      ? (int)__insn_##op##4(                                            \
          (volatile void *)(mem),                                       \
          (int)(__typeof((value)-(value)))(value))                      \
      : __arch_atomic_error_bad_argument_size()));                      \
  })
|
|
|
|
#else
|
|
|
|
/* This uses TILEPro's fast syscall support to atomically compute:
|
|
|
|
int old = *ptr;
|
|
*ptr = (old & mask) + addend;
|
|
return old;
|
|
|
|
This primitive can be used for atomic exchange, add, or, and.
|
|
Only 32-bit support is provided. */
|
|
static __inline __attribute__ ((always_inline))
|
|
int
|
|
__arch_atomic_update_4 (volatile int *mem, int mask, int addend)
|
|
{
|
|
int result;
|
|
__asm__ __volatile__ ("swint1":"=R00" (result),
|
|
"=m" (*mem):"R10" (__NR_FAST_atomic_update),
|
|
"R00" (mem), "R01" (mask), "R02" (addend),
|
|
"m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
|
|
"r26", "r27", "r28", "r29", "memory");
|
|
return result;
|
|
}
|
|
|
|
/* Generic atomic op with 8- or 4-byte variant (tilepro flavour).

   8-byte operands loop over compare-and-exchange using EXPR; 4-byte
   operands use __arch_atomic_update_4() with MASK and ADDEND; any
   other size references __arch_atomic_error_bad_argument_size().
   The _op argument is ignored on tilepro.  */
#define __arch_atomic_update(mem, value, _op, mask, addend, expr)       \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      (sizeof(*(mem)) == 8                                              \
       ? __arch_atomic_update_cmpxchg((mem), (value), (expr))           \
       : sizeof(*(mem)) == 4                                            \
       ? __arch_atomic_update_4((volatile int*)(mem),                   \
                                (__typeof((mask)-(mask)))(mask),        \
                                (__typeof((addend)-(addend)))(addend))  \
       : __arch_atomic_error_bad_argument_size());                      \
  })
|
|
|
|
#endif /* __tilegx__ */
|
|
|
|
|
|
/* Each macro below forwards to __arch_atomic_update(), supplying the
   tilegx instruction name, the tilepro (mask, addend) pair for the
   32-bit primitive, and the expression for the compare-and-exchange
   loop, in that order.  */

/* Atomically store NEWVALUE into *MEM.  */
#define arch_atomic_exchange(mem, newvalue) \
  __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)

/* Atomically add VALUE to *MEM.  */
#define arch_atomic_add(mem, value) \
  __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)

/* Atomically subtract VALUE from *MEM (adds the negation).  */
#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))

/* Atomically add 1 to *MEM.  */
#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)

/* Atomically add -1 to *MEM.  */
#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)

/* Atomically AND MASK into *MEM.  */
#define arch_atomic_and(mem, mask) \
  __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)
|
|
|
|
/* Atomically OR MASK into *MEM.

   For the (mask, addend) primitive this is expressed as
   (old & ~MASK) + MASK.  MASK is parenthesized before it is
   complemented so that a compound argument such as "a | b" is
   complemented as a whole rather than as "~a | b".  */
#define arch_atomic_or(mem, mask) \
  __arch_atomic_update(mem, mask, fetchor, ~(mask), mask, __old | __value)
|
|
|
|
/* Atomically XOR MASK into *MEM.  Always implemented with the
   compare-and-exchange loop.  */
#define arch_atomic_xor(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)

/* Atomically replace *MEM with ~(*MEM & MASK).  Always implemented
   with the compare-and-exchange loop.  */
#define arch_atomic_nand(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
|
|
|
|
/* Atomically set bit number BIT of *MEM.  Yields the result of
   arch_atomic_or() masked down to that single bit.  */
#define arch_atomic_bit_set(mem, bit)                                   \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    arch_atomic_or((mem), __mask) & __mask;                             \
  })
|
|
|
|
/* Atomically clear bit number BIT of *MEM.  Yields the result of
   arch_atomic_and() masked down to that single bit.  */
#define arch_atomic_bit_clear(mem, bit)                                 \
  __extension__ ({                                                      \
    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);             \
    arch_atomic_and((mem), ~__mask) & __mask;                           \
  })
|
|
|
|
#ifdef __tilegx__
/* Atomically store a new value to memory.

   Unlike the other atomic routines, which require 32- or 64-bit
   types, this one may be used with types of any size.  It exists for
   compatibility with TILEPro, which required an explicit atomic
   operation for stores that needed to be atomic with respect to the
   other atomic methods in this header.  */
#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
#else
/* Atomically store VALUE to *MEM for 8-, 4-, 2- or 1-byte objects.

   8-byte stores loop over compare-and-exchange; 4-byte stores use
   __arch_atomic_update_4() with a zero mask.  2- and 1-byte stores
   locate the enclosing aligned 32-bit word, compute the bit offset of
   the target within it (8 bits per byte of address offset), and merge
   the new bits into that word with a compare-and-exchange loop.
   Other sizes match no case and perform no store.  */
#define arch_atomic_write(mem, value)                                   \
  do {                                                                  \
    __typeof(mem) __aw_mem = (mem);                                     \
    __typeof(value) __aw_val = (value);                                 \
    unsigned int *__aw_mem32;                                           \
    unsigned int __aw_intval;                                           \
    unsigned int __aw_val32;                                            \
    unsigned int __aw_off;                                              \
    unsigned int __aw_mask;                                             \
    __aw_intval = (__typeof((value) - (value)))__aw_val;                \
    switch (sizeof(*__aw_mem))                                          \
      {                                                                 \
      case 8:                                                           \
        __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value);      \
        break;                                                          \
      case 4:                                                           \
        __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval);        \
        break;                                                          \
      case 2:                                                           \
        /* Halfword: address bit 1 selects the half of the word.  */    \
        __aw_off = 8 * ((long)__aw_mem & 0x2);                          \
        __aw_mask = 0xffffU << __aw_off;                                \
        __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);           \
        __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;             \
        __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,            \
                                     (__old & ~__aw_mask) | __value);   \
        break;                                                          \
      case 1:                                                           \
        /* Byte: address bits 0-1 select the byte of the word.  */      \
        __aw_off = 8 * ((long)__aw_mem & 0x3);                          \
        __aw_mask = 0xffU << __aw_off;                                  \
        __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);           \
        __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;             \
        __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,            \
                                     (__old & ~__aw_mask) | __value);   \
        break;                                                          \
      }                                                                 \
  } while (0)
#endif
|
|
|
|
/* Compiler barrier.

   Prevents the compiler from moving loads or stores across this
   point.  Any value that was loaded before the barrier must be
   reloaded by the compiler afterwards.  */
#define arch_atomic_compiler_barrier() \
  __asm__ __volatile__ ("" : : : "memory")
|
|
|
|
/* Full memory barrier.

   Has the semantics of arch_atomic_compiler_barrier(), and in addition
   ensures that previous stores are visible to other cores and that
   all previously loaded values have been placed into their target
   register on this core.  */
#define arch_atomic_full_barrier() __insn_mf ()
|
|
|
|
/* Read memory barrier.

   Ensures that all reads by this processor that occurred prior to the
   barrier have completed, and that no reads that occur after the
   barrier on this processor are initiated before it.

   On current TILE chips a read barrier is implemented as a full
   barrier, but this may not be true in later versions of the
   architecture.

   See also arch_atomic_acquire_barrier() for the appropriate idiom to
   use to ensure no reads are lifted above an atomic lock
   instruction.  */
#define arch_atomic_read_barrier() arch_atomic_full_barrier ()
|
|
|
|
/* Write memory barrier.

   Ensures that all writes by this processor that occurred prior to
   the barrier have completed, and that no writes that occur after the
   barrier on this processor are initiated before it.

   On current TILE chips a write barrier is implemented as a full
   barrier, but this may not be true in later versions of the
   architecture.

   See also arch_atomic_release_barrier() for the appropriate idiom to
   use to ensure all writes are complete prior to an atomic unlock
   instruction.  */
#define arch_atomic_write_barrier() arch_atomic_full_barrier ()
|
|
|
|
/* Lock acquisition barrier.

   Ensures that no load operations that follow this macro in the
   program can issue prior to the barrier.  Without such a barrier the
   compiler can reorder them to issue earlier, or the hardware can
   issue them speculatively.  The latter is not currently done in the
   Tile microarchitecture, but using this operation improves
   portability to future implementations.

   Intended for the "acquire" path of locking, that is, when entering
   a critical section.  Use it after the atomic operation that
   actually acquires the lock, together with a "control dependency"
   that checks the result of that operation to see whether the lock
   was in fact acquired.  See arch_atomic_read_barrier() for a
   heavier-weight barrier to use in certain unusual constructs, or
   arch_atomic_acquire_barrier_value() if no control dependency
   exists.  */
#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier ()
|
|
|
|
/* Lock release barrier.

   Ensures that no store operations that precede this macro in the
   program complete subsequent to the barrier.  Without such a barrier
   the compiler can reorder stores to issue later, or stores can still
   be outstanding in the memory network.

   Intended for the "release" path of locking, that is, when leaving a
   critical section.  Use it before the operation (such as a store of
   zero) that actually releases the lock.  */
#define arch_atomic_release_barrier() arch_atomic_write_barrier ()
|
|
|
|
/* Barrier until the read of a particular value is complete.

   This is occasionally useful when constructing certain locking
   scenarios.  For example, a routine might issue an atomic
   instruction to enter a critical section, then read one or more
   values within the critical section without first checking whether
   the critical section was in fact acquired, and only later check the
   atomic instruction's result to see whether the lock was acquired.
   If it was, the routine can properly release the lock and know that
   the values it read were valid.

   In that scenario it is required to wait for the result of the
   atomic instruction, even if the value itself is not checked.  This
   guarantees that, if the atomic instruction succeeded in taking the
   lock, the lock was held before any reads in the critical section
   issued.  */
#define arch_atomic_acquire_barrier_value(val) \
  __asm__ __volatile__ ("move %0, %0" : : "r" (val))
|
|
|
|
/* Access the given variable in memory exactly once.

   In some contexts an algorithm may need to force an access to
   memory, since otherwise the compiler may think it can optimize away
   a memory load or store; for example, in a loop that polls memory to
   see whether another cpu has updated it yet.  Generally this is only
   required for certain very carefully hand-tuned algorithms; using it
   unnecessarily may result in performance losses.

   A related use is to ensure that the compiler does not rematerialize
   the value of "x" by reloading it from memory unexpectedly; the
   "volatile" marking prevents such rematerialization.  This is
   helpful if an algorithm needs to read a variable without locking
   but needs it to have the same value if it ends up being used
   several times within the algorithm.

   Note that multiple uses of this macro are guaranteed to be ordered,
   i.e. the compiler will not reorder stores or loads that are wrapped
   in arch_atomic_access_once().  */
#define arch_atomic_access_once(x) (*(volatile __typeof (x) *) &(x))
|
|
|
|
|
|
|
|
#endif /* !_ATOMIC_H_ */
|