Multithreaded locking fixes.

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4692 c046a42c-6fe2-441c-8c8c-71466251a162
2008-06-07 20:50:51 +00:00 · 2008-06-07 20:50:51 +00:00 · d597536303
parent 0a878c4760
commit d597536303
10 changed files with 447 additions and 237 deletions
--- a/cpu-defs.h
+++ b/cpu-defs.h
@ -166,6 +166,7 @@ typedef struct CPUTLBEntry {
                                                                        \
    void *next_cpu; /* next CPU sharing TB cache */                     \
    int cpu_index; /* CPU index (informative) */                        \
+    int running; /* Nonzero if cpu is currently running(usermode).  */  \
    /* user data */                                                     \
    void *opaque;                                                       \
                                                                        \
--- a/cpu-exec.c
+++ b/cpu-exec.c
@ -44,7 +44,6 @@
 #endif

 int tb_invalidated_flag;
-static unsigned long next_tb;

 //#define DEBUG_EXEC
 //#define DEBUG_SIGNAL
@ -93,8 +92,6 @@ static TranslationBlock *tb_find_slow(target_ulong pc,
    target_ulong phys_pc, phys_page1, phys_page2, virt_page2;
    uint8_t *tc_ptr;

-    spin_lock(&tb_lock);
-
    tb_invalidated_flag = 0;

    regs_to_env(); /* XXX: do it just before cpu_gen_code() */
@ -155,7 +152,6 @@ static TranslationBlock *tb_find_slow(target_ulong pc,
 found:
    /* we add the TB in the virtual pc hash table */
    env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
-    spin_unlock(&tb_lock);
    return tb;
 }

@ -228,14 +224,6 @@ static inline TranslationBlock *tb_find_fast(void)
    if (__builtin_expect(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                         tb->flags != flags, 0)) {
        tb = tb_find_slow(pc, cs_base, flags);
-        /* Note: we do it here to avoid a gcc bug on Mac OS X when
-           doing it in tb_find_slow */
-        if (tb_invalidated_flag) {
-            /* as some TB could have been invalidated because
-               of memory exceptions while generating the code, we
-               must recompute the hash index here */
-            next_tb = 0;
-        }
    }
    return tb;
 }
@ -249,6 +237,7 @@ int cpu_exec(CPUState *env1)
    int ret, interrupt_request;
    TranslationBlock *tb;
    uint8_t *tc_ptr;
+    unsigned long next_tb;

    if (cpu_halted(env1) == EXCP_HALTED)
        return EXCP_HALTED;
@ -577,7 +566,16 @@ int cpu_exec(CPUState *env1)
 #endif
                }
 #endif
+                spin_lock(&tb_lock);
                tb = tb_find_fast();
+                /* Note: we do it here to avoid a gcc bug on Mac OS X when
+                   doing it in tb_find_slow */
+                if (tb_invalidated_flag) {
+                    /* as some TB could have been invalidated because
+                       of memory exceptions while generating the code, we
+                       must recompute the hash index here */
+                    next_tb = 0;
+                }
 #ifdef DEBUG_EXEC
                if ((loglevel & CPU_LOG_EXEC)) {
                    fprintf(logfile, "Trace 0x%08lx [" TARGET_FMT_lx "] %s\n",
@ -594,11 +592,10 @@ int cpu_exec(CPUState *env1)
                        (env->kqemu_enabled != 2) &&
 #endif
                        tb->page_addr[1] == -1) {
-                    spin_lock(&tb_lock);
                    tb_add_jump((TranslationBlock *)(next_tb & ~3), next_tb & 3, tb);
-                    spin_unlock(&tb_lock);
                }
                }
+                spin_unlock(&tb_lock);
                tc_ptr = tb->tc_ptr;
                env->current_tb = tb;
                /* execute the generated code */
--- a/exec-all.h
+++ b/exec-all.h
@ -302,217 +302,7 @@ extern CPUWriteMemoryFunc *io_mem_write[IO_MEM_NB_ENTRIES][4];
 extern CPUReadMemoryFunc *io_mem_read[IO_MEM_NB_ENTRIES][4];
 extern void *io_mem_opaque[IO_MEM_NB_ENTRIES];

-#if defined(__hppa__)
-
-typedef int spinlock_t[4];
-
-#define SPIN_LOCK_UNLOCKED { 1, 1, 1, 1 }
-
-static inline void resetlock (spinlock_t *p)
-{
-    (*p)[0] = (*p)[1] = (*p)[2] = (*p)[3] = 1;
-}
-
-#else
-
-typedef int spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED 0
-
-static inline void resetlock (spinlock_t *p)
-{
-    *p = SPIN_LOCK_UNLOCKED;
-}
-
-#endif
-
-#if defined(__powerpc__)
-static inline int testandset (int *p)
-{
-    int ret;
-    __asm__ __volatile__ (
-                          "0:    lwarx %0,0,%1\n"
-                          "      xor. %0,%3,%0\n"
-                          "      bne 1f\n"
-                          "      stwcx. %2,0,%1\n"
-                          "      bne- 0b\n"
-                          "1:    "
-                          : "=&r" (ret)
-                          : "r" (p), "r" (1), "r" (0)
-                          : "cr0", "memory");
-    return ret;
-}
-#elif defined(__i386__)
-static inline int testandset (int *p)
-{
-    long int readval = 0;
-
-    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
-                          : "+m" (*p), "+a" (readval)
-                          : "r" (1)
-                          : "cc");
-    return readval;
-}
-#elif defined(__x86_64__)
-static inline int testandset (int *p)
-{
-    long int readval = 0;
-
-    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
-                          : "+m" (*p), "+a" (readval)
-                          : "r" (1)
-                          : "cc");
-    return readval;
-}
-#elif defined(__s390__)
-static inline int testandset (int *p)
-{
-    int ret;
-
-    __asm__ __volatile__ ("0: cs    %0,%1,0(%2)\n"
-			  "   jl    0b"
-			  : "=&d" (ret)
-			  : "r" (1), "a" (p), "0" (*p)
-			  : "cc", "memory" );
-    return ret;
-}
-#elif defined(__alpha__)
-static inline int testandset (int *p)
-{
-    int ret;
-    unsigned long one;
-
-    __asm__ __volatile__ ("0:	mov 1,%2\n"
-			  "	ldl_l %0,%1\n"
-			  "	stl_c %2,%1\n"
-			  "	beq %2,1f\n"
-			  ".subsection 2\n"
-			  "1:	br 0b\n"
-			  ".previous"
-			  : "=r" (ret), "=m" (*p), "=r" (one)
-			  : "m" (*p));
-    return ret;
-}
-#elif defined(__sparc__)
-static inline int testandset (int *p)
-{
-	int ret;
-
-	__asm__ __volatile__("ldstub	[%1], %0"
-			     : "=r" (ret)
-			     : "r" (p)
-			     : "memory");
-
-	return (ret ? 1 : 0);
-}
-#elif defined(__arm__)
-static inline int testandset (int *spinlock)
-{
-    register unsigned int ret;
-    __asm__ __volatile__("swp %0, %1, [%2]"
-                         : "=r"(ret)
-                         : "0"(1), "r"(spinlock));
-
-    return ret;
-}
-#elif defined(__mc68000)
-static inline int testandset (int *p)
-{
-    char ret;
-    __asm__ __volatile__("tas %1; sne %0"
-                         : "=r" (ret)
-                         : "m" (p)
-                         : "cc","memory");
-    return ret;
-}
-#elif defined(__hppa__)
-
-/* Because malloc only guarantees 8-byte alignment for malloc'd data,
-   and GCC only guarantees 8-byte alignment for stack locals, we can't
-   be assured of 16-byte alignment for atomic lock data even if we
-   specify "__attribute ((aligned(16)))" in the type declaration.  So,
-   we use a struct containing an array of four ints for the atomic lock
-   type and dynamically select the 16-byte aligned int from the array
-   for the semaphore.  */
-#define __PA_LDCW_ALIGNMENT 16
-static inline void *ldcw_align (void *p) {
-    unsigned long a = (unsigned long)p;
-    a = (a + __PA_LDCW_ALIGNMENT - 1) & ~(__PA_LDCW_ALIGNMENT - 1);
-    return (void *)a;
-}
-
-static inline int testandset (spinlock_t *p)
-{
-    unsigned int ret;
-    p = ldcw_align(p);
-    __asm__ __volatile__("ldcw 0(%1),%0"
-                         : "=r" (ret)
-                         : "r" (p)
-                         : "memory" );
-    return !ret;
-}
-
-#elif defined(__ia64)
-
-#include <ia64intrin.h>
-
-static inline int testandset (int *p)
-{
-    return __sync_lock_test_and_set (p, 1);
-}
-#elif defined(__mips__)
-static inline int testandset (int *p)
-{
-    int ret;
-
-    __asm__ __volatile__ (
-	"	.set push		\n"
-	"	.set noat		\n"
-	"	.set mips2		\n"
-	"1:	li	$1, 1		\n"
-	"	ll	%0, %1		\n"
-	"	sc	$1, %1		\n"
-	"	beqz	$1, 1b		\n"
-	"	.set pop		"
-	: "=r" (ret), "+R" (*p)
-	:
-	: "memory");
-
-    return ret;
-}
-#else
-#error unimplemented CPU support
-#endif
-
-#if defined(CONFIG_USER_ONLY)
-static inline void spin_lock(spinlock_t *lock)
-{
-    while (testandset(lock));
-}
-
-static inline void spin_unlock(spinlock_t *lock)
-{
-    resetlock(lock);
-}
-
-static inline int spin_trylock(spinlock_t *lock)
-{
-    return !testandset(lock);
-}
-#else
-static inline void spin_lock(spinlock_t *lock)
-{
-}
-
-static inline void spin_unlock(spinlock_t *lock)
-{
-}
-
-static inline int spin_trylock(spinlock_t *lock)
-{
-    return 1;
-}
-#endif
+#include "qemu-lock.h"

 extern spinlock_t tb_lock;

--- a/exec.c
+++ b/exec.c
@ -1341,10 +1341,20 @@ void cpu_set_log_filename(const char *filename)
 /* mask must never be zero, except for A20 change call */
 void cpu_interrupt(CPUState *env, int mask)
 {
+#if !defined(USE_NPTL)
    TranslationBlock *tb;
    static spinlock_t interrupt_lock = SPIN_LOCK_UNLOCKED;
+#endif

+    /* FIXME: This is probably not threadsafe.  A different thread could
+       be in the mittle of a read-modify-write operation.  */
    env->interrupt_request |= mask;
+#if defined(USE_NPTL)
+    /* FIXME: TB unchaining isn't SMP safe.  For now just ignore the
+       problem and hope the cpu will stop of its own accord.  For userspace
+       emulation this often isn't actually as bad as it sounds.  Often
+       signals are used primarily to interrupt blocking syscalls.  */
+#else
    /* if the cpu is currently executing code, we must unlink it and
       all the potentially executing TB */
    tb = env->current_tb;
@ -1353,6 +1363,7 @@ void cpu_interrupt(CPUState *env, int mask)
        tb_reset_jump_recursive(tb);
        resetlock(&interrupt_lock);
    }
+#endif
 }

 void cpu_reset_interrupt(CPUState *env, int mask)
@ -2015,7 +2026,6 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
    end = TARGET_PAGE_ALIGN(end);
    if (flags & PAGE_WRITE)
        flags |= PAGE_WRITE_ORG;
-    spin_lock(&tb_lock);
    for(addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
        p = page_find_alloc(addr >> TARGET_PAGE_BITS);
        /* if the write protection is set, then we invalidate the code
@ -2027,7 +2037,6 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
        }
        p->flags = flags;
    }
-    spin_unlock(&tb_lock);
 }

 int page_check_range(target_ulong start, target_ulong len, int flags)
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@ -89,7 +89,7 @@ enum {
 static const char *get_elf_platform(void)
 {
    static char elf_platform[] = "i386";
-    int family = (global_env->cpuid_version >> 8) & 0xff;
+    int family = (thread_env->cpuid_version >> 8) & 0xff;
    if (family > 6)
        family = 6;
    if (family >= 3)
@ -101,7 +101,7 @@ static const char *get_elf_platform(void)

 static uint32_t get_elf_hwcap(void)
 {
-  return global_env->cpuid_features;
+  return thread_env->cpuid_features;
 }

 #ifdef TARGET_X86_64
--- a/linux-user/main.c
+++ b/linux-user/main.c
@ -26,6 +26,8 @@

 #include "qemu.h"
 #include "qemu-common.h"
+/* For tb_lock */
+#include "exec-all.h"

 #define DEBUG_LOGFILE "/tmp/qemu.log"

@ -123,6 +125,135 @@ int64_t cpu_get_real_ticks(void)

 #endif

+#if defined(USE_NPTL)
+/***********************************************************/
+/* Helper routines for implementing atomic operations.  */
+
+/* To implement exclusive operations we force all cpus to syncronise.
+   We don't require a full sync, only that no cpus are executing guest code.
+   The alternative is to map target atomic ops onto host equivalents,
+   which requires quite a lot of per host/target work.  */
+static pthread_mutex_t exclusive_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t exclusive_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t exclusive_resume = PTHREAD_COND_INITIALIZER;
+static int pending_cpus;
+
+/* Make sure everything is in a consistent state for calling fork().  */
+void fork_start(void)
+{
+    mmap_fork_start();
+    pthread_mutex_lock(&tb_lock);
+    pthread_mutex_lock(&exclusive_lock);
+}
+
+void fork_end(int child)
+{
+    if (child) {
+        /* Child processes created by fork() only have a single thread.
+           Discard information about the parent threads.  */
+        first_cpu = thread_env;
+        thread_env->next_cpu = NULL;
+        pending_cpus = 0;
+        pthread_mutex_init(&exclusive_lock, NULL);
+        pthread_cond_init(&exclusive_cond, NULL);
+        pthread_cond_init(&exclusive_resume, NULL);
+        pthread_mutex_init(&tb_lock, NULL);
+    } else {
+        pthread_mutex_unlock(&exclusive_lock);
+        pthread_mutex_unlock(&tb_lock);
+    }
+    mmap_fork_end(child);
+}
+
+/* Wait for pending exclusive operations to complete.  The exclusive lock
+   must be held.  */
+static inline void exclusive_idle(void)
+{
+    while (pending_cpus) {
+        pthread_cond_wait(&exclusive_resume, &exclusive_lock);
+    }
+}
+
+/* Start an exclusive operation.
+   Must only be called from outside cpu_arm_exec.   */
+static inline void start_exclusive(void)
+{
+    CPUState *other;
+    pthread_mutex_lock(&exclusive_lock);
+    exclusive_idle();
+
+    pending_cpus = 1;
+    /* Make all other cpus stop executing.  */
+    for (other = first_cpu; other; other = other->next_cpu) {
+        if (other->running) {
+            pending_cpus++;
+            cpu_interrupt(other, CPU_INTERRUPT_EXIT);
+        }
+    }
+    if (pending_cpus > 1) {
+        pthread_cond_wait(&exclusive_cond, &exclusive_lock);
+    }
+}
+
+/* Finish an exclusive operation.  */
+static inline void end_exclusive(void)
+{
+    pending_cpus = 0;
+    pthread_cond_broadcast(&exclusive_resume);
+    pthread_mutex_unlock(&exclusive_lock);
+}
+
+/* Wait for exclusive ops to finish, and begin cpu execution.  */
+static inline void cpu_exec_start(CPUState *env)
+{
+    pthread_mutex_lock(&exclusive_lock);
+    exclusive_idle();
+    env->running = 1;
+    pthread_mutex_unlock(&exclusive_lock);
+}
+
+/* Mark cpu as not executing, and release pending exclusive ops.  */
+static inline void cpu_exec_end(CPUState *env)
+{
+    pthread_mutex_lock(&exclusive_lock);
+    env->running = 0;
+    if (pending_cpus > 1) {
+        pending_cpus--;
+        if (pending_cpus == 1) {
+            pthread_cond_signal(&exclusive_cond);
+        }
+    }
+    exclusive_idle();
+    pthread_mutex_unlock(&exclusive_lock);
+}
+#else /* if !USE_NPTL */
+/* These are no-ops because we are not threadsafe.  */
+static inline void cpu_exec_start(CPUState *env)
+{
+}
+
+static inline void cpu_exec_end(CPUState *env)
+{
+}
+
+static inline void start_exclusive(void)
+{
+}
+
+static inline void end_exclusive(void)
+{
+}
+
+void fork_start(void)
+{
+}
+
+void fork_end(int child)
+{
+}
+#endif
+
+
 #ifdef TARGET_I386
 /***********************************************************/
 /* CPUX86 core interface */
@ -378,8 +509,11 @@ do_kernel_trap(CPUARMState *env)
        /* ??? No-op. Will need to do better for SMP.  */
        break;
    case 0xffff0fc0: /* __kernel_cmpxchg */
-        /* ??? This is not really atomic.  However we don't support
-           threads anyway, so it doesn't realy matter.  */
+         /* XXX: This only works between threads, not between processes.
+            It's probably possible to implement this with native host
+            operations. However things like ldrex/strex are much harder so
+            there's not much point trying.  */
+        start_exclusive();
        cpsr = cpsr_read(env);
        addr = env->regs[2];
        /* FIXME: This should SEGV if the access fails.  */
@ -396,6 +530,7 @@ do_kernel_trap(CPUARMState *env)
            cpsr &= ~CPSR_C;
        }
        cpsr_write(env, cpsr, CPSR_C);
+        end_exclusive();
        break;
    case 0xffff0fe0: /* __kernel_get_tls */
        env->regs[0] = env->cp15.c13_tls2;
@ -422,7 +557,9 @@ void cpu_loop(CPUARMState *env)
    uint32_t addr;

    for(;;) {
+        cpu_exec_start(env);
        trapnr = cpu_arm_exec(env);
+        cpu_exec_end(env);
        switch(trapnr) {
        case EXCP_UDEF:
            {
@ -2044,8 +2181,7 @@ void usage(void)
    _exit(1);
 }

-/* XXX: currently only used for async signals (see signal.c) */
-CPUState *global_env;
+THREAD CPUState *thread_env;

 void init_task_state(TaskState *ts)
 {
@ -2203,7 +2339,7 @@ int main(int argc, char **argv)
        fprintf(stderr, "Unable to find CPU definition\n");
        exit(1);
    }
-    global_env = env;
+    thread_env = env;

    if (getenv("QEMU_STRACE")) {
        do_strace = 1;
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@ -46,6 +46,22 @@ void mmap_unlock(void)
        pthread_mutex_unlock(&mmap_mutex);
    }
 }
+
+/* Grab lock to make sure things are in a consistent state after fork().  */
+void mmap_fork_start(void)
+{
+    if (mmap_lock_count)
+        abort();
+    pthread_mutex_lock(&mmap_mutex);
+}
+
+void mmap_fork_end(int child)
+{
+    if (child)
+        pthread_mutex_init(&mmap_mutex, NULL);
+    else
+        pthread_mutex_unlock(&mmap_mutex);
+}
 #else
 /* We aren't threadsafe to start with, so no need to worry about locking.  */
 void mmap_lock(void)
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@ -37,6 +37,12 @@ typedef target_long abi_long;
 #include "target_signal.h"
 #include "gdbstub.h"

+#if defined(USE_NPTL)
+#define THREAD __thread
+#else
+#define THREAD
+#endif
+
 /* This struct is used to hold certain information about the image.
 * Basically, it replicates in user space what would be certain
 * task_struct fields in the kernel
@ -184,12 +190,14 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
                    abi_long arg2, abi_long arg3, abi_long arg4,
                    abi_long arg5, abi_long arg6);
 void gemu_log(const char *fmt, ...) __attribute__((format(printf,1,2)));
-extern CPUState *global_env;
+extern THREAD CPUState *thread_env;
 void cpu_loop(CPUState *env);
 void init_paths(const char *prefix);
 const char *path(const char *pathname);
 char *target_strerror(int err);
 int get_osversion(void);
+void fork_start(void);
+void fork_end(int child);

 extern int loglevel;
 extern FILE *logfile;
@ -235,6 +243,10 @@ int target_msync(abi_ulong start, abi_ulong len, int flags);
 extern unsigned long last_brk;
 void mmap_lock(void);
 void mmap_unlock(void);
+#if defined(USE_NPTL)
+void mmap_fork_start(void);
+void mmap_fork_end(int child);
+#endif

 /* user access */

--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@ -424,9 +424,9 @@ static void host_signal_handler(int host_signum, siginfo_t *info,
    fprintf(stderr, "qemu: got signal %d\n", sig);
 #endif
    host_to_target_siginfo_noswap(&tinfo, info);
-    if (queue_signal(global_env, sig, &tinfo) == 1) {
+    if (queue_signal(thread_env, sig, &tinfo) == 1) {
        /* interrupt the virtual CPU as soon as possible */
-        cpu_interrupt(global_env, CPU_INTERRUPT_EXIT);
+        cpu_interrupt(thread_env, CPU_INTERRUPT_EXIT);
    }
 }

--- a/qemu-lock.h
+++ b/qemu-lock.h
@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Locking primitives.  Most of this code should be redundant -
+   system emulation doesn't need/use locking, NPTL userspace uses
+   pthread mutexes, and non-NPTL userspace isn't threadsafe anyway.
+   In either case a spinlock is probably the wrong kind of lock.
+   Spinlocks are only good if you know annother CPU has the lock and is
+   likely to release it soon.  In environments where you have more threads
+   than physical CPUs (the extreme case being a single CPU host) a spinlock
+   simply wastes CPU until the OS decides to preempt it.  */
+#if defined(USE_NPTL)
+
+#include <pthread.h>
+#define spin_lock pthread_mutex_lock
+#define spin_unlock pthread_mutex_unlock
+#define spinlock_t pthread_mutex_t
+#define SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER
+
+#else
+
+#if defined(__hppa__)
+
+typedef int spinlock_t[4];
+
+#define SPIN_LOCK_UNLOCKED { 1, 1, 1, 1 }
+
+static inline void resetlock (spinlock_t *p)
+{
+    (*p)[0] = (*p)[1] = (*p)[2] = (*p)[3] = 1;
+}
+
+#else
+
+typedef int spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED 0
+
+static inline void resetlock (spinlock_t *p)
+{
+    *p = SPIN_LOCK_UNLOCKED;
+}
+
+#endif
+
+#if defined(__powerpc__)
+static inline int testandset (int *p)
+{
+    int ret;
+    __asm__ __volatile__ (
+                          "0:    lwarx %0,0,%1\n"
+                          "      xor. %0,%3,%0\n"
+                          "      bne 1f\n"
+                          "      stwcx. %2,0,%1\n"
+                          "      bne- 0b\n"
+                          "1:    "
+                          : "=&r" (ret)
+                          : "r" (p), "r" (1), "r" (0)
+                          : "cr0", "memory");
+    return ret;
+}
+#elif defined(__i386__)
+static inline int testandset (int *p)
+{
+    long int readval = 0;
+
+    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
+                          : "+m" (*p), "+a" (readval)
+                          : "r" (1)
+                          : "cc");
+    return readval;
+}
+#elif defined(__x86_64__)
+static inline int testandset (int *p)
+{
+    long int readval = 0;
+
+    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
+                          : "+m" (*p), "+a" (readval)
+                          : "r" (1)
+                          : "cc");
+    return readval;
+}
+#elif defined(__s390__)
+static inline int testandset (int *p)
+{
+    int ret;
+
+    __asm__ __volatile__ ("0: cs    %0,%1,0(%2)\n"
+			  "   jl    0b"
+			  : "=&d" (ret)
+			  : "r" (1), "a" (p), "0" (*p)
+			  : "cc", "memory" );
+    return ret;
+}
+#elif defined(__alpha__)
+static inline int testandset (int *p)
+{
+    int ret;
+    unsigned long one;
+
+    __asm__ __volatile__ ("0:	mov 1,%2\n"
+			  "	ldl_l %0,%1\n"
+			  "	stl_c %2,%1\n"
+			  "	beq %2,1f\n"
+			  ".subsection 2\n"
+			  "1:	br 0b\n"
+			  ".previous"
+			  : "=r" (ret), "=m" (*p), "=r" (one)
+			  : "m" (*p));
+    return ret;
+}
+#elif defined(__sparc__)
+static inline int testandset (int *p)
+{
+	int ret;
+
+	__asm__ __volatile__("ldstub	[%1], %0"
+			     : "=r" (ret)
+			     : "r" (p)
+			     : "memory");
+
+	return (ret ? 1 : 0);
+}
+#elif defined(__arm__)
+static inline int testandset (int *spinlock)
+{
+    register unsigned int ret;
+    __asm__ __volatile__("swp %0, %1, [%2]"
+                         : "=r"(ret)
+                         : "0"(1), "r"(spinlock));
+
+    return ret;
+}
+#elif defined(__mc68000)
+static inline int testandset (int *p)
+{
+    char ret;
+    __asm__ __volatile__("tas %1; sne %0"
+                         : "=r" (ret)
+                         : "m" (p)
+                         : "cc","memory");
+    return ret;
+}
+#elif defined(__hppa__)
+
+/* Because malloc only guarantees 8-byte alignment for malloc'd data,
+   and GCC only guarantees 8-byte alignment for stack locals, we can't
+   be assured of 16-byte alignment for atomic lock data even if we
+   specify "__attribute ((aligned(16)))" in the type declaration.  So,
+   we use a struct containing an array of four ints for the atomic lock
+   type and dynamically select the 16-byte aligned int from the array
+   for the semaphore.  */
+#define __PA_LDCW_ALIGNMENT 16
+static inline void *ldcw_align (void *p) {
+    unsigned long a = (unsigned long)p;
+    a = (a + __PA_LDCW_ALIGNMENT - 1) & ~(__PA_LDCW_ALIGNMENT - 1);
+    return (void *)a;
+}
+
+static inline int testandset (spinlock_t *p)
+{
+    unsigned int ret;
+    p = ldcw_align(p);
+    __asm__ __volatile__("ldcw 0(%1),%0"
+                         : "=r" (ret)
+                         : "r" (p)
+                         : "memory" );
+    return !ret;
+}
+
+#elif defined(__ia64)
+
+#include <ia64intrin.h>
+
+static inline int testandset (int *p)
+{
+    return __sync_lock_test_and_set (p, 1);
+}
+#elif defined(__mips__)
+static inline int testandset (int *p)
+{
+    int ret;
+
+    __asm__ __volatile__ (
+	"	.set push		\n"
+	"	.set noat		\n"
+	"	.set mips2		\n"
+	"1:	li	$1, 1		\n"
+	"	ll	%0, %1		\n"
+	"	sc	$1, %1		\n"
+	"	beqz	$1, 1b		\n"
+	"	.set pop		"
+	: "=r" (ret), "+R" (*p)
+	:
+	: "memory");
+
+    return ret;
+}
+#else
+#error unimplemented CPU support
+#endif
+
+#if defined(CONFIG_USER_ONLY)
+static inline void spin_lock(spinlock_t *lock)
+{
+    while (testandset(lock));
+}
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+    resetlock(lock);
+}
+
+static inline int spin_trylock(spinlock_t *lock)
+{
+    return !testandset(lock);
+}
+#else
+static inline void spin_lock(spinlock_t *lock)
+{
+}
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+}
+
+static inline int spin_trylock(spinlock_t *lock)
+{
+    return 1;
+}
+#endif
+
+#endif