Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging

* thread-safe tb_flush (Fred, Alex, Sergey, me, Richard, Emilio,... :-)
* license clarification for compiler.h (Felipe)
* glib cflags improvement (Marc-André)
* checkpatch silencing (Paolo)
* SMRAM migration fix (Paolo)
* Replay improvements (Pavel)
* IOMMU notifier improvements (Peter)
* IOAPIC now defaults to version 0x20 (Peter)

# gpg: Signature made Tue 27 Sep 2016 10:57:40 BST
# gpg:                using RSA key 0xBFFBD25F78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>"
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4  E2F7 7E15 100C CD36 69B1
#      Subkey fingerprint: F133 3857 4B66 2389 866C  7682 BFFB D25F 78C7 AE83

* remotes/bonzini/tags/for-upstream: (28 commits)
  replay: allow replay stopping and restarting
  replay: vmstate for replay module
  replay: move internal data to the structure
  cpus-common: lock-free fast path for cpu_exec_start/end
  tcg: Make tb_flush() thread safe
  cpus-common: Introduce async_safe_run_on_cpu()
  cpus-common: simplify locking for start_exclusive/end_exclusive
  cpus-common: remove redundant call to exclusive_idle()
  cpus-common: always defer async_run_on_cpu work items
  docs: include formal model for TCG exclusive sections
  cpus-common: move exclusive work infrastructure from linux-user
  cpus-common: fix uninitialized variable use in run_on_cpu
  cpus-common: move CPU work item management to common code
  cpus-common: move CPU list management to common code
  linux-user: Add qemu_cpu_is_self() and qemu_cpu_kick()
  linux-user: Use QemuMutex and QemuCond
  cpus: Rename flush_queued_work()
  cpus: Move common code out of {async_, }run_on_cpu()
  cpus: pass CPUState to run_on_cpu helpers
  build-sys: put glib_cflags in QEMU_CFLAGS
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit c640f2849e (Peter Maydell, 2016-09-28 23:02:56 +01:00)
49 files changed, 1157 insertions(+), 521 deletions(-)

diff --git (file name not preserved)
@@ -89,7 +89,7 @@ endif
 #######################################################################
 # Target-independent parts used in system and user emulation
 
-common-obj-y += tcg-runtime.o
+common-obj-y += tcg-runtime.o cpus-common.o
 common-obj-y += hw/
 common-obj-y += qom/
 common-obj-y += disas/

diff --git (file name not preserved)
@@ -20,11 +20,6 @@ typedef struct Request {
     QEMUBH *bh;
 } Request;
 
-/* Next request id.
-   This counter is global, because requests from different
-   block devices should not get overlapping ids. */
-static uint64_t request_id;
-
 static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
 {
@@ -84,7 +79,7 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
     uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
     int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
@@ -95,7 +90,7 @@ static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
     uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
     int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
@@ -106,7 +101,7 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int count, BdrvRequestFlags flags)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
     int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
@@ -117,7 +112,7 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
 static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
     int64_t offset, int count)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
     int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
@@ -127,7 +122,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
 
 static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs)
 {
-    uint64_t reqid = request_id++;
+    uint64_t reqid = blkreplay_next_id();
     int ret = bdrv_co_flush(bs->file->bs);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
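The global counter above is replaced by blkreplay_next_id(), which is declared
in the replay header further down in this diff but whose implementation is not
part of this capture.  A minimal sketch of such an allocator, purely
illustrative (the body and the counter name are assumptions; only the function
name and replay_events_enabled() come from the diff):

    /* Illustrative sketch only -- not the actual replay-module code. */
    static uint64_t blkreplay_request_id;

    uint64_t blkreplay_next_id(void)
    {
        /* Hand out IDs only while event recording/replaying is active, so
         * the sequence stays identical between record and replay runs. */
        if (replay_events_enabled()) {
            return blkreplay_request_id++;
        }
        return 0;
    }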

diff --git (file name not preserved)
@@ -67,23 +67,6 @@ int cpu_get_pic_interrupt(CPUX86State *env)
 }
 #endif
 
-/* These are no-ops because we are not threadsafe.  */
-static inline void cpu_exec_start(CPUArchState *env)
-{
-}
-
-static inline void cpu_exec_end(CPUArchState *env)
-{
-}
-
-static inline void start_exclusive(void)
-{
-}
-
-static inline void end_exclusive(void)
-{
-}
-
 void fork_start(void)
 {
 }
@@ -95,14 +78,6 @@ void fork_end(int child)
     }
 }
 
-void cpu_list_lock(void)
-{
-}
-
-void cpu_list_unlock(void)
-{
-}
-
 #ifdef TARGET_I386
 /***********************************************************/
 /* CPUX86 core interface */
@@ -172,7 +147,11 @@ void cpu_loop(CPUX86State *env)
     //target_siginfo_t info;
 
     for(;;) {
+        cpu_exec_start(cs);
         trapnr = cpu_exec(cs);
+        cpu_exec_end(cs);
+        process_queued_cpu_work(cs);
+
         switch(trapnr) {
         case 0x80:
             /* syscall from int $0x80 */
@@ -513,7 +492,10 @@ void cpu_loop(CPUSPARCState *env)
     //target_siginfo_t info;
 
     while (1) {
+        cpu_exec_start(cs);
         trapnr = cpu_exec(cs);
+        cpu_exec_end(cs);
+        process_queued_cpu_work(cs);
 
         switch (trapnr) {
 #ifndef TARGET_SPARC64
@@ -748,6 +730,7 @@ int main(int argc, char **argv)
     if (argc <= 1)
         usage();
 
+    qemu_init_cpu_list();
     module_call_init(MODULE_INIT_QOM);
 
     if ((envlist = envlist_create()) == NULL) {

diff --git a/configure b/configure
@@ -2988,7 +2988,7 @@ for i in $glib_modules; do
     if $pkg_config --atleast-version=$glib_req_ver $i; then
         glib_cflags=$($pkg_config --cflags $i)
         glib_libs=$($pkg_config --libs $i)
-        CFLAGS="$glib_cflags $CFLAGS"
+        QEMU_CFLAGS="$glib_cflags $QEMU_CFLAGS"
         LIBS="$glib_libs $LIBS"
         libs_qga="$glib_libs $libs_qga"
     else
@@ -5195,7 +5195,6 @@ fi
 if test "$glib_subprocess" = "yes" ; then
     echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak
 fi
-echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak
 if test "$gtk" = "yes" ; then
     echo "CONFIG_GTK=y" >> $config_host_mak
     echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak

diff --git (file name not preserved)
@@ -204,20 +204,16 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
                              TranslationBlock *orig_tb, bool ignore_icount)
 {
     TranslationBlock *tb;
-    bool old_tb_flushed;
 
     /* Should never happen.
        We only end up here when an existing TB is too long.  */
     if (max_cycles > CF_COUNT_MASK)
         max_cycles = CF_COUNT_MASK;
 
-    old_tb_flushed = cpu->tb_flushed;
-    cpu->tb_flushed = false;
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                      max_cycles | CF_NOCACHE
                          | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-    tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
-    cpu->tb_flushed |= old_tb_flushed;
+    tb->orig_tb = orig_tb;
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
@@ -338,10 +334,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
             tb_lock();
             have_tb_lock = true;
         }
-        /* Check if translation buffer has been flushed */
-        if (cpu->tb_flushed) {
-            cpu->tb_flushed = false;
-        } else if (!tb->invalid) {
+        if (!tb->invalid) {
             tb_add_jump(last_tb, tb_exit, tb);
         }
     }
@@ -606,7 +599,6 @@ int cpu_exec(CPUState *cpu)
             break;
         }
 
-        atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */
        for(;;) {
            cpu_handle_interrupt(cpu, &last_tb);
            tb = tb_find(cpu, last_tb, tb_exit);

diff --git a/cpus-common.c b/cpus-common.c
new file mode 100644
@@ -0,0 +1,352 @@
/*
* CPU thread main loop - common bits for user and system mode emulation
*
* Copyright (c) 2003-2005 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "exec/cpu-common.h"
#include "qom/cpu.h"
#include "sysemu/cpus.h"
static QemuMutex qemu_cpu_list_lock;
static QemuCond exclusive_cond;
static QemuCond exclusive_resume;
static QemuCond qemu_work_cond;
/* >= 1 if a thread is inside start_exclusive/end_exclusive. Written
* under qemu_cpu_list_lock, read with atomic operations.
*/
static int pending_cpus;
void qemu_init_cpu_list(void)
{
/* This is needed because qemu_init_cpu_list is also called by the
* child process in a fork. */
pending_cpus = 0;
qemu_mutex_init(&qemu_cpu_list_lock);
qemu_cond_init(&exclusive_cond);
qemu_cond_init(&exclusive_resume);
qemu_cond_init(&qemu_work_cond);
}
void cpu_list_lock(void)
{
qemu_mutex_lock(&qemu_cpu_list_lock);
}
void cpu_list_unlock(void)
{
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
static bool cpu_index_auto_assigned;
static int cpu_get_free_index(void)
{
CPUState *some_cpu;
int cpu_index = 0;
cpu_index_auto_assigned = true;
CPU_FOREACH(some_cpu) {
cpu_index++;
}
return cpu_index;
}
static void finish_safe_work(CPUState *cpu)
{
cpu_exec_start(cpu);
cpu_exec_end(cpu);
}
void cpu_list_add(CPUState *cpu)
{
qemu_mutex_lock(&qemu_cpu_list_lock);
if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) {
cpu->cpu_index = cpu_get_free_index();
assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
} else {
assert(!cpu_index_auto_assigned);
}
QTAILQ_INSERT_TAIL(&cpus, cpu, node);
qemu_mutex_unlock(&qemu_cpu_list_lock);
finish_safe_work(cpu);
}
void cpu_list_remove(CPUState *cpu)
{
qemu_mutex_lock(&qemu_cpu_list_lock);
if (!QTAILQ_IN_USE(cpu, node)) {
/* there is nothing to undo since cpu_exec_init() hasn't been called */
qemu_mutex_unlock(&qemu_cpu_list_lock);
return;
}
assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ)));
QTAILQ_REMOVE(&cpus, cpu, node);
cpu->cpu_index = UNASSIGNED_CPU_INDEX;
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
struct qemu_work_item {
struct qemu_work_item *next;
run_on_cpu_func func;
void *data;
bool free, exclusive, done;
};
static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
{
qemu_mutex_lock(&cpu->work_mutex);
if (cpu->queued_work_first == NULL) {
cpu->queued_work_first = wi;
} else {
cpu->queued_work_last->next = wi;
}
cpu->queued_work_last = wi;
wi->next = NULL;
wi->done = false;
qemu_mutex_unlock(&cpu->work_mutex);
qemu_cpu_kick(cpu);
}
void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
QemuMutex *mutex)
{
struct qemu_work_item wi;
if (qemu_cpu_is_self(cpu)) {
func(cpu, data);
return;
}
wi.func = func;
wi.data = data;
wi.done = false;
wi.free = false;
wi.exclusive = false;
queue_work_on_cpu(cpu, &wi);
while (!atomic_mb_read(&wi.done)) {
CPUState *self_cpu = current_cpu;
qemu_cond_wait(&qemu_work_cond, mutex);
current_cpu = self_cpu;
}
}
void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
{
struct qemu_work_item *wi;
wi = g_malloc0(sizeof(struct qemu_work_item));
wi->func = func;
wi->data = data;
wi->free = true;
queue_work_on_cpu(cpu, wi);
}
/* Wait for pending exclusive operations to complete. The CPU list lock
must be held. */
static inline void exclusive_idle(void)
{
while (pending_cpus) {
qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock);
}
}
/* Start an exclusive operation.
Must only be called from outside cpu_exec. */
void start_exclusive(void)
{
CPUState *other_cpu;
int running_cpus;
qemu_mutex_lock(&qemu_cpu_list_lock);
exclusive_idle();
/* Make all other cpus stop executing. */
atomic_set(&pending_cpus, 1);
/* Write pending_cpus before reading other_cpu->running. */
smp_mb();
running_cpus = 0;
CPU_FOREACH(other_cpu) {
if (atomic_read(&other_cpu->running)) {
other_cpu->has_waiter = true;
running_cpus++;
qemu_cpu_kick(other_cpu);
}
}
atomic_set(&pending_cpus, running_cpus + 1);
while (pending_cpus > 1) {
qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock);
}
/* Can release mutex, no one will enter another exclusive
* section until end_exclusive resets pending_cpus to 0.
*/
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
/* Finish an exclusive operation. */
void end_exclusive(void)
{
qemu_mutex_lock(&qemu_cpu_list_lock);
atomic_set(&pending_cpus, 0);
qemu_cond_broadcast(&exclusive_resume);
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
/* Wait for exclusive ops to finish, and begin cpu execution. */
void cpu_exec_start(CPUState *cpu)
{
atomic_set(&cpu->running, true);
/* Write cpu->running before reading pending_cpus. */
smp_mb();
/* 1. start_exclusive saw cpu->running == true and pending_cpus >= 1.
* After taking the lock we'll see cpu->has_waiter == true and run---not
* for long because start_exclusive kicked us. cpu_exec_end will
* decrement pending_cpus and signal the waiter.
*
* 2. start_exclusive saw cpu->running == false but pending_cpus >= 1.
* This includes the case when an exclusive item is running now.
* Then we'll see cpu->has_waiter == false and wait for the item to
* complete.
*
* 3. pending_cpus == 0. Then start_exclusive is definitely going to
* see cpu->running == true, and it will kick the CPU.
*/
if (unlikely(atomic_read(&pending_cpus))) {
qemu_mutex_lock(&qemu_cpu_list_lock);
if (!cpu->has_waiter) {
/* Not counted in pending_cpus, let the exclusive item
* run. Since we have the lock, just set cpu->running to true
* while holding it; no need to check pending_cpus again.
*/
atomic_set(&cpu->running, false);
exclusive_idle();
/* Now pending_cpus is zero. */
atomic_set(&cpu->running, true);
} else {
/* Counted in pending_cpus, go ahead and release the
* waiter at cpu_exec_end.
*/
}
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
}
/* Mark cpu as not executing, and release pending exclusive ops. */
void cpu_exec_end(CPUState *cpu)
{
atomic_set(&cpu->running, false);
/* Write cpu->running before reading pending_cpus. */
smp_mb();
/* 1. start_exclusive saw cpu->running == true. Then it will increment
* pending_cpus and wait for exclusive_cond. After taking the lock
* we'll see cpu->has_waiter == true.
*
* 2. start_exclusive saw cpu->running == false but here pending_cpus >= 1.
* This includes the case when an exclusive item started after setting
* cpu->running to false and before we read pending_cpus. Then we'll see
* cpu->has_waiter == false and not touch pending_cpus. The next call to
* cpu_exec_start will run exclusive_idle if still necessary, thus waiting
* for the item to complete.
*
* 3. pending_cpus == 0. Then start_exclusive is definitely going to
* see cpu->running == false, and it can ignore this CPU until the
* next cpu_exec_start.
*/
if (unlikely(atomic_read(&pending_cpus))) {
qemu_mutex_lock(&qemu_cpu_list_lock);
if (cpu->has_waiter) {
cpu->has_waiter = false;
atomic_set(&pending_cpus, pending_cpus - 1);
if (pending_cpus == 1) {
qemu_cond_signal(&exclusive_cond);
}
}
qemu_mutex_unlock(&qemu_cpu_list_lock);
}
}
void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
{
struct qemu_work_item *wi;
wi = g_malloc0(sizeof(struct qemu_work_item));
wi->func = func;
wi->data = data;
wi->free = true;
wi->exclusive = true;
queue_work_on_cpu(cpu, wi);
}
void process_queued_cpu_work(CPUState *cpu)
{
struct qemu_work_item *wi;
if (cpu->queued_work_first == NULL) {
return;
}
qemu_mutex_lock(&cpu->work_mutex);
while (cpu->queued_work_first != NULL) {
wi = cpu->queued_work_first;
cpu->queued_work_first = wi->next;
if (!cpu->queued_work_first) {
cpu->queued_work_last = NULL;
}
qemu_mutex_unlock(&cpu->work_mutex);
if (wi->exclusive) {
/* Running work items outside the BQL avoids the following deadlock:
* 1) start_exclusive() is called with the BQL taken while another
* CPU is running; 2) cpu_exec in the other CPU tries to takes the
* BQL, so it goes to sleep; start_exclusive() is sleeping too, so
* neither CPU can proceed.
*/
qemu_mutex_unlock_iothread();
start_exclusive();
wi->func(cpu, wi->data);
end_exclusive();
qemu_mutex_lock_iothread();
} else {
wi->func(cpu, wi->data);
}
qemu_mutex_lock(&cpu->work_mutex);
if (wi->free) {
g_free(wi);
} else {
atomic_mb_set(&wi->done, true);
}
}
qemu_mutex_unlock(&cpu->work_mutex);
qemu_cond_broadcast(&qemu_work_cond);
}
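To show how the new primitives fit together, here is an illustrative caller
(not part of this commit; the helper names are made up).  A work item queued
with async_safe_run_on_cpu() runs between start_exclusive() and
end_exclusive(), i.e. while every other vCPU is parked outside its
cpu_exec_start()/cpu_exec_end() window, and process_queued_cpu_work() drops
the BQL around it:

    /* Illustrative sketch, assuming it is compiled inside QEMU. */
    static void do_reset_global_state(CPUState *cpu, void *data)
    {
        /* All other vCPUs are quiescent here, so global structures shared
         * by the TCG threads can be rewritten without extra locking. */
    }

    static void schedule_reset(CPUState *cpu)
    {
        async_safe_run_on_cpu(cpu, do_reset_global_state, NULL);
    }

For this to be safe, each vCPU loop has to bracket cpu_exec() with
cpu_exec_start()/cpu_exec_end() and call process_queued_cpu_work(), which is
exactly what the main-loop hunks elsewhere in this series add.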

diff --git a/cpus.c b/cpus.c
@@ -557,9 +557,8 @@ static const VMStateDescription vmstate_timers = {
     }
 };
 
-static void cpu_throttle_thread(void *opaque)
+static void cpu_throttle_thread(CPUState *cpu, void *opaque)
 {
-    CPUState *cpu = opaque;
     double pct;
     double throttle_ratio;
     long sleeptime_ns;
@@ -589,7 +588,7 @@ static void cpu_throttle_timer_tick(void *opaque)
     }
 
     CPU_FOREACH(cpu) {
         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
-            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
+            async_run_on_cpu(cpu, cpu_throttle_thread, NULL);
         }
     }
@@ -751,6 +750,7 @@ static int do_vm_stop(RunState state)
     }
 
     bdrv_drain_all();
+    replay_disable_events();
     ret = blk_flush_all();
 
     return ret;
@@ -903,79 +903,21 @@ static QemuThread io_thread;
 static QemuCond qemu_cpu_cond;
 /* system init */
 static QemuCond qemu_pause_cond;
-static QemuCond qemu_work_cond;
 
 void qemu_init_cpu_loop(void)
 {
     qemu_init_sigbus();
     qemu_cond_init(&qemu_cpu_cond);
     qemu_cond_init(&qemu_pause_cond);
-    qemu_cond_init(&qemu_work_cond);
     qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
 
     qemu_thread_get_self(&io_thread);
 }
 
-void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
-    struct qemu_work_item wi;
-
-    if (qemu_cpu_is_self(cpu)) {
-        func(data);
-        return;
-    }
-
-    wi.func = func;
-    wi.data = data;
-    wi.free = false;
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    if (cpu->queued_work_first == NULL) {
-        cpu->queued_work_first = &wi;
-    } else {
-        cpu->queued_work_last->next = &wi;
-    }
-    cpu->queued_work_last = &wi;
-    wi.next = NULL;
-    wi.done = false;
-    qemu_mutex_unlock(&cpu->work_mutex);
-
-    qemu_cpu_kick(cpu);
-    while (!atomic_mb_read(&wi.done)) {
-        CPUState *self_cpu = current_cpu;
-
-        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
-        current_cpu = self_cpu;
-    }
-}
-
-void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
-{
-    struct qemu_work_item *wi;
-
-    if (qemu_cpu_is_self(cpu)) {
-        func(data);
-        return;
-    }
-
-    wi = g_malloc0(sizeof(struct qemu_work_item));
-    wi->func = func;
-    wi->data = data;
-    wi->free = true;
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    if (cpu->queued_work_first == NULL) {
-        cpu->queued_work_first = wi;
-    } else {
-        cpu->queued_work_last->next = wi;
-    }
-    cpu->queued_work_last = wi;
-    wi->next = NULL;
-    wi->done = false;
-    qemu_mutex_unlock(&cpu->work_mutex);
-
-    qemu_cpu_kick(cpu);
+    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
 }
 
 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
@@ -990,34 +932,6 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
 }
 
-static void flush_queued_work(CPUState *cpu)
-{
-    struct qemu_work_item *wi;
-
-    if (cpu->queued_work_first == NULL) {
-        return;
-    }
-
-    qemu_mutex_lock(&cpu->work_mutex);
-    while (cpu->queued_work_first != NULL) {
-        wi = cpu->queued_work_first;
-        cpu->queued_work_first = wi->next;
-        if (!cpu->queued_work_first) {
-            cpu->queued_work_last = NULL;
-        }
-        qemu_mutex_unlock(&cpu->work_mutex);
-        wi->func(wi->data);
-        qemu_mutex_lock(&cpu->work_mutex);
-        if (wi->free) {
-            g_free(wi);
-        } else {
-            atomic_mb_set(&wi->done, true);
-        }
-    }
-    qemu_mutex_unlock(&cpu->work_mutex);
-    qemu_cond_broadcast(&qemu_work_cond);
-}
-
 static void qemu_wait_io_event_common(CPUState *cpu)
 {
     if (cpu->stop) {
@@ -1025,7 +939,7 @@ static void qemu_wait_io_event_common(CPUState *cpu)
         cpu->stopped = true;
         qemu_cond_broadcast(&qemu_pause_cond);
     }
-    flush_queued_work(cpu);
+    process_queued_cpu_work(cpu);
     cpu->thread_kicked = false;
 }
 
@@ -1544,7 +1458,9 @@ static int tcg_cpu_exec(CPUState *cpu)
         cpu->icount_decr.u16.low = decr;
         cpu->icount_extra = count;
     }
+    cpu_exec_start(cpu);
     ret = cpu_exec(cpu);
+    cpu_exec_end(cpu);
 #ifdef CONFIG_PROFILER
     tcg_time += profile_getclock() - ti;
 #endif

diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela
new file mode 100644
@@ -0,0 +1,225 @@
/*
* This model describes the implementation of exclusive sections in
* cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start,
* cpu_exec_end).
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
*
* This file is in the public domain. If you really want a license,
* the WTFPL will do.
*
* To verify it:
* spin -a docs/tcg-exclusive.promela
* gcc pan.c -O2
* ./a.out -a
*
* Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX,
* TEST_EXPENSIVE.
*/
// Define the missing parameters for the model
#ifndef N_CPUS
#define N_CPUS 2
#warning defaulting to 2 CPU processes
#endif
// the expensive test is not so expensive for <= 2 CPUs
// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs
// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM
#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX)
#define TEST_EXPENSIVE
#endif
#ifndef N_EXCLUSIVE
# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE
# define N_EXCLUSIVE 2
# warning defaulting to 2 concurrent exclusive sections
# else
# define N_EXCLUSIVE 1
# warning defaulting to 1 concurrent exclusive sections
# endif
#endif
#ifndef N_CYCLES
# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE
# define N_CYCLES 2
# warning defaulting to 2 CPU cycles
# else
# define N_CYCLES 1
# warning defaulting to 1 CPU cycles
# endif
#endif
// synchronization primitives. condition variables require a
// process-local "cond_t saved;" variable.
#define mutex_t byte
#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 }
#define MUTEX_UNLOCK(m) m = 0
#define cond_t int
#define COND_WAIT(c, m) { \
saved = c; \
MUTEX_UNLOCK(m); \
c != saved -> MUTEX_LOCK(m); \
}
#define COND_BROADCAST(c) c++
// this is the logic from cpus-common.c
mutex_t mutex;
cond_t exclusive_cond;
cond_t exclusive_resume;
byte pending_cpus;
byte running[N_CPUS];
byte has_waiter[N_CPUS];
#define exclusive_idle() \
do \
:: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \
:: else -> break; \
od
#define start_exclusive() \
MUTEX_LOCK(mutex); \
exclusive_idle(); \
pending_cpus = 1; \
\
i = 0; \
do \
:: i < N_CPUS -> { \
if \
:: running[i] -> has_waiter[i] = 1; pending_cpus++; \
:: else -> skip; \
fi; \
i++; \
} \
:: else -> break; \
od; \
\
do \
:: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \
:: else -> break; \
od; \
MUTEX_UNLOCK(mutex);
#define end_exclusive() \
MUTEX_LOCK(mutex); \
pending_cpus = 0; \
COND_BROADCAST(exclusive_resume); \
MUTEX_UNLOCK(mutex);
#ifdef USE_MUTEX
// Simple version using mutexes
#define cpu_exec_start(id) \
MUTEX_LOCK(mutex); \
exclusive_idle(); \
running[id] = 1; \
MUTEX_UNLOCK(mutex);
#define cpu_exec_end(id) \
MUTEX_LOCK(mutex); \
running[id] = 0; \
if \
:: pending_cpus -> { \
pending_cpus--; \
if \
:: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
:: else -> skip; \
fi; \
} \
:: else -> skip; \
fi; \
MUTEX_UNLOCK(mutex);
#else
// Wait-free fast path, only needs mutex when concurrent with
// an exclusive section
#define cpu_exec_start(id) \
running[id] = 1; \
if \
:: pending_cpus -> { \
MUTEX_LOCK(mutex); \
if \
:: !has_waiter[id] -> { \
running[id] = 0; \
exclusive_idle(); \
running[id] = 1; \
} \
:: else -> skip; \
fi; \
MUTEX_UNLOCK(mutex); \
} \
:: else -> skip; \
fi;
#define cpu_exec_end(id) \
running[id] = 0; \
if \
:: pending_cpus -> { \
MUTEX_LOCK(mutex); \
if \
:: has_waiter[id] -> { \
has_waiter[id] = 0; \
pending_cpus--; \
if \
:: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
:: else -> skip; \
fi; \
} \
:: else -> skip; \
fi; \
MUTEX_UNLOCK(mutex); \
} \
:: else -> skip; \
fi
#endif
// Promela processes
byte done_cpu;
byte in_cpu;
active[N_CPUS] proctype cpu()
{
byte id = _pid % N_CPUS;
byte cycles = 0;
cond_t saved;
do
:: cycles == N_CYCLES -> break;
:: else -> {
cycles++;
cpu_exec_start(id)
in_cpu++;
done_cpu++;
in_cpu--;
cpu_exec_end(id)
}
od;
}
byte done_exclusive;
byte in_exclusive;
active[N_EXCLUSIVE] proctype exclusive()
{
cond_t saved;
byte i;
start_exclusive();
in_exclusive = 1;
done_exclusive++;
in_exclusive = 0;
end_exclusive();
}
#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE)
#define SAFETY !(in_exclusive && in_cpu)
never { /* ! ([] SAFETY && <> [] LIVENESS) */
do
// once the liveness property is satisfied, this is not executable
// and the never clause is not accepted
:: ! LIVENESS -> accept_liveness: skip
:: 1 -> assert(SAFETY)
od;
}

diff --git a/exec.c b/exec.c
@@ -598,36 +598,11 @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 }
 #endif
 
-static bool cpu_index_auto_assigned;
-
-static int cpu_get_free_index(void)
-{
-    CPUState *some_cpu;
-    int cpu_index = 0;
-
-    cpu_index_auto_assigned = true;
-    CPU_FOREACH(some_cpu) {
-        cpu_index++;
-    }
-    return cpu_index;
-}
-
 void cpu_exec_exit(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    cpu_list_lock();
-    if (!QTAILQ_IN_USE(cpu, node)) {
-        /* there is nothing to undo since cpu_exec_init() hasn't been called */
-        cpu_list_unlock();
-        return;
-    }
-
-    assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ)));
-    QTAILQ_REMOVE(&cpus, cpu, node);
-    cpu->cpu_index = UNASSIGNED_CPU_INDEX;
-    cpu_list_unlock();
+    cpu_list_remove(cpu);
 
     if (cc->vmsd != NULL) {
         vmstate_unregister(NULL, cc->vmsd, cpu);
@@ -663,15 +638,7 @@ void cpu_exec_init(CPUState *cpu, Error **errp)
     object_ref(OBJECT(cpu->memory));
 #endif
 
-    cpu_list_lock();
-    if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) {
-        cpu->cpu_index = cpu_get_free_index();
-        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
-    } else {
-        assert(!cpu_index_auto_assigned);
-    }
-    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
-    cpu_list_unlock();
+    cpu_list_add(cpu);
 
 #ifndef CONFIG_USER_ONLY
     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {

diff --git (file name not preserved)
@@ -21,6 +21,7 @@
  */
 #include "qemu/osdep.h"
 #include "hw/i386/amd_iommu.h"
+#include "qemu/error-report.h"
 #include "trace.h"
 
 /* used AMD-Vi MMIO registers */
@@ -1066,13 +1067,18 @@ static const MemoryRegionOps mmio_mem_ops = {
     }
 };
 
-static void amdvi_iommu_notify_started(MemoryRegion *iommu)
+static void amdvi_iommu_notify_flag_changed(MemoryRegion *iommu,
+                                            IOMMUNotifierFlag old,
+                                            IOMMUNotifierFlag new)
 {
     AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
 
-    hw_error("device %02x.%02x.%x requires iommu notifier which is not "
-             "currently supported", as->bus_num, PCI_SLOT(as->devfn),
-             PCI_FUNC(as->devfn));
+    if (new & IOMMU_NOTIFIER_MAP) {
+        error_report("device %02x.%02x.%x requires iommu notifier which is not "
+                     "currently supported", as->bus_num, PCI_SLOT(as->devfn),
+                     PCI_FUNC(as->devfn));
+        exit(1);
+    }
 }
 
 static void amdvi_init(AMDVIState *s)
@@ -1080,7 +1086,7 @@ static void amdvi_init(AMDVIState *s)
     amdvi_iotlb_reset(s);
 
     s->iommu_ops.translate = amdvi_translate;
-    s->iommu_ops.notify_started = amdvi_iommu_notify_started;
+    s->iommu_ops.notify_flag_changed = amdvi_iommu_notify_flag_changed;
     s->devtab_len = 0;
     s->cmdbuf_len = 0;
     s->cmdbuf_head = 0;

diff --git (file name not preserved)
@@ -1974,14 +1974,20 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
     return ret;
 }
 
-static void vtd_iommu_notify_started(MemoryRegion *iommu)
+static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
+                                          IOMMUNotifierFlag old,
+                                          IOMMUNotifierFlag new)
 {
     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
 
-    hw_error("Device at bus %s addr %02x.%d requires iommu notifier which "
-             "is currently not supported by intel-iommu emulation",
-             vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
-             PCI_FUNC(vtd_as->devfn));
+    if (new & IOMMU_NOTIFIER_MAP) {
+        error_report("Device at bus %s addr %02x.%d requires iommu "
+                     "notifier which is currently not supported by "
+                     "intel-iommu emulation",
+                     vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
+                     PCI_FUNC(vtd_as->devfn));
+        exit(1);
+    }
 }
 
 static const VMStateDescription vtd_vmstate = {
@@ -2348,7 +2354,7 @@ static void vtd_init(IntelIOMMUState *s)
     memset(s->womask, 0, DMAR_REG_SIZE);
 
     s->iommu_ops.translate = vtd_iommu_translate;
-    s->iommu_ops.notify_started = vtd_iommu_notify_started;
+    s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
     s->root = 0;
     s->root_extended = false;
     s->dmar_enabled = false;

diff --git (file name not preserved)
@@ -125,7 +125,7 @@ static void kvm_apic_vapic_base_update(APICCommonState *s)
     }
 }
 
-static void kvm_apic_put(void *data)
+static void kvm_apic_put(CPUState *cs, void *data)
 {
     APICCommonState *s = data;
     struct kvm_lapic_state kapic;
@@ -146,10 +146,9 @@ static void kvm_apic_post_load(APICCommonState *s)
     run_on_cpu(CPU(s->cpu), kvm_apic_put, s);
 }
 
-static void do_inject_external_nmi(void *data)
+static void do_inject_external_nmi(CPUState *cpu, void *data)
 {
     APICCommonState *s = data;
-    CPUState *cpu = CPU(s->cpu);
     uint32_t lvt;
     int ret;
 

diff --git (file name not preserved)
@@ -483,7 +483,7 @@ typedef struct VAPICEnableTPRReporting {
     bool enable;
 } VAPICEnableTPRReporting;
 
-static void vapic_do_enable_tpr_reporting(void *data)
+static void vapic_do_enable_tpr_reporting(CPUState *cpu, void *data)
 {
     VAPICEnableTPRReporting *info = data;
 
@@ -734,10 +734,10 @@ static void vapic_realize(DeviceState *dev, Error **errp)
     nb_option_roms++;
 }
 
-static void do_vapic_enable(void *data)
+static void do_vapic_enable(CPUState *cs, void *data)
 {
     VAPICROMState *s = data;
-    X86CPU *cpu = X86_CPU(first_cpu);
+    X86CPU *cpu = X86_CPU(cs);
 
     static const uint8_t enabled = 1;
     cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled),

diff --git (file name not preserved)
@@ -416,7 +416,7 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
 }
 
 static Property ioapic_properties[] = {
-    DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x11),
+    DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x20),
     DEFINE_PROP_END_OF_LIST(),
 };
 

diff --git (file name not preserved)
@@ -54,11 +54,6 @@ typedef struct SpinState {
     SpinInfo spin[MAX_CPUS];
 } SpinState;
 
-typedef struct spin_kick {
-    PowerPCCPU *cpu;
-    SpinInfo *spin;
-} SpinKick;
-
 static void spin_reset(void *opaque)
 {
     SpinState *s = opaque;
@@ -89,16 +84,15 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env,
     env->tlb_dirty = true;
 }
 
-static void spin_kick(void *data)
+static void spin_kick(CPUState *cs, void *data)
 {
-    SpinKick *kick = data;
-    CPUState *cpu = CPU(kick->cpu);
-    CPUPPCState *env = &kick->cpu->env;
-    SpinInfo *curspin = kick->spin;
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+    SpinInfo *curspin = data;
     hwaddr map_size = 64 * 1024 * 1024;
     hwaddr map_start;
 
-    cpu_synchronize_state(cpu);
+    cpu_synchronize_state(cs);
     stl_p(&curspin->pir, env->spr[SPR_BOOKE_PIR]);
     env->nip = ldq_p(&curspin->addr) & (map_size - 1);
     env->gpr[3] = ldq_p(&curspin->r3);
@@ -112,10 +106,10 @@ static void spin_kick(void *data)
     map_start = ldq_p(&curspin->addr) & ~(map_size - 1);
     mmubooke_create_initial_mapping(env, 0, map_start, map_size);
 
-    cpu->halted = 0;
-    cpu->exception_index = -1;
-    cpu->stopped = false;
-    qemu_cpu_kick(cpu);
+    cs->halted = 0;
+    cs->exception_index = -1;
+    cs->stopped = false;
+    qemu_cpu_kick(cs);
 }
 
 static void spin_write(void *opaque, hwaddr addr, uint64_t value,
@@ -153,12 +147,7 @@ static void spin_write(void *opaque, hwaddr addr, uint64_t value,
     if (!(ldq_p(&curspin->addr) & 1)) {
         /* run CPU */
-        SpinKick kick = {
-            .cpu = POWERPC_CPU(cpu),
-            .spin = curspin,
-        };
-
-        run_on_cpu(cpu, spin_kick, &kick);
+        run_on_cpu(cpu, spin_kick, curspin);
     }
 }

diff --git (file name not preserved)
@@ -2132,10 +2132,8 @@ static void spapr_machine_finalizefn(Object *obj)
     g_free(spapr->kvm_type);
 }
 
-static void ppc_cpu_do_nmi_on_cpu(void *arg)
+static void ppc_cpu_do_nmi_on_cpu(CPUState *cs, void *arg)
 {
-    CPUState *cs = arg;
-
     cpu_synchronize_state(cs);
     ppc_cpu_do_system_reset(cs);
 }
@@ -2145,7 +2143,7 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
     CPUState *cs;
 
     CPU_FOREACH(cs) {
-        async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, cs);
+        async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, NULL);
     }
 }

diff --git (file name not preserved)
@@ -13,19 +13,18 @@
 #include "kvm_ppc.h"
 
 struct SPRSyncState {
-    CPUState *cs;
     int spr;
     target_ulong value;
     target_ulong mask;
 };
 
-static void do_spr_sync(void *arg)
+static void do_spr_sync(CPUState *cs, void *arg)
 {
     struct SPRSyncState *s = arg;
-    PowerPCCPU *cpu = POWERPC_CPU(s->cs);
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
     CPUPPCState *env = &cpu->env;
 
-    cpu_synchronize_state(s->cs);
+    cpu_synchronize_state(cs);
     env->spr[s->spr] &= ~s->mask;
     env->spr[s->spr] |= s->value;
 }
@@ -34,7 +33,6 @@ static void set_spr(CPUState *cs, int spr, target_ulong value,
                     target_ulong mask)
 {
     struct SPRSyncState s = {
-        .cs = cs,
         .spr = spr,
         .value = value,
         .mask = mask
@@ -909,17 +907,17 @@ static target_ulong cas_get_option_vector(int vector, target_ulong table)
 }
 
 typedef struct {
-    PowerPCCPU *cpu;
     uint32_t cpu_version;
     Error *err;
 } SetCompatState;
 
-static void do_set_compat(void *arg)
+static void do_set_compat(CPUState *cs, void *arg)
 {
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
     SetCompatState *s = arg;
 
-    cpu_synchronize_state(CPU(s->cpu));
-    ppc_set_compat(s->cpu, s->cpu_version, &s->err);
+    cpu_synchronize_state(cs);
+    ppc_set_compat(cpu, s->cpu_version, &s->err);
 }
 
 #define get_compat_level(cpuver) ( \
@@ -1015,7 +1013,6 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
     if (old_cpu_version != cpu_version) {
         CPU_FOREACH(cs) {
             SetCompatState s = {
-                .cpu = POWERPC_CPU(cs),
                 .cpu_version = cpu_version,
                 .err = NULL,
             };

diff --git (file name not preserved)
@@ -156,14 +156,17 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu)
     return 1ULL << tcet->page_shift;
 }
 
-static void spapr_tce_notify_started(MemoryRegion *iommu)
+static void spapr_tce_notify_flag_changed(MemoryRegion *iommu,
+                                          IOMMUNotifierFlag old,
+                                          IOMMUNotifierFlag new)
 {
-    spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true);
-}
+    struct sPAPRTCETable *tbl = container_of(iommu, sPAPRTCETable, iommu);
 
-static void spapr_tce_notify_stopped(MemoryRegion *iommu)
-{
-    spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false);
+    if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) {
+        spapr_tce_set_need_vfio(tbl, true);
+    } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) {
+        spapr_tce_set_need_vfio(tbl, false);
+    }
 }
 
 static int spapr_tce_table_post_load(void *opaque, int version_id)
@@ -246,8 +249,7 @@ static const VMStateDescription vmstate_spapr_tce_table = {
 static MemoryRegionIOMMUOps spapr_iommu_ops = {
     .translate = spapr_tce_translate_iommu,
     .get_min_page_size = spapr_tce_get_min_page_size,
-    .notify_started = spapr_tce_notify_started,
-    .notify_stopped = spapr_tce_notify_stopped,
+    .notify_flag_changed = spapr_tce_notify_flag_changed,
 };
 
 static int spapr_tce_table_realize(DeviceState *dev)

diff --git (file name not preserved)
@@ -293,11 +293,10 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
            section->offset_within_address_space & (1ULL << 63);
 }
 
-static void vfio_iommu_map_notify(Notifier *n, void *data)
+static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 {
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
-    IOMMUTLBEntry *iotlb = data;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
     MemoryRegion *mr;
     hwaddr xlat;
@@ -454,6 +453,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
                             section->offset_within_region;
         giommu->container = container;
         giommu->n.notify = vfio_iommu_map_notify;
+        giommu->n.notifier_flags = IOMMU_NOTIFIER_ALL;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
         memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);

diff --git (file name not preserved)
@@ -23,6 +23,11 @@ typedef struct CPUListState {
     FILE *file;
 } CPUListState;
 
+/* The CPU list lock nests outside tb_lock/tb_unlock.  */
+void qemu_init_cpu_list(void);
+void cpu_list_lock(void);
+void cpu_list_unlock(void);
+
 #if !defined(CONFIG_USER_ONLY)
 
 enum device_endian {

diff --git (file name not preserved)
@@ -56,17 +56,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
                               target_ulong pc, target_ulong cs_base,
                               uint32_t flags,
                               int cflags);
-#if defined(CONFIG_USER_ONLY)
-void cpu_list_lock(void);
-void cpu_list_unlock(void);
-#else
-static inline void cpu_list_unlock(void)
-{
-}
-static inline void cpu_list_lock(void)
-{
-}
-#endif
 
 void cpu_exec_init(CPUState *cpu, Error **errp);
 void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);

diff --git (file name not preserved)
@@ -67,6 +67,27 @@ struct IOMMUTLBEntry {
     IOMMUAccessFlags perm;
 };
 
+/*
+ * Bitmap for different IOMMUNotifier capabilities. Each notifier can
+ * register with one or multiple IOMMU Notifier capability bit(s).
+ */
+typedef enum {
+    IOMMU_NOTIFIER_NONE = 0,
+    /* Notify cache invalidations */
+    IOMMU_NOTIFIER_UNMAP = 0x1,
+    /* Notify entry changes (newly created entries) */
+    IOMMU_NOTIFIER_MAP = 0x2,
+} IOMMUNotifierFlag;
+
+#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP)
+
+struct IOMMUNotifier {
+    void (*notify)(struct IOMMUNotifier *notifier, IOMMUTLBEntry *data);
+    IOMMUNotifierFlag notifier_flags;
+    QLIST_ENTRY(IOMMUNotifier) node;
+};
+typedef struct IOMMUNotifier IOMMUNotifier;
+
 /* New-style MMIO accessors can indicate that the transaction failed.
  * A zero (MEMTX_OK) response means success; anything else is a failure
  * of some kind. The memory subsystem will bitwise-OR together results
@@ -153,10 +174,10 @@ struct MemoryRegionIOMMUOps {
     IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write);
     /* Returns minimum supported page size */
     uint64_t (*get_min_page_size)(MemoryRegion *iommu);
-    /* Called when the first notifier is set */
-    void (*notify_started)(MemoryRegion *iommu);
-    /* Called when the last notifier is removed */
-    void (*notify_stopped)(MemoryRegion *iommu);
+    /* Called when IOMMU Notifier flag changed */
+    void (*notify_flag_changed)(MemoryRegion *iommu,
+                                IOMMUNotifierFlag old_flags,
+                                IOMMUNotifierFlag new_flags);
 };
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
@@ -201,7 +222,8 @@ struct MemoryRegion {
     const char *name;
     unsigned ioeventfd_nb;
     MemoryRegionIoeventfd *ioeventfds;
-    NotifierList iommu_notify;
+    QLIST_HEAD(, IOMMUNotifier) iommu_notify;
+    IOMMUNotifierFlag iommu_notify_flags;
 };
 
 /**
@@ -607,6 +629,15 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr);
 /**
  * memory_region_notify_iommu: notify a change in an IOMMU translation entry.
  *
+ * The notification type will be decided by entry.perm bits:
+ *
+ * - For UNMAP (cache invalidation) notifies: set entry.perm to IOMMU_NONE.
+ * - For MAP (newly added entry) notifies: set entry.perm to the
+ *   permission of the page (which is definitely !IOMMU_NONE).
+ *
+ * Note: for any IOMMU implementation, an in-place mapping change
+ * should be notified with an UNMAP followed by a MAP.
+ *
  * @mr: the memory region that was changed
  * @entry: the new entry in the IOMMU translation table.  The entry
  *         replaces all old entries for the same virtual I/O address range.
@@ -620,11 +651,12 @@ void memory_region_notify_iommu(MemoryRegion *mr,
  * IOMMU translation entries.
  *
  * @mr: the memory region to observe
- * @n: the notifier to be added; the notifier receives a pointer to an
- *     #IOMMUTLBEntry as the opaque value; the pointer ceases to be
- *     valid on exit from the notifier.
+ * @n: the IOMMUNotifier to be added; the notify callback receives a
+ *     pointer to an #IOMMUTLBEntry as the opaque value; the pointer
+ *     ceases to be valid on exit from the notifier.
  */
-void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n);
+void memory_region_register_iommu_notifier(MemoryRegion *mr,
+                                           IOMMUNotifier *n);
 
 /**
  * memory_region_iommu_replay: replay existing IOMMU translations to
@@ -636,7 +668,8 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n);
  * @is_write: Whether to treat the replay as a translate "write"
  *     through the iommu
  */
-void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write);
+void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n,
+                                bool is_write);
 
 /**
  * memory_region_unregister_iommu_notifier: unregister a notifier for
@@ -646,7 +679,8 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write);
  *      needs to be called
  * @n: the notifier to be removed.
  */
-void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n);
+void memory_region_unregister_iommu_notifier(MemoryRegion *mr,
+                                             IOMMUNotifier *n);
 
 /**
  * memory_region_name: get a memory region's name
@@ -1154,12 +1188,11 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr,
                                        hwaddr addr, uint64_t size);
 
 /**
- * address_space_sync_dirty_bitmap: synchronize the dirty log for all memory
+ * memory_global_dirty_log_sync: synchronize the dirty log for all memory
  *
- * Synchronizes the dirty page log for an entire address space.
- * @as: the address space that contains the memory being synchronized
+ * Synchronizes the dirty page log for all address spaces.
  */
-void address_space_sync_dirty_bitmap(AddressSpace *as);
+void memory_global_dirty_log_sync(void);
 
 /**
  * memory_region_transaction_begin: Start a transaction.
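For illustration (not part of this commit), a client of the reworked API now
fills in an IOMMUNotifier instead of a plain Notifier; the callback and
variable names below are made up, everything else comes from the declarations
above and from the vfio hunk:

    /* Illustrative sketch of registering an UNMAP-only notifier. */
    static void my_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
    {
        /* Only invalidations arrive here, so iotlb->perm is IOMMU_NONE. */
    }

    static IOMMUNotifier my_notifier;

    static void my_register(MemoryRegion *iommu_mr)
    {
        my_notifier.notify = my_unmap_notify;
        my_notifier.notifier_flags = IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(iommu_mr, &my_notifier);
    }

An IOMMU model that cannot deliver MAP events can now reject such notifiers in
its notify_flag_changed hook, as the AMD and Intel IOMMU hunks above do.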

diff --git (file name not preserved)
@@ -38,7 +38,7 @@ struct TBContext {
     QemuMutex tb_lock;
 
     /* statistics */
-    int tb_flush_count;
+    unsigned tb_flush_count;
     int tb_phys_invalidate_count;
 };
 

diff --git (file name not preserved)
@@ -6,6 +6,10 @@
         .driver   = "virtio-pci",\
         .property = "page-per-vq",\
         .value    = "on",\
+    },{\
+        .driver   = "ioapic",\
+        .property = "version",\
+        .value    = "0x11",\
     },
 
 #define HW_COMPAT_2_6 \

diff --git (file name not preserved)
@@ -93,7 +93,7 @@ typedef struct VFIOGuestIOMMU {
     VFIOContainer *container;
     MemoryRegion *iommu;
     hwaddr iommu_offset;
-    Notifier n;
+    IOMMUNotifier n;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
 } VFIOGuestIOMMU;
 

diff --git (file name not preserved)
@@ -1,4 +1,8 @@
-/* public domain */
+/* compiler.h: macros to abstract away compiler specifics
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
 
 #ifndef COMPILER_H
 #define COMPILER_H

View File

@ -232,13 +232,8 @@ struct kvm_run;
#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
/* work queue */ /* work queue */
struct qemu_work_item { typedef void (*run_on_cpu_func)(CPUState *cpu, void *data);
struct qemu_work_item *next; struct qemu_work_item;
void (*func)(void *data);
void *data;
int done;
bool free;
};
/** /**
* CPUState: * CPUState:
@ -247,7 +242,9 @@ struct qemu_work_item {
* @nr_threads: Number of threads within this CPU. * @nr_threads: Number of threads within this CPU.
* @numa_node: NUMA node this CPU is belonging to. * @numa_node: NUMA node this CPU is belonging to.
* @host_tid: Host thread ID. * @host_tid: Host thread ID.
* @running: #true if CPU is currently running (usermode). * @running: #true if CPU is currently running (lockless).
* @has_waiter: #true if a CPU is currently waiting for the cpu_exec_end;
* valid under cpu_list_lock.
* @created: Indicates whether the CPU thread has been successfully created. * @created: Indicates whether the CPU thread has been successfully created.
* @interrupt_request: Indicates a pending interrupt request. * @interrupt_request: Indicates a pending interrupt request.
* @halted: Nonzero if the CPU is in suspended state. * @halted: Nonzero if the CPU is in suspended state.
@ -257,7 +254,6 @@ struct qemu_work_item {
* @crash_occurred: Indicates the OS reported a crash (panic) for this CPU * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
* @tcg_exit_req: Set to force TCG to stop executing linked TBs for this * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
* CPU and return to its top level loop. * CPU and return to its top level loop.
* @tb_flushed: Indicates the translation buffer has been flushed.
* @singlestep_enabled: Flags for single-stepping. * @singlestep_enabled: Flags for single-stepping.
* @icount_extra: Instructions until next timer event. * @icount_extra: Instructions until next timer event.
* @icount_decr: Number of cycles left, with interrupt flag in high bit. * @icount_decr: Number of cycles left, with interrupt flag in high bit.
@ -301,7 +297,7 @@ struct CPUState {
#endif #endif
int thread_id; int thread_id;
uint32_t host_tid; uint32_t host_tid;
bool running; bool running, has_waiter;
struct QemuCond *halt_cond; struct QemuCond *halt_cond;
bool thread_kicked; bool thread_kicked;
bool created; bool created;
@ -310,7 +306,6 @@ struct CPUState {
bool unplug; bool unplug;
bool crash_occurred; bool crash_occurred;
bool exit_request; bool exit_request;
bool tb_flushed;
uint32_t interrupt_request; uint32_t interrupt_request;
int singlestep_enabled; int singlestep_enabled;
int64_t icount_extra; int64_t icount_extra;
@ -542,6 +537,18 @@ static inline int cpu_asidx_from_attrs(CPUState *cpu, MemTxAttrs attrs)
} }
#endif #endif
/**
* cpu_list_add:
* @cpu: The CPU to be added to the list of CPUs.
*/
void cpu_list_add(CPUState *cpu);
/**
* cpu_list_remove:
* @cpu: The CPU to be removed from the list of CPUs.
*/
void cpu_list_remove(CPUState *cpu);
/** /**
* cpu_reset: * cpu_reset:
* @cpu: The CPU whose state is to be reset. * @cpu: The CPU whose state is to be reset.
@ -615,6 +622,18 @@ void qemu_cpu_kick(CPUState *cpu);
*/ */
bool cpu_is_stopped(CPUState *cpu); bool cpu_is_stopped(CPUState *cpu);
/**
* do_run_on_cpu:
* @cpu: The vCPU to run on.
* @func: The function to be executed.
* @data: Data to pass to the function.
* @mutex: Mutex to release while waiting for @func to run.
*
* Used internally in the implementation of run_on_cpu.
*/
void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
QemuMutex *mutex);
/** /**
* run_on_cpu: * run_on_cpu:
* @cpu: The vCPU to run on. * @cpu: The vCPU to run on.
@ -623,7 +642,7 @@ bool cpu_is_stopped(CPUState *cpu);
* *
* Schedules the function @func for execution on the vCPU @cpu. * Schedules the function @func for execution on the vCPU @cpu.
*/ */
void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
/** /**
* async_run_on_cpu: * async_run_on_cpu:
@ -633,7 +652,21 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data);
* *
* Schedules the function @func for execution on the vCPU @cpu asynchronously. * Schedules the function @func for execution on the vCPU @cpu asynchronously.
*/ */
void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
/**
* async_safe_run_on_cpu:
* @cpu: The vCPU to run on.
* @func: The function to be executed.
* @data: Data to pass to the function.
*
* Schedules the function @func for execution on the vCPU @cpu asynchronously,
* while all other vCPUs are sleeping.
*
* Unlike run_on_cpu and async_run_on_cpu, the function is run outside the
* BQL.
*/
void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
/** /**
* qemu_get_cpu: * qemu_get_cpu:
@ -793,6 +826,49 @@ void cpu_remove(CPUState *cpu);
*/ */
void cpu_remove_sync(CPUState *cpu); void cpu_remove_sync(CPUState *cpu);
/**
* process_queued_cpu_work() - process all items on CPU work queue
* @cpu: The CPU which work queue to process.
*/
void process_queued_cpu_work(CPUState *cpu);
/**
* cpu_exec_start:
* @cpu: The CPU for the current thread.
*
* Record that a CPU has started execution and can be interrupted with
* cpu_exit.
*/
void cpu_exec_start(CPUState *cpu);
/**
* cpu_exec_end:
* @cpu: The CPU for the current thread.
*
* Record that a CPU has stopped execution and exclusive sections
* can be executed without interrupting it.
*/
void cpu_exec_end(CPUState *cpu);
/**
* start_exclusive:
*
* Wait for a concurrent exclusive section to end, and then start
* a section of work that is run while other CPUs are not running
* between cpu_exec_start and cpu_exec_end. CPUs that are running
* cpu_exec are exited immediately. CPUs that call cpu_exec_start
* during the exclusive section go to sleep until this CPU calls
* end_exclusive.
*/
void start_exclusive(void);
/**
* end_exclusive:
*
* Concludes an exclusive execution section started by start_exclusive.
*/
void end_exclusive(void);
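Taken together, these comments describe a small protocol, and the linux-user cpu_loop changes further down follow it literally. A hedged sketch of how the two sides are meant to interleave (function names and the exclusive work itself are placeholders):

/* vCPU thread: guest execution sits between the start/end markers so an
 * exclusive worker can count running CPUs and kick them out of cpu_exec(). */
static void vcpu_step(CPUState *cpu)
{
    cpu_exec_start(cpu);
    cpu_exec(cpu);                  /* may return early after cpu_exit() */
    cpu_exec_end(cpu);
    process_queued_cpu_work(cpu);   /* run queued and "safe" work items */
}

/* Any thread: the region between the two calls runs while no vCPU is
 * inside its cpu_exec_start()/cpu_exec_end() window. */
static void with_all_cpus_stopped(void)
{
    start_exclusive();
    /* ... touch state that must not race with translated code ... */
    end_exclusive();
}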
/** /**
* qemu_init_vcpu: * qemu_init_vcpu:
* @cpu: The vCPU to initialize. * @cpu: The vCPU to initialize.

View File

@ -105,6 +105,8 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint);
/*! Disables storing events in the queue */ /*! Disables storing events in the queue */
void replay_disable_events(void); void replay_disable_events(void);
/*! Enables storing events in the queue */
void replay_enable_events(void);
/*! Returns true when saving events is enabled */ /*! Returns true when saving events is enabled */
bool replay_events_enabled(void); bool replay_events_enabled(void);
/*! Adds bottom half event to the queue */ /*! Adds bottom half event to the queue */
@ -115,6 +117,8 @@ void replay_input_event(QemuConsole *src, InputEvent *evt);
void replay_input_sync_event(void); void replay_input_sync_event(void);
/*! Adds block layer event to the queue */ /*! Adds block layer event to the queue */
void replay_block_event(QEMUBH *bh, uint64_t id); void replay_block_event(QEMUBH *bh, uint64_t id);
/*! Returns ID for the next block event */
uint64_t blkreplay_next_id(void);
/* Character device */ /* Character device */


@ -1847,10 +1847,8 @@ void kvm_flush_coalesced_mmio_buffer(void)
s->coalesced_flush_in_progress = false; s->coalesced_flush_in_progress = false;
} }
static void do_kvm_cpu_synchronize_state(void *arg) static void do_kvm_cpu_synchronize_state(CPUState *cpu, void *arg)
{ {
CPUState *cpu = arg;
if (!cpu->kvm_vcpu_dirty) { if (!cpu->kvm_vcpu_dirty) {
kvm_arch_get_registers(cpu); kvm_arch_get_registers(cpu);
cpu->kvm_vcpu_dirty = true; cpu->kvm_vcpu_dirty = true;
@ -1860,34 +1858,30 @@ static void do_kvm_cpu_synchronize_state(void *arg)
void kvm_cpu_synchronize_state(CPUState *cpu) void kvm_cpu_synchronize_state(CPUState *cpu)
{ {
if (!cpu->kvm_vcpu_dirty) { if (!cpu->kvm_vcpu_dirty) {
run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu); run_on_cpu(cpu, do_kvm_cpu_synchronize_state, NULL);
} }
} }
static void do_kvm_cpu_synchronize_post_reset(void *arg) static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, void *arg)
{ {
CPUState *cpu = arg;
kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
cpu->kvm_vcpu_dirty = false; cpu->kvm_vcpu_dirty = false;
} }
void kvm_cpu_synchronize_post_reset(CPUState *cpu) void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{ {
run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu); run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, NULL);
} }
static void do_kvm_cpu_synchronize_post_init(void *arg) static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, void *arg)
{ {
CPUState *cpu = arg;
kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
cpu->kvm_vcpu_dirty = false; cpu->kvm_vcpu_dirty = false;
} }
void kvm_cpu_synchronize_post_init(CPUState *cpu) void kvm_cpu_synchronize_post_init(CPUState *cpu)
{ {
run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu); run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, NULL);
} }
int kvm_cpu_exec(CPUState *cpu) int kvm_cpu_exec(CPUState *cpu)
@ -2216,7 +2210,7 @@ struct kvm_set_guest_debug_data {
int err; int err;
}; };
static void kvm_invoke_set_guest_debug(void *data) static void kvm_invoke_set_guest_debug(CPUState *unused_cpu, void *data)
{ {
struct kvm_set_guest_debug_data *dbg_data = data; struct kvm_set_guest_debug_data *dbg_data = data;
@ -2234,7 +2228,6 @@ int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
} }
kvm_arch_update_guest_debug(cpu, &data.dbg); kvm_arch_update_guest_debug(cpu, &data.dbg);
data.cpu = cpu;
run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data); run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
return data.err; return data.err;
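The conversion pattern here is the same one applied to every run_on_cpu helper in this series: the callback now receives the target CPU explicitly, so it no longer has to be smuggled through the opaque pointer. A minimal before/after sketch (helper names invented):

/* Before: the CPU travelled through the void pointer. */
static void old_style(void *arg)
{
    CPUState *cpu = arg;
    /* ... work on cpu ... */
}
/* run_on_cpu(cpu, old_style, cpu); */

/* After: run_on_cpu_func gets the CPU as its first argument, freeing the
 * opaque pointer for a real payload (or NULL, as in the hunks above). */
static void new_style(CPUState *cpu, void *arg)
{
    /* ... work on cpu; arg is optional payload ... */
}
/* run_on_cpu(cpu, new_style, NULL); */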


@ -107,21 +107,11 @@ int cpu_get_pic_interrupt(CPUX86State *env)
/***********************************************************/ /***********************************************************/
/* Helper routines for implementing atomic operations. */ /* Helper routines for implementing atomic operations. */
/* To implement exclusive operations we force all cpus to syncronise.
We don't require a full sync, only that no cpus are executing guest code.
The alternative is to map target atomic ops onto host equivalents,
which requires quite a lot of per host/target work. */
static pthread_mutex_t cpu_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t exclusive_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t exclusive_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t exclusive_resume = PTHREAD_COND_INITIALIZER;
static int pending_cpus;
/* Make sure everything is in a consistent state for calling fork(). */ /* Make sure everything is in a consistent state for calling fork(). */
void fork_start(void) void fork_start(void)
{ {
cpu_list_lock();
qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
pthread_mutex_lock(&exclusive_lock);
mmap_fork_start(); mmap_fork_start();
} }
@ -137,93 +127,15 @@ void fork_end(int child)
QTAILQ_REMOVE(&cpus, cpu, node); QTAILQ_REMOVE(&cpus, cpu, node);
} }
} }
pending_cpus = 0;
pthread_mutex_init(&exclusive_lock, NULL);
pthread_mutex_init(&cpu_list_mutex, NULL);
pthread_cond_init(&exclusive_cond, NULL);
pthread_cond_init(&exclusive_resume, NULL);
qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
qemu_init_cpu_list();
gdbserver_fork(thread_cpu); gdbserver_fork(thread_cpu);
} else { } else {
pthread_mutex_unlock(&exclusive_lock);
qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
cpu_list_unlock();
} }
} }
/* Wait for pending exclusive operations to complete. The exclusive lock
must be held. */
static inline void exclusive_idle(void)
{
while (pending_cpus) {
pthread_cond_wait(&exclusive_resume, &exclusive_lock);
}
}
/* Start an exclusive operation.
Must only be called from outside cpu_exec. */
static inline void start_exclusive(void)
{
CPUState *other_cpu;
pthread_mutex_lock(&exclusive_lock);
exclusive_idle();
pending_cpus = 1;
/* Make all other cpus stop executing. */
CPU_FOREACH(other_cpu) {
if (other_cpu->running) {
pending_cpus++;
cpu_exit(other_cpu);
}
}
if (pending_cpus > 1) {
pthread_cond_wait(&exclusive_cond, &exclusive_lock);
}
}
/* Finish an exclusive operation. */
static inline void __attribute__((unused)) end_exclusive(void)
{
pending_cpus = 0;
pthread_cond_broadcast(&exclusive_resume);
pthread_mutex_unlock(&exclusive_lock);
}
/* Wait for exclusive ops to finish, and begin cpu execution. */
static inline void cpu_exec_start(CPUState *cpu)
{
pthread_mutex_lock(&exclusive_lock);
exclusive_idle();
cpu->running = true;
pthread_mutex_unlock(&exclusive_lock);
}
/* Mark cpu as not executing, and release pending exclusive ops. */
static inline void cpu_exec_end(CPUState *cpu)
{
pthread_mutex_lock(&exclusive_lock);
cpu->running = false;
if (pending_cpus > 1) {
pending_cpus--;
if (pending_cpus == 1) {
pthread_cond_signal(&exclusive_cond);
}
}
exclusive_idle();
pthread_mutex_unlock(&exclusive_lock);
}
void cpu_list_lock(void)
{
pthread_mutex_lock(&cpu_list_mutex);
}
void cpu_list_unlock(void)
{
pthread_mutex_unlock(&cpu_list_mutex);
}
#ifdef TARGET_I386 #ifdef TARGET_I386
/***********************************************************/ /***********************************************************/
/* CPUX86 core interface */ /* CPUX86 core interface */
@ -296,6 +208,8 @@ void cpu_loop(CPUX86State *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch(trapnr) { switch(trapnr) {
case 0x80: case 0x80:
/* linux syscall from int $0x80 */ /* linux syscall from int $0x80 */
@ -737,6 +651,8 @@ void cpu_loop(CPUARMState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch(trapnr) { switch(trapnr) {
case EXCP_UDEF: case EXCP_UDEF:
{ {
@ -1073,6 +989,7 @@ void cpu_loop(CPUARMState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case EXCP_SWI: case EXCP_SWI:
@ -1161,6 +1078,8 @@ void cpu_loop(CPUUniCore32State *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case UC32_EXCP_PRIV: case UC32_EXCP_PRIV:
{ {
@ -1366,6 +1285,7 @@ void cpu_loop (CPUSPARCState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
/* Compute PSR before exposing state. */ /* Compute PSR before exposing state. */
if (env->cc_op != CC_OP_FLAGS) { if (env->cc_op != CC_OP_FLAGS) {
@ -1638,6 +1558,8 @@ void cpu_loop(CPUPPCState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch(trapnr) { switch(trapnr) {
case POWERPC_EXCP_NONE: case POWERPC_EXCP_NONE:
/* Just go on */ /* Just go on */
@ -2484,6 +2406,8 @@ void cpu_loop(CPUMIPSState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch(trapnr) { switch(trapnr) {
case EXCP_SYSCALL: case EXCP_SYSCALL:
env->active_tc.PC += 4; env->active_tc.PC += 4;
@ -2724,6 +2648,7 @@ void cpu_loop(CPUOpenRISCState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
gdbsig = 0; gdbsig = 0;
switch (trapnr) { switch (trapnr) {
@ -2818,6 +2743,7 @@ void cpu_loop(CPUSH4State *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case 0x160: case 0x160:
@ -2884,6 +2810,8 @@ void cpu_loop(CPUCRISState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case 0xaa: case 0xaa:
{ {
@ -2949,6 +2877,8 @@ void cpu_loop(CPUMBState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case 0xaa: case 0xaa:
{ {
@ -3066,6 +2996,8 @@ void cpu_loop(CPUM68KState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch(trapnr) { switch(trapnr) {
case EXCP_ILLEGAL: case EXCP_ILLEGAL:
{ {
@ -3209,6 +3141,7 @@ void cpu_loop(CPUAlphaState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
/* All of the traps imply a transition through PALcode, which /* All of the traps imply a transition through PALcode, which
implies an REI instruction has been executed. Which means implies an REI instruction has been executed. Which means
@ -3401,6 +3334,8 @@ void cpu_loop(CPUS390XState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case EXCP_INTERRUPT: case EXCP_INTERRUPT:
/* Just indicate that signals should be handled asap. */ /* Just indicate that signals should be handled asap. */
@ -3710,6 +3645,8 @@ void cpu_loop(CPUTLGState *env)
cpu_exec_start(cs); cpu_exec_start(cs);
trapnr = cpu_exec(cs); trapnr = cpu_exec(cs);
cpu_exec_end(cs); cpu_exec_end(cs);
process_queued_cpu_work(cs);
switch (trapnr) { switch (trapnr) {
case TILEGX_EXCP_SYSCALL: case TILEGX_EXCP_SYSCALL:
{ {
@ -3769,6 +3706,16 @@ void cpu_loop(CPUTLGState *env)
THREAD CPUState *thread_cpu; THREAD CPUState *thread_cpu;
bool qemu_cpu_is_self(CPUState *cpu)
{
return thread_cpu == cpu;
}
void qemu_cpu_kick(CPUState *cpu)
{
cpu_exit(cpu);
}
void task_settid(TaskState *ts) void task_settid(TaskState *ts)
{ {
if (ts->ts_tid == 0) { if (ts->ts_tid == 0) {
@ -4211,6 +4158,7 @@ int main(int argc, char **argv, char **envp)
int ret; int ret;
int execfd; int execfd;
qemu_init_cpu_list();
module_call_init(MODULE_INIT_QOM); module_call_init(MODULE_INIT_QOM);
if ((envlist = envlist_create()) == NULL) { if ((envlist = envlist_create()) == NULL) {

memory.c

@ -158,14 +158,10 @@ static bool memory_listener_match(MemoryListener *listener,
/* No need to ref/unref .mr, the FlatRange keeps it alive. */ /* No need to ref/unref .mr, the FlatRange keeps it alive. */
#define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ #define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \
MEMORY_LISTENER_CALL(callback, dir, (&(MemoryRegionSection) { \ do { \
.mr = (fr)->mr, \ MemoryRegionSection mrs = section_from_flat_range(fr, as); \
.address_space = (as), \ MEMORY_LISTENER_CALL(callback, dir, &mrs, ##_args); \
.offset_within_region = (fr)->offset_in_region, \ } while(0)
.size = (fr)->addr.size, \
.offset_within_address_space = int128_get64((fr)->addr.start), \
.readonly = (fr)->readonly, \
}), ##_args)
struct CoalescedMemoryRange { struct CoalescedMemoryRange {
AddrRange addr; AddrRange addr;
@ -245,6 +241,19 @@ typedef struct AddressSpaceOps AddressSpaceOps;
#define FOR_EACH_FLAT_RANGE(var, view) \ #define FOR_EACH_FLAT_RANGE(var, view) \
for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var)
static inline MemoryRegionSection
section_from_flat_range(FlatRange *fr, AddressSpace *as)
{
return (MemoryRegionSection) {
.mr = fr->mr,
.address_space = as,
.offset_within_region = fr->offset_in_region,
.size = fr->addr.size,
.offset_within_address_space = int128_get64(fr->addr.start),
.readonly = fr->readonly,
};
}
static bool flatrange_equal(FlatRange *a, FlatRange *b) static bool flatrange_equal(FlatRange *a, FlatRange *b)
{ {
return a->mr == b->mr return a->mr == b->mr
@ -1413,7 +1422,8 @@ void memory_region_init_iommu(MemoryRegion *mr,
memory_region_init(mr, owner, name, size); memory_region_init(mr, owner, name, size);
mr->iommu_ops = ops, mr->iommu_ops = ops,
mr->terminates = true; /* then re-forwards */ mr->terminates = true; /* then re-forwards */
notifier_list_init(&mr->iommu_notify); QLIST_INIT(&mr->iommu_notify);
mr->iommu_notify_flags = IOMMU_NOTIFIER_NONE;
} }
static void memory_region_finalize(Object *obj) static void memory_region_finalize(Object *obj)
@ -1508,13 +1518,31 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client)
return memory_region_get_dirty_log_mask(mr) & (1 << client); return memory_region_get_dirty_log_mask(mr) & (1 << client);
} }
void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) static void memory_region_update_iommu_notify_flags(MemoryRegion *mr)
{ {
if (mr->iommu_ops->notify_started && IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE;
QLIST_EMPTY(&mr->iommu_notify.notifiers)) { IOMMUNotifier *iommu_notifier;
mr->iommu_ops->notify_started(mr);
QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
flags |= iommu_notifier->notifier_flags;
} }
notifier_list_add(&mr->iommu_notify, n);
if (flags != mr->iommu_notify_flags &&
mr->iommu_ops->notify_flag_changed) {
mr->iommu_ops->notify_flag_changed(mr, mr->iommu_notify_flags,
flags);
}
mr->iommu_notify_flags = flags;
}
void memory_region_register_iommu_notifier(MemoryRegion *mr,
IOMMUNotifier *n)
{
/* We need to register for at least one bitfield */
assert(n->notifier_flags != IOMMU_NOTIFIER_NONE);
QLIST_INSERT_HEAD(&mr->iommu_notify, n, node);
memory_region_update_iommu_notify_flags(mr);
} }
uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr)
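For a consumer, the new flag plumbing means a notifier declares up front which events it wants. A hedged registration sketch, using only the IOMMUNotifier fields visible in this patch (the callback and the attach helper are invented):

/* Called for events whose type matches the notifier's flags. */
static void my_iommu_map_cb(IOMMUNotifier *n, IOMMUTLBEntry *entry)
{
    /* entry->perm carries IOMMU_RW bits for a map, none for an unmap */
}

static IOMMUNotifier my_notifier;

static void attach_map_notifier(MemoryRegion *iommu_mr)
{
    my_notifier.notify = my_iommu_map_cb;
    my_notifier.notifier_flags = IOMMU_NOTIFIER_MAP;   /* MAP events only */
    memory_region_register_iommu_notifier(iommu_mr, &my_notifier);
    /* memory_region_notify_iommu() below will now skip this notifier for
     * UNMAP-only invalidations, and the aggregated flags are reported to
     * the region via notify_flag_changed(). */
}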
@ -1526,7 +1554,8 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr)
return TARGET_PAGE_SIZE; return TARGET_PAGE_SIZE;
} }
void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n,
bool is_write)
{ {
hwaddr addr, granularity; hwaddr addr, granularity;
IOMMUTLBEntry iotlb; IOMMUTLBEntry iotlb;
@ -1547,20 +1576,32 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write)
} }
} }
void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) void memory_region_unregister_iommu_notifier(MemoryRegion *mr,
IOMMUNotifier *n)
{ {
notifier_remove(n); QLIST_REMOVE(n, node);
if (mr->iommu_ops->notify_stopped && memory_region_update_iommu_notify_flags(mr);
QLIST_EMPTY(&mr->iommu_notify.notifiers)) {
mr->iommu_ops->notify_stopped(mr);
}
} }
void memory_region_notify_iommu(MemoryRegion *mr, void memory_region_notify_iommu(MemoryRegion *mr,
IOMMUTLBEntry entry) IOMMUTLBEntry entry)
{ {
IOMMUNotifier *iommu_notifier;
IOMMUNotifierFlag request_flags;
assert(memory_region_is_iommu(mr)); assert(memory_region_is_iommu(mr));
notifier_list_notify(&mr->iommu_notify, &entry);
if (entry.perm & IOMMU_RW) {
request_flags = IOMMU_NOTIFIER_MAP;
} else {
request_flags = IOMMU_NOTIFIER_UNMAP;
}
QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
if (iommu_notifier->notifier_flags & request_flags) {
iommu_notifier->notify(iommu_notifier, &entry);
}
}
} }
void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
@ -2124,16 +2165,27 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr)
return mr && mr != container; return mr && mr != container;
} }
void address_space_sync_dirty_bitmap(AddressSpace *as) void memory_global_dirty_log_sync(void)
{ {
MemoryListener *listener;
AddressSpace *as;
FlatView *view; FlatView *view;
FlatRange *fr; FlatRange *fr;
view = address_space_get_flatview(as); QTAILQ_FOREACH(listener, &memory_listeners, link) {
FOR_EACH_FLAT_RANGE(fr, view) { if (!listener->log_sync) {
MEMORY_LISTENER_UPDATE_REGION(fr, as, Forward, log_sync); continue;
}
/* Global listeners are being phased out. */
assert(listener->address_space_filter);
as = listener->address_space_filter;
view = address_space_get_flatview(as);
FOR_EACH_FLAT_RANGE(fr, view) {
MemoryRegionSection mrs = section_from_flat_range(fr, as);
listener->log_sync(listener, &mrs);
}
flatview_unref(view);
} }
flatview_unref(view);
} }
void memory_global_dirty_log_start(void) void memory_global_dirty_log_start(void)
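From a listener's point of view the rewrite only changes who walks the flat view: any listener that provides a log_sync callback (and was registered with an address-space filter, which the new code asserts) is now visited directly. A hedged sketch of such a listener; the callback body and names are placeholders:

/* Invoked once per FlatRange of the listener's filtered address space. */
static void my_log_sync(MemoryListener *listener, MemoryRegionSection *mrs)
{
    /* ... pull the dirty bitmap for mrs->mr / mrs->offset_within_region ... */
}

static MemoryListener my_dirty_listener = {
    .log_sync = my_log_sync,
    /* registered elsewhere with memory_listener_register() and a non-NULL
     * AddressSpace filter, as memory_global_dirty_log_sync() expects */
};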


@ -626,7 +626,7 @@ static void migration_bitmap_sync(void)
} }
trace_migration_bitmap_sync_start(); trace_migration_bitmap_sync_start();
address_space_sync_dirty_bitmap(&address_space_memory); memory_global_dirty_log_sync();
qemu_mutex_lock(&migration_bitmap_mutex); qemu_mutex_lock(&migration_bitmap_mutex);
rcu_read_lock(); rcu_read_lock();


@ -4,3 +4,4 @@ common-obj-y += replay-events.o
common-obj-y += replay-time.o common-obj-y += replay-time.o
common-obj-y += replay-input.o common-obj-y += replay-input.o
common-obj-y += replay-char.o common-obj-y += replay-char.o
common-obj-y += replay-snapshot.o


@ -279,7 +279,7 @@ static Event *replay_read_event(int checkpoint)
/* Called with replay mutex locked */ /* Called with replay mutex locked */
void replay_read_events(int checkpoint) void replay_read_events(int checkpoint)
{ {
while (replay_data_kind == EVENT_ASYNC) { while (replay_state.data_kind == EVENT_ASYNC) {
Event *event = replay_read_event(checkpoint); Event *event = replay_read_event(checkpoint);
if (!event) { if (!event) {
break; break;
@ -309,3 +309,11 @@ bool replay_events_enabled(void)
{ {
return events_enabled; return events_enabled;
} }
uint64_t blkreplay_next_id(void)
{
if (replay_events_enabled()) {
return replay_state.block_request_id++;
}
return 0;
}
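The counter is deliberately process-global, so a hedged sketch of the expected call pattern from a block driver looks like this (the request struct and helpers are hypothetical):

/* Hypothetical per-request bookkeeping, for illustration only. */
typedef struct MyBlkRequest {
    uint64_t id;
    QEMUBH *bh;
} MyBlkRequest;

static void my_blk_submit(MyBlkRequest *req)
{
    req->id = blkreplay_next_id();          /* returns 0 when replay is off */
    /* ... issue the actual I/O ... */
}

static void my_blk_complete(MyBlkRequest *req)
{
    replay_block_event(req->bh, req->id);   /* completion ordered by the log */
}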


@ -16,11 +16,8 @@
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "sysemu/sysemu.h" #include "sysemu/sysemu.h"
unsigned int replay_data_kind = -1;
static unsigned int replay_has_unread_data;
/* Mutex to protect reading and writing events to the log. /* Mutex to protect reading and writing events to the log.
replay_data_kind and replay_has_unread_data are also protected data_kind and has_unread_data are also protected
by this mutex. by this mutex.
It also protects replay events queue which stores events to be It also protects replay events queue which stores events to be
written or read to the log. */ written or read to the log. */
@ -150,15 +147,16 @@ void replay_check_error(void)
void replay_fetch_data_kind(void) void replay_fetch_data_kind(void)
{ {
if (replay_file) { if (replay_file) {
if (!replay_has_unread_data) { if (!replay_state.has_unread_data) {
replay_data_kind = replay_get_byte(); replay_state.data_kind = replay_get_byte();
if (replay_data_kind == EVENT_INSTRUCTION) { if (replay_state.data_kind == EVENT_INSTRUCTION) {
replay_state.instructions_count = replay_get_dword(); replay_state.instructions_count = replay_get_dword();
} }
replay_check_error(); replay_check_error();
replay_has_unread_data = 1; replay_state.has_unread_data = 1;
if (replay_data_kind >= EVENT_COUNT) { if (replay_state.data_kind >= EVENT_COUNT) {
error_report("Replay: unknown event kind %d", replay_data_kind); error_report("Replay: unknown event kind %d",
replay_state.data_kind);
exit(1); exit(1);
} }
} }
@ -167,7 +165,7 @@ void replay_fetch_data_kind(void)
void replay_finish_event(void) void replay_finish_event(void)
{ {
replay_has_unread_data = 0; replay_state.has_unread_data = 0;
replay_fetch_data_kind(); replay_fetch_data_kind();
} }


@ -62,11 +62,19 @@ typedef struct ReplayState {
uint64_t current_step; uint64_t current_step;
/*! Number of instructions to be executed before other events happen. */ /*! Number of instructions to be executed before other events happen. */
int instructions_count; int instructions_count;
/*! Type of the currently executed event. */
unsigned int data_kind;
/*! Flag which indicates that event is not processed yet. */
unsigned int has_unread_data;
/*! Temporary variable for saving current log offset. */
uint64_t file_offset;
/*! Next block operation id.
This counter is global, because requests from different
block devices should not get overlapping ids. */
uint64_t block_request_id;
} ReplayState; } ReplayState;
extern ReplayState replay_state; extern ReplayState replay_state;
extern unsigned int replay_data_kind;
/* File for replay writing */ /* File for replay writing */
extern FILE *replay_file; extern FILE *replay_file;
@ -98,7 +106,7 @@ void replay_check_error(void);
the next event from the log. */ the next event from the log. */
void replay_finish_event(void); void replay_finish_event(void);
/*! Reads data type from the file and stores it in the /*! Reads data type from the file and stores it in the
replay_data_kind variable. */ data_kind variable. */
void replay_fetch_data_kind(void); void replay_fetch_data_kind(void);
/*! Saves queued events (like instructions and sound). */ /*! Saves queued events (like instructions and sound). */
@ -119,8 +127,6 @@ void replay_read_next_clock(unsigned int kind);
void replay_init_events(void); void replay_init_events(void);
/*! Clears internal data structures for events handling */ /*! Clears internal data structures for events handling */
void replay_finish_events(void); void replay_finish_events(void);
/*! Enables storing events in the queue */
void replay_enable_events(void);
/*! Flushes events queue */ /*! Flushes events queue */
void replay_flush_events(void); void replay_flush_events(void);
/*! Clears events list before loading new VM state */ /*! Clears events list before loading new VM state */
@ -155,4 +161,11 @@ void replay_event_char_read_save(void *opaque);
/*! Reads char event read from the file. */ /*! Reads char event read from the file. */
void *replay_event_char_read_load(void); void *replay_event_char_read_load(void);
/* VMState-related functions */
/* Registers replay VMState.
Should be called before virtual devices initialization
to make cached timers available for post_load functions. */
void replay_vmstate_register(void);
#endif #endif

replay/replay-snapshot.c (new file)

@ -0,0 +1,61 @@
/*
* replay-snapshot.c
*
* Copyright (c) 2010-2016 Institute for System Programming
* of the Russian Academy of Sciences.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "sysemu/replay.h"
#include "replay-internal.h"
#include "sysemu/sysemu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qstring.h"
#include "qemu/error-report.h"
#include "migration/vmstate.h"
static void replay_pre_save(void *opaque)
{
ReplayState *state = opaque;
state->file_offset = ftell(replay_file);
}
static int replay_post_load(void *opaque, int version_id)
{
ReplayState *state = opaque;
fseek(replay_file, state->file_offset, SEEK_SET);
/* If this vmstate was saved in recording mode,
we need to initialize the replay data fields. */
replay_fetch_data_kind();
return 0;
}
static const VMStateDescription vmstate_replay = {
.name = "replay",
.version_id = 1,
.minimum_version_id = 1,
.pre_save = replay_pre_save,
.post_load = replay_post_load,
.fields = (VMStateField[]) {
VMSTATE_INT64_ARRAY(cached_clock, ReplayState, REPLAY_CLOCK_COUNT),
VMSTATE_UINT64(current_step, ReplayState),
VMSTATE_INT32(instructions_count, ReplayState),
VMSTATE_UINT32(data_kind, ReplayState),
VMSTATE_UINT32(has_unread_data, ReplayState),
VMSTATE_UINT64(file_offset, ReplayState),
VMSTATE_UINT64(block_request_id, ReplayState),
VMSTATE_END_OF_LIST()
},
};
void replay_vmstate_register(void)
{
vmstate_register(NULL, 0, &vmstate_replay, &replay_state);
}


@ -31,7 +31,7 @@ int64_t replay_save_clock(ReplayClockKind kind, int64_t clock)
void replay_read_next_clock(ReplayClockKind kind) void replay_read_next_clock(ReplayClockKind kind)
{ {
unsigned int read_kind = replay_data_kind - EVENT_CLOCK; unsigned int read_kind = replay_state.data_kind - EVENT_CLOCK;
assert(read_kind == kind); assert(read_kind == kind);


@ -38,15 +38,15 @@ bool replay_next_event_is(int event)
/* nothing to skip - not all instructions used */ /* nothing to skip - not all instructions used */
if (replay_state.instructions_count != 0) { if (replay_state.instructions_count != 0) {
assert(replay_data_kind == EVENT_INSTRUCTION); assert(replay_state.data_kind == EVENT_INSTRUCTION);
return event == EVENT_INSTRUCTION; return event == EVENT_INSTRUCTION;
} }
while (true) { while (true) {
if (event == replay_data_kind) { if (event == replay_state.data_kind) {
res = true; res = true;
} }
switch (replay_data_kind) { switch (replay_state.data_kind) {
case EVENT_SHUTDOWN: case EVENT_SHUTDOWN:
replay_finish_event(); replay_finish_event();
qemu_system_shutdown_request(); qemu_system_shutdown_request();
@ -85,7 +85,7 @@ void replay_account_executed_instructions(void)
replay_state.instructions_count -= count; replay_state.instructions_count -= count;
replay_state.current_step += count; replay_state.current_step += count;
if (replay_state.instructions_count == 0) { if (replay_state.instructions_count == 0) {
assert(replay_data_kind == EVENT_INSTRUCTION); assert(replay_state.data_kind == EVENT_INSTRUCTION);
replay_finish_event(); replay_finish_event();
/* Wake up iothread. This is required because /* Wake up iothread. This is required because
timers will not expire until clock counters timers will not expire until clock counters
@ -188,7 +188,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint)
if (replay_mode == REPLAY_MODE_PLAY) { if (replay_mode == REPLAY_MODE_PLAY) {
if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) { if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) {
replay_finish_event(); replay_finish_event();
} else if (replay_data_kind != EVENT_ASYNC) { } else if (replay_state.data_kind != EVENT_ASYNC) {
res = false; res = false;
goto out; goto out;
} }
@ -196,7 +196,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint)
/* replay_read_events may leave some unread events. /* replay_read_events may leave some unread events.
Return false if not all of the events associated with Return false if not all of the events associated with
checkpoint were processed */ checkpoint were processed */
res = replay_data_kind != EVENT_ASYNC; res = replay_state.data_kind != EVENT_ASYNC;
} else if (replay_mode == REPLAY_MODE_RECORD) { } else if (replay_mode == REPLAY_MODE_RECORD) {
replay_put_event(EVENT_CHECKPOINT + checkpoint); replay_put_event(EVENT_CHECKPOINT + checkpoint);
replay_save_events(checkpoint); replay_save_events(checkpoint);
@ -237,9 +237,10 @@ static void replay_enable(const char *fname, int mode)
replay_filename = g_strdup(fname); replay_filename = g_strdup(fname);
replay_mode = mode; replay_mode = mode;
replay_data_kind = -1; replay_state.data_kind = -1;
replay_state.instructions_count = 0; replay_state.instructions_count = 0;
replay_state.current_step = 0; replay_state.current_step = 0;
replay_state.has_unread_data = 0;
/* skip file header for RECORD and check it for PLAY */ /* skip file header for RECORD and check it for PLAY */
if (replay_mode == REPLAY_MODE_RECORD) { if (replay_mode == REPLAY_MODE_RECORD) {
@ -291,6 +292,7 @@ void replay_configure(QemuOpts *opts)
exit(1); exit(1);
} }
replay_vmstate_register();
replay_enable(fname, mode); replay_enable(fname, mode);
out: out:


@ -2407,7 +2407,7 @@ sub process {
# we have e.g. CONFIG_LINUX and CONFIG_WIN32 for common cases # we have e.g. CONFIG_LINUX and CONFIG_WIN32 for common cases
# where they might be necessary. # where they might be necessary.
if ($line =~ m@^.\s*\#\s*if.*\b__@) { if ($line =~ m@^.\s*\#\s*if.*\b__@) {
ERROR("architecture specific defines should be avoided\n" . $herecurr); WARN("architecture specific defines should be avoided\n" . $herecurr);
} }
# Check that the storage class is at the beginning of a declaration # Check that the storage class is at the beginning of a declaration


@ -67,3 +67,8 @@ void replay_char_read_all_save_buf(uint8_t *buf, int offset)
void replay_block_event(QEMUBH *bh, uint64_t id) void replay_block_event(QEMUBH *bh, uint64_t id)
{ {
} }
uint64_t blkreplay_next_id(void)
{
return 0;
}


@ -1113,7 +1113,6 @@ out:
typedef struct MCEInjectionParams { typedef struct MCEInjectionParams {
Monitor *mon; Monitor *mon;
X86CPU *cpu;
int bank; int bank;
uint64_t status; uint64_t status;
uint64_t mcg_status; uint64_t mcg_status;
@ -1122,14 +1121,14 @@ typedef struct MCEInjectionParams {
int flags; int flags;
} MCEInjectionParams; } MCEInjectionParams;
static void do_inject_x86_mce(void *data) static void do_inject_x86_mce(CPUState *cs, void *data)
{ {
MCEInjectionParams *params = data; MCEInjectionParams *params = data;
CPUX86State *cenv = &params->cpu->env; X86CPU *cpu = X86_CPU(cs);
CPUState *cpu = CPU(params->cpu); CPUX86State *cenv = &cpu->env;
uint64_t *banks = cenv->mce_banks + 4 * params->bank; uint64_t *banks = cenv->mce_banks + 4 * params->bank;
cpu_synchronize_state(cpu); cpu_synchronize_state(cs);
/* /*
* If there is an MCE exception being processed, ignore this SRAO MCE * If there is an MCE exception being processed, ignore this SRAO MCE
@ -1149,7 +1148,7 @@ static void do_inject_x86_mce(void *data)
if ((cenv->mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { if ((cenv->mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) {
monitor_printf(params->mon, monitor_printf(params->mon,
"CPU %d: Uncorrected error reporting disabled\n", "CPU %d: Uncorrected error reporting disabled\n",
cpu->cpu_index); cs->cpu_index);
return; return;
} }
@ -1161,7 +1160,7 @@ static void do_inject_x86_mce(void *data)
monitor_printf(params->mon, monitor_printf(params->mon,
"CPU %d: Uncorrected error reporting disabled for" "CPU %d: Uncorrected error reporting disabled for"
" bank %d\n", " bank %d\n",
cpu->cpu_index, params->bank); cs->cpu_index, params->bank);
return; return;
} }
@ -1170,7 +1169,7 @@ static void do_inject_x86_mce(void *data)
monitor_printf(params->mon, monitor_printf(params->mon,
"CPU %d: Previous MCE still in progress, raising" "CPU %d: Previous MCE still in progress, raising"
" triple fault\n", " triple fault\n",
cpu->cpu_index); cs->cpu_index);
qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); qemu_log_mask(CPU_LOG_RESET, "Triple fault\n");
qemu_system_reset_request(); qemu_system_reset_request();
return; return;
@ -1182,7 +1181,7 @@ static void do_inject_x86_mce(void *data)
banks[3] = params->misc; banks[3] = params->misc;
cenv->mcg_status = params->mcg_status; cenv->mcg_status = params->mcg_status;
banks[1] = params->status; banks[1] = params->status;
cpu_interrupt(cpu, CPU_INTERRUPT_MCE); cpu_interrupt(cs, CPU_INTERRUPT_MCE);
} else if (!(banks[1] & MCI_STATUS_VAL) } else if (!(banks[1] & MCI_STATUS_VAL)
|| !(banks[1] & MCI_STATUS_UC)) { || !(banks[1] & MCI_STATUS_UC)) {
if (banks[1] & MCI_STATUS_VAL) { if (banks[1] & MCI_STATUS_VAL) {
@ -1204,7 +1203,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank,
CPUX86State *cenv = &cpu->env; CPUX86State *cenv = &cpu->env;
MCEInjectionParams params = { MCEInjectionParams params = {
.mon = mon, .mon = mon,
.cpu = cpu,
.bank = bank, .bank = bank,
.status = status, .status = status,
.mcg_status = mcg_status, .mcg_status = mcg_status,
@ -1245,7 +1243,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank,
if (other_cs == cs) { if (other_cs == cs) {
continue; continue;
} }
params.cpu = X86_CPU(other_cs);
run_on_cpu(other_cs, do_inject_x86_mce, &params); run_on_cpu(other_cs, do_inject_x86_mce, &params);
} }
} }


@ -150,10 +150,8 @@ static int kvm_get_tsc(CPUState *cs)
return 0; return 0;
} }
static inline void do_kvm_synchronize_tsc(void *arg) static inline void do_kvm_synchronize_tsc(CPUState *cpu, void *arg)
{ {
CPUState *cpu = arg;
kvm_get_tsc(cpu); kvm_get_tsc(cpu);
} }
@ -163,7 +161,7 @@ void kvm_synchronize_all_tsc(void)
if (kvm_enabled()) { if (kvm_enabled()) {
CPU_FOREACH(cpu) { CPU_FOREACH(cpu) {
run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu); run_on_cpu(cpu, do_kvm_synchronize_tsc, NULL);
} }
} }
} }


@ -164,7 +164,7 @@ static void s390_cpu_machine_reset_cb(void *opaque)
{ {
S390CPU *cpu = opaque; S390CPU *cpu = opaque;
run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, CPU(cpu)); run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, NULL);
} }
#endif #endif
@ -220,7 +220,7 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp)
s390_cpu_gdb_init(cs); s390_cpu_gdb_init(cs);
qemu_init_vcpu(cs); qemu_init_vcpu(cs);
#if !defined(CONFIG_USER_ONLY) #if !defined(CONFIG_USER_ONLY)
run_on_cpu(cs, s390_do_cpu_full_reset, cs); run_on_cpu(cs, s390_do_cpu_full_reset, NULL);
#else #else
cpu_reset(cs); cpu_reset(cs);
#endif #endif


@ -502,17 +502,14 @@ static inline hwaddr decode_basedisp_s(CPUS390XState *env, uint32_t ipb,
#define decode_basedisp_rs decode_basedisp_s #define decode_basedisp_rs decode_basedisp_s
/* helper functions for run_on_cpu() */ /* helper functions for run_on_cpu() */
static inline void s390_do_cpu_reset(void *arg) static inline void s390_do_cpu_reset(CPUState *cs, void *arg)
{ {
CPUState *cs = arg;
S390CPUClass *scc = S390_CPU_GET_CLASS(cs); S390CPUClass *scc = S390_CPU_GET_CLASS(cs);
scc->cpu_reset(cs); scc->cpu_reset(cs);
} }
static inline void s390_do_cpu_full_reset(void *arg) static inline void s390_do_cpu_full_reset(CPUState *cs, void *arg)
{ {
CPUState *cs = arg;
cpu_reset(cs); cpu_reset(cs);
} }


@ -1385,7 +1385,6 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb)
} }
typedef struct SigpInfo { typedef struct SigpInfo {
S390CPU *cpu;
uint64_t param; uint64_t param;
int cc; int cc;
uint64_t *status_reg; uint64_t *status_reg;
@ -1398,38 +1397,40 @@ static void set_sigp_status(SigpInfo *si, uint64_t status)
si->cc = SIGP_CC_STATUS_STORED; si->cc = SIGP_CC_STATUS_STORED;
} }
static void sigp_start(void *arg) static void sigp_start(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) {
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
return; return;
} }
s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); s390_cpu_set_state(CPU_STATE_OPERATING, cpu);
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_stop(void *arg) static void sigp_stop(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
struct kvm_s390_irq irq = { struct kvm_s390_irq irq = {
.type = KVM_S390_SIGP_STOP, .type = KVM_S390_SIGP_STOP,
}; };
if (s390_cpu_get_state(si->cpu) != CPU_STATE_OPERATING) { if (s390_cpu_get_state(cpu) != CPU_STATE_OPERATING) {
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
return; return;
} }
/* disabled wait - sleeping in user space */ /* disabled wait - sleeping in user space */
if (CPU(si->cpu)->halted) { if (cs->halted) {
s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu); s390_cpu_set_state(CPU_STATE_STOPPED, cpu);
} else { } else {
/* execute the stop function */ /* execute the stop function */
si->cpu->env.sigp_order = SIGP_STOP; cpu->env.sigp_order = SIGP_STOP;
kvm_s390_vcpu_interrupt(si->cpu, &irq); kvm_s390_vcpu_interrupt(cpu, &irq);
} }
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
@ -1496,56 +1497,58 @@ static int kvm_s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch)
return 0; return 0;
} }
static void sigp_stop_and_store_status(void *arg) static void sigp_stop_and_store_status(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
struct kvm_s390_irq irq = { struct kvm_s390_irq irq = {
.type = KVM_S390_SIGP_STOP, .type = KVM_S390_SIGP_STOP,
}; };
/* disabled wait - sleeping in user space */ /* disabled wait - sleeping in user space */
if (s390_cpu_get_state(si->cpu) == CPU_STATE_OPERATING && if (s390_cpu_get_state(cpu) == CPU_STATE_OPERATING && cs->halted) {
CPU(si->cpu)->halted) { s390_cpu_set_state(CPU_STATE_STOPPED, cpu);
s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu);
} }
switch (s390_cpu_get_state(si->cpu)) { switch (s390_cpu_get_state(cpu)) {
case CPU_STATE_OPERATING: case CPU_STATE_OPERATING:
si->cpu->env.sigp_order = SIGP_STOP_STORE_STATUS; cpu->env.sigp_order = SIGP_STOP_STORE_STATUS;
kvm_s390_vcpu_interrupt(si->cpu, &irq); kvm_s390_vcpu_interrupt(cpu, &irq);
/* store will be performed when handling the stop intercept */ /* store will be performed when handling the stop intercept */
break; break;
case CPU_STATE_STOPPED: case CPU_STATE_STOPPED:
/* already stopped, just store the status */ /* already stopped, just store the status */
cpu_synchronize_state(CPU(si->cpu)); cpu_synchronize_state(cs);
kvm_s390_store_status(si->cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true); kvm_s390_store_status(cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true);
break; break;
} }
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_store_status_at_address(void *arg) static void sigp_store_status_at_address(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
uint32_t address = si->param & 0x7ffffe00u; uint32_t address = si->param & 0x7ffffe00u;
/* cpu has to be stopped */ /* cpu has to be stopped */
if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) {
set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); set_sigp_status(si, SIGP_STAT_INCORRECT_STATE);
return; return;
} }
cpu_synchronize_state(CPU(si->cpu)); cpu_synchronize_state(cs);
if (kvm_s390_store_status(si->cpu, address, false)) { if (kvm_s390_store_status(cpu, address, false)) {
set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER);
return; return;
} }
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_store_adtl_status(void *arg) static void sigp_store_adtl_status(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
if (!s390_has_feat(S390_FEAT_VECTOR)) { if (!s390_has_feat(S390_FEAT_VECTOR)) {
@ -1554,7 +1557,7 @@ static void sigp_store_adtl_status(void *arg)
} }
/* cpu has to be stopped */ /* cpu has to be stopped */
if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) {
set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); set_sigp_status(si, SIGP_STAT_INCORRECT_STATE);
return; return;
} }
@ -1565,31 +1568,32 @@ static void sigp_store_adtl_status(void *arg)
return; return;
} }
cpu_synchronize_state(CPU(si->cpu)); cpu_synchronize_state(cs);
if (kvm_s390_store_adtl_status(si->cpu, si->param)) { if (kvm_s390_store_adtl_status(cpu, si->param)) {
set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER);
return; return;
} }
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_restart(void *arg) static void sigp_restart(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
struct kvm_s390_irq irq = { struct kvm_s390_irq irq = {
.type = KVM_S390_RESTART, .type = KVM_S390_RESTART,
}; };
switch (s390_cpu_get_state(si->cpu)) { switch (s390_cpu_get_state(cpu)) {
case CPU_STATE_STOPPED: case CPU_STATE_STOPPED:
/* the restart irq has to be delivered prior to any other pending irq */ /* the restart irq has to be delivered prior to any other pending irq */
cpu_synchronize_state(CPU(si->cpu)); cpu_synchronize_state(cs);
do_restart_interrupt(&si->cpu->env); do_restart_interrupt(&cpu->env);
s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); s390_cpu_set_state(CPU_STATE_OPERATING, cpu);
break; break;
case CPU_STATE_OPERATING: case CPU_STATE_OPERATING:
kvm_s390_vcpu_interrupt(si->cpu, &irq); kvm_s390_vcpu_interrupt(cpu, &irq);
break; break;
} }
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
@ -1597,20 +1601,18 @@ static void sigp_restart(void *arg)
int kvm_s390_cpu_restart(S390CPU *cpu) int kvm_s390_cpu_restart(S390CPU *cpu)
{ {
SigpInfo si = { SigpInfo si = {};
.cpu = cpu,
};
run_on_cpu(CPU(cpu), sigp_restart, &si); run_on_cpu(CPU(cpu), sigp_restart, &si);
DPRINTF("DONE: KVM cpu restart: %p\n", &cpu->env); DPRINTF("DONE: KVM cpu restart: %p\n", &cpu->env);
return 0; return 0;
} }
static void sigp_initial_cpu_reset(void *arg) static void sigp_initial_cpu_reset(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
S390CPUClass *scc = S390_CPU_GET_CLASS(cpu);
SigpInfo *si = arg; SigpInfo *si = arg;
CPUState *cs = CPU(si->cpu);
S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu);
cpu_synchronize_state(cs); cpu_synchronize_state(cs);
scc->initial_cpu_reset(cs); scc->initial_cpu_reset(cs);
@ -1618,11 +1620,11 @@ static void sigp_initial_cpu_reset(void *arg)
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_cpu_reset(void *arg) static void sigp_cpu_reset(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
S390CPUClass *scc = S390_CPU_GET_CLASS(cpu);
SigpInfo *si = arg; SigpInfo *si = arg;
CPUState *cs = CPU(si->cpu);
S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu);
cpu_synchronize_state(cs); cpu_synchronize_state(cs);
scc->cpu_reset(cs); scc->cpu_reset(cs);
@ -1630,12 +1632,13 @@ static void sigp_cpu_reset(void *arg)
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
static void sigp_set_prefix(void *arg) static void sigp_set_prefix(CPUState *cs, void *arg)
{ {
S390CPU *cpu = S390_CPU(cs);
SigpInfo *si = arg; SigpInfo *si = arg;
uint32_t addr = si->param & 0x7fffe000u; uint32_t addr = si->param & 0x7fffe000u;
cpu_synchronize_state(CPU(si->cpu)); cpu_synchronize_state(cs);
if (!address_space_access_valid(&address_space_memory, addr, if (!address_space_access_valid(&address_space_memory, addr,
sizeof(struct LowCore), false)) { sizeof(struct LowCore), false)) {
@ -1644,13 +1647,13 @@ static void sigp_set_prefix(void *arg)
} }
/* cpu has to be stopped */ /* cpu has to be stopped */
if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) {
set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); set_sigp_status(si, SIGP_STAT_INCORRECT_STATE);
return; return;
} }
si->cpu->env.psa = addr; cpu->env.psa = addr;
cpu_synchronize_post_init(CPU(si->cpu)); cpu_synchronize_post_init(cs);
si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; si->cc = SIGP_CC_ORDER_CODE_ACCEPTED;
} }
@ -1658,7 +1661,6 @@ static int handle_sigp_single_dst(S390CPU *dst_cpu, uint8_t order,
uint64_t param, uint64_t *status_reg) uint64_t param, uint64_t *status_reg)
{ {
SigpInfo si = { SigpInfo si = {
.cpu = dst_cpu,
.param = param, .param = param,
.status_reg = status_reg, .status_reg = status_reg,
}; };


@ -126,7 +126,7 @@ static int modified_clear_reset(S390CPU *cpu)
pause_all_vcpus(); pause_all_vcpus();
cpu_synchronize_all_states(); cpu_synchronize_all_states();
CPU_FOREACH(t) { CPU_FOREACH(t) {
run_on_cpu(t, s390_do_cpu_full_reset, t); run_on_cpu(t, s390_do_cpu_full_reset, NULL);
} }
s390_cmma_reset(); s390_cmma_reset();
subsystem_reset(); subsystem_reset();
@ -145,7 +145,7 @@ static int load_normal_reset(S390CPU *cpu)
pause_all_vcpus(); pause_all_vcpus();
cpu_synchronize_all_states(); cpu_synchronize_all_states();
CPU_FOREACH(t) { CPU_FOREACH(t) {
run_on_cpu(t, s390_do_cpu_reset, t); run_on_cpu(t, s390_do_cpu_reset, NULL);
} }
s390_cmma_reset(); s390_cmma_reset();
subsystem_reset(); subsystem_reset();


@ -834,12 +834,19 @@ static void page_flush_tb(void)
} }
/* flush all the translation blocks */ /* flush all the translation blocks */
/* XXX: tb_flush is currently not thread safe */ static void do_tb_flush(CPUState *cpu, void *data)
void tb_flush(CPUState *cpu)
{ {
if (!tcg_enabled()) { unsigned tb_flush_req = (unsigned) (uintptr_t) data;
return;
tb_lock();
/* If it's already been done on request of another CPU,
* just retry.
*/
if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_req) {
goto done;
} }
#if defined(DEBUG_FLUSH) #if defined(DEBUG_FLUSH)
printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
(unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer), (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@ -858,7 +865,6 @@ void tb_flush(CPUState *cpu)
for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) { for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) {
atomic_set(&cpu->tb_jmp_cache[i], NULL); atomic_set(&cpu->tb_jmp_cache[i], NULL);
} }
atomic_mb_set(&cpu->tb_flushed, true);
} }
tcg_ctx.tb_ctx.nb_tbs = 0; tcg_ctx.tb_ctx.nb_tbs = 0;
@ -868,7 +874,19 @@ void tb_flush(CPUState *cpu)
tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
/* XXX: flush processor icache at this point if cache flush is /* XXX: flush processor icache at this point if cache flush is
expensive */ expensive */
tcg_ctx.tb_ctx.tb_flush_count++; atomic_mb_set(&tcg_ctx.tb_ctx.tb_flush_count,
tcg_ctx.tb_ctx.tb_flush_count + 1);
done:
tb_unlock();
}
void tb_flush(CPUState *cpu)
{
if (tcg_enabled()) {
uintptr_t tb_flush_req = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count);
async_safe_run_on_cpu(cpu, do_tb_flush, (void *) tb_flush_req);
}
} }
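The flush-count idiom above is what makes concurrent tb_flush() requests collapse into a single flush: each caller snapshots the generation counter, and the deferred worker only acts if the counter still matches. A standalone model of just that idiom, using plain C11 atomics instead of the QEMU types (the QEMU version additionally defers the worker through async_safe_run_on_cpu() and takes tb_lock):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint flush_count;      /* models tcg_ctx.tb_ctx.tb_flush_count */

/* "req" is the generation the requester observed when it asked for a flush. */
static void do_flush(unsigned req)
{
    if (atomic_load(&flush_count) != req) {
        return;                      /* someone else already flushed */
    }
    /* ... the expensive flush would happen here ... */
    atomic_store(&flush_count, req + 1);
}

int main(void)
{
    unsigned req = atomic_load(&flush_count);
    do_flush(req);                   /* performs the flush, bumps the count */
    do_flush(req);                   /* stale request: skipped */
    printf("flush_count=%u\n", atomic_load(&flush_count));
    return 0;
}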
#ifdef DEBUG_TB_CHECK #ifdef DEBUG_TB_CHECK
@ -1175,9 +1193,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
buffer_overflow: buffer_overflow:
/* flush must be done */ /* flush must be done */
tb_flush(cpu); tb_flush(cpu);
/* cannot fail at this point */ mmap_unlock();
tb = tb_alloc(pc); cpu_loop_exit(cpu);
assert(tb != NULL);
} }
gen_code_buf = tcg_ctx.code_gen_ptr; gen_code_buf = tcg_ctx.code_gen_ptr;
@ -1775,7 +1792,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
qht_statistics_destroy(&hst); qht_statistics_destroy(&hst);
cpu_fprintf(f, "\nStatistics:\n"); cpu_fprintf(f, "\nStatistics:\n");
cpu_fprintf(f, "TB flush count %d\n", tcg_ctx.tb_ctx.tb_flush_count); cpu_fprintf(f, "TB flush count %u\n",
atomic_read(&tcg_ctx.tb_ctx.tb_flush_count));
cpu_fprintf(f, "TB invalidate count %d\n", cpu_fprintf(f, "TB invalidate count %d\n",
tcg_ctx.tb_ctx.tb_phys_invalidate_count); tcg_ctx.tb_ctx.tb_phys_invalidate_count);
cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count); cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count);

vl.c

@ -784,6 +784,7 @@ void vm_start(void)
if (runstate_is_running()) { if (runstate_is_running()) {
qapi_event_send_stop(&error_abort); qapi_event_send_stop(&error_abort);
} else { } else {
replay_enable_events();
cpu_enable_ticks(); cpu_enable_ticks();
runstate_set(RUN_STATE_RUNNING); runstate_set(RUN_STATE_RUNNING);
vm_state_notify(1, RUN_STATE_RUNNING); vm_state_notify(1, RUN_STATE_RUNNING);
@ -3019,6 +3020,7 @@ int main(int argc, char **argv, char **envp)
Error *err = NULL; Error *err = NULL;
bool list_data_dirs = false; bool list_data_dirs = false;
qemu_init_cpu_list();
qemu_init_cpu_loop(); qemu_init_cpu_loop();
qemu_mutex_lock_iothread(); qemu_mutex_lock_iothread();