Merge branch 'master' of git://git.qemu.org/qemu into prep-up

Conflicts:
	hw/Makefile.objs
	hw/ppc_prep.c

Signed-off-by: Andreas Färber <andreas.faerber@web.de>
2013-01-10 21:52:28 +01:00 · 2013-01-10 21:52:28 +01:00 · 63e3555e80
parent 52a71bff60 a6308bc222
commit 63e3555e80
1953 changed files with 109215 additions and 41440 deletions
--- a/.exrc
+++ b/.exrc
@ -0,0 +1,7 @@
+"VIM settings to match QEMU coding style.  They are activated by adding the
+"following settings (without the " symbol) as last two lines in $HOME/.vimrc:
+"set secure
+"set exrc
+set expandtab
+set shiftwidth=4
+set smarttab
--- a/.gitignore
+++ b/.gitignore
@ -1,19 +1,18 @@
 config-devices.*
 config-all-devices.*
+config-all-disas.*
 config-host.*
 config-target.*
-trace.h
-trace.c
-trace-dtrace.h
-trace-dtrace.dtrace
+trace/generated-tracers.h
+trace/generated-tracers.c
+trace/generated-tracers-dtrace.h
+trace/generated-tracers-dtrace.dtrace
 *-timestamp
 *-softmmu
 *-darwin-user
 *-linux-user
 *-bsd-user
 libdis*
-libhw32
-libhw64
 libuser
 linux-headers/asm
 qapi-generated
@ -49,6 +48,7 @@ test-qmp-output-visitor
 test-string-input-visitor
 test-string-output-visitor
 test-visitor-serialization
+fsdev/virtfs-proxy-helper
 fsdev/virtfs-proxy-helper.1
 fsdev/virtfs-proxy-helper.pod
 .gdbinit
@ -70,6 +70,7 @@ fsdev/virtfs-proxy-helper.pod
 *.tp
 *.vr
 *.d
+!scripts/qemu-guest-agent/fsfreeze-hook.d
 *.o
 *.lo
 *.la
--- a/.gitmodules
+++ b/.gitmodules
@ -19,3 +19,6 @@
 [submodule "roms/sgabios"]
 	path = roms/sgabios
 	url = git://git.qemu.org/sgabios.git
+[submodule "pixman"]
+	path = pixman
+	url = git://anongit.freedesktop.org/pixman
--- a/31
+++ b/31
@ -32,7 +32,7 @@ mandatory for VMState fields.

 Don't use Linux kernel internal types like u32, __u32 or __le32.

-Use target_phys_addr_t for guest physical addresses except pcibus_t
+Use hwaddr for guest physical addresses except pcibus_t
 for PCI addresses.  In addition, ram_addr_t is a QEMU internal address
 space that maps guest RAM physical addresses into an intermediate
 address space that can map to host virtual address spaces.  Generally
@ -91,10 +91,11 @@ emulators.

 4. String manipulation

-Do not use the strncpy function.  According to the man page, it does
-*not* guarantee a NULL-terminated buffer, which makes it extremely dangerous
-to use.  Instead, use functionally equivalent function:
-void pstrcpy(char *buf, int buf_size, const char *str)
+Do not use the strncpy function.  As mentioned in the man page, it does *not*
+guarantee a NULL-terminated buffer, which makes it extremely dangerous to use.
+It also zeros trailing destination bytes out to the specified length.  Instead,
+use this similar function when possible, but note its different signature:
+void pstrcpy(char *dest, int dest_buf_size, const char *src)

 Don't use strcat because it can't check for buffer overflows, but:
 char *pstrcat(char *buf, int buf_size, const char *s)
@ -122,3 +123,23 @@ gcc's printf attribute directive in the prototype.
 This makes it so gcc's -Wformat and -Wformat-security options can do
 their jobs and cross-check format strings with the number and types
 of arguments.
+
+6. C standard, implementation defined and undefined behaviors
+
+C code in QEMU should be written to the C99 language specification. A copy
+of the final version of the C99 standard with corrigenda TC1, TC2, and TC3
+included, formatted as a draft, can be downloaded from:
+ http://www.open-std.org/jtc1/sc22/WG14/www/docs/n1256.pdf
+
+The C language specification defines regions of undefined behavior and
+implementation defined behavior (to give compiler authors enough leeway to
+produce better code).  In general, code in QEMU should follow the language
+specification and avoid both undefined and implementation defined
+constructs. ("It works fine on the gcc I tested it with" is not a valid
+argument...) However there are a few areas where we allow ourselves to
+assume certain behaviors because in practice all the platforms we care about
+behave in the same way and writing strictly conformant code would be
+painful. These are:
+ * you may assume that integers are 2s complement representation
+ * you may assume that right shift of a signed integer duplicates
+   the sign bit (ie it is an arithmetic shift, not a logical shift)
--- a/87
+++ b/87
@ -132,7 +132,7 @@ Guest CPU Cores (KVM):
 ----------------------

 Overall
-M: Avi Kivity <avi@redhat.com>
+M: Gleb Natapov <gleb@redhat.com>
 M: Marcelo Tosatti <mtosatti@redhat.com>
 L: kvm@vger.kernel.org
 S: Supported
@ -150,7 +150,7 @@ S: Maintained
 F: target-s390x/kvm.c

 X86
-M: Avi Kivity <avi@redhat.com>
+M: Gleb Natapov <gleb@redhat.com>
 M: Marcelo Tosatti <mtosatti@redhat.com>
 L: kvm@vger.kernel.org
 S: Supported
@ -268,6 +268,7 @@ S: Maintained
 F: hw/xilinx_zynq.c
 F: hw/zynq_slcr.c
 F: hw/cadence_*
+F: hw/xilinx_spips.c

 CRIS Machines
 -------------
@ -349,9 +350,31 @@ PowerPC Machines
 405
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
-S: Maintained
+S: Odd Fixes
 F: hw/ppc405_boards.c

+Bamboo
+M: Alexander Graf <agraf@suse.de>
+L: qemu-ppc@nongnu.org
+S: Odd Fixes
+F: hw/ppc440_bamboo.c
+
+e500
+M: Alexander Graf <agraf@suse.de>
+M: Scott Wood <scottwood@freescale.com>
+L: qemu-ppc@nongnu.org
+S: Supported
+F: hw/ppc/e500.[hc]
+F: hw/ppc/e500plat.c
+
+mpc8544ds
+M: Alexander Graf <agraf@suse.de>
+M: Scott Wood <scottwood@freescale.com>
+L: qemu-ppc@nongnu.org
+S: Supported
+F: hw/ppc/mpc8544ds.c
+F: hw/mpc8544_guts.c
+
 New World
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
@ -375,6 +398,19 @@ F: hw/ppc_prep.c
 F: hw/prep_pci.[hc]
 F: hw/pc87312.[hc]

+sPAPR
+M: David Gibson <david@gibson.dropbear.id.au>
+M: Alexander Graf <agraf@suse.de>
+L: qemu-ppc@nongnu.org
+S: Supported
+F: hw/spapr*
+
+virtex_ml507
+M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
+L: qemu-ppc@nongnu.org
+S: Odd Fixes
+F: hw/virtex_ml507.c
+
 SH4 Machines
 ------------
 R2D
@ -399,6 +435,12 @@ M: Blue Swirl <blauwirbel@gmail.com>
 S: Maintained
 F: hw/sun4u.c

+Leon3
+M: Fabien Chouteau <chouteau@adacore.com>
+S: Maintained
+F: hw/leon3.c
+F: hw/grlib*
+
 S390 Machines
 -------------
 S390 Virtio
@ -449,9 +491,23 @@ F: hw/omap*
 PCI
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
+F: hw/pci/*
 F: hw/pci*
 F: hw/piix*

+ppc4xx
+M: Alexander Graf <agraf@suse.de>
+L: qemu-ppc@nongnu.org
+S: Odd Fixes
+F: hw/ppc4xx*.[hc]
+
+ppce500
+M: Alexander Graf <agraf@suse.de>
+M: Scott Wood <scottwood@freescale.com>
+L: qemu-ppc@nongnu.org
+S: Supported
+F: hw/ppce500_*
+
 SCSI
 M: Paolo Bonzini <pbonzini@redhat.com>
 S: Supported
@ -464,11 +520,22 @@ M: Paul Brook <paul@codesourcery.com>
 S: Odd Fixes
 F: hw/lsi53c895a.c

+SSI
+M: Peter Crosthwaite <peter.crosthwaite@petalogix.com>
+S: Maintained
+F: hw/ssi.*
+F: hw/m25p80.c
+
 USB
 M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb*

+VFIO
+M: Alex Williamson <alex.williamson@redhat.com>
+S: Supported
+F: hw/vfio*
+
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
@ -488,6 +555,7 @@ T: git git://github.com/kvaneesh/QEMU.git

 virtio-blk
 M: Kevin Wolf <kwolf@redhat.com>
+M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Supported
 F: hw/virtio-blk*

@ -507,6 +575,7 @@ F: hw/xilinx_intc.c
 F: hw/xilinx_ethlite.c
 F: hw/xilinx_timer.c
 F: hw/xilinx.h
+F: hw/xilinx_spi.c

 Subsystems
 ----------
@ -517,6 +586,7 @@ F: audio/

 Block
 M: Kevin Wolf <kwolf@redhat.com>
+M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Supported
 F: block*
 F: block/
@ -526,6 +596,13 @@ M: Anthony Liguori <aliguori@us.ibm.com>
 S: Maintained
 F: qemu-char.c

+CPU
+M: Andreas Färber <afaerber@suse.de>
+S: Supported
+F: qom/cpu.c
+F: include/qemu/cpu.h
+F: target-i386/cpu.c
+
 Device Tree
 M: Peter Crosthwaite <peter.crosthwaite@petalogix.com>
 M: Alexander Graf <agraf@suse.de>
@ -569,7 +646,7 @@ F: monitor.c

 Network device layer
 M: Anthony Liguori <aliguori@us.ibm.com>
-M: Stefan Hajnoczi <stefanha@gmail.com>
+M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Maintained
 F: net/
 T: git git://github.com/stefanha/qemu.git net
@ -589,7 +666,7 @@ F: slirp/
 T: git git://git.kiszka.org/qemu.git queues/slirp

 Tracing
-M: Stefan Hajnoczi <stefanha@gmail.com>
+M: Stefan Hajnoczi <stefanha@redhat.com>
 S: Maintained
 F: trace/
 F: scripts/tracetool.py
--- a/116
+++ b/116
@ -8,22 +8,38 @@ ifneq ($(wildcard config-host.mak),)
 # Put the all: rule here so that config-host.mak can contain dependencies.
 all:
 include config-host.mak
+
+# Check that we're not trying to do an out-of-tree build from
+# a tree that's been used for an in-tree build.
+ifneq ($(realpath $(SRC_PATH)),$(realpath .))
+ifneq ($(wildcard $(SRC_PATH)/config-host.mak),)
+$(error This is an out of tree build but your source tree ($(SRC_PATH)) \
+seems to have been used for an in-tree build. You can fix this by running \
+"make distclean && rm -rf *-linux-user *-softmmu" in your source tree)
+endif
+endif
+
 include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@sed -n "/.*Configured with/s/[^:]*: //p" $@ | sh
 else
 config-host.mak:
+ifneq ($(filter-out %clean,$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
 	@echo "Please call configure before running make!"
 	@exit 1
 endif
-
-GENERATED_HEADERS = config-host.h trace.h qemu-options.def
-ifeq ($(TRACE_BACKEND),dtrace)
-GENERATED_HEADERS += trace-dtrace.h
 endif
+
+GENERATED_HEADERS = config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h
-GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c trace.c
+GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c
+
+GENERATED_HEADERS += trace/generated-tracers.h
+ifeq ($(TRACE_BACKEND),dtrace)
+GENERATED_HEADERS += trace/generated-tracers-dtrace.h
+endif
+GENERATED_SOURCES += trace/generated-tracers.c

 # Don't try to regenerate Makefile or configure
 # We don't generate any of them
@ -52,8 +68,13 @@ SUBDIR_MAKEFLAGS=$(if $(V),,--no-print-directory) BUILD_DIR=$(BUILD_DIR)
 SUBDIR_DEVICES_MAK=$(patsubst %, %/config-devices.mak, $(TARGET_DIRS))
 SUBDIR_DEVICES_MAK_DEP=$(patsubst %, %/config-devices.mak.d, $(TARGET_DIRS))

+ifeq ($(SUBDIR_DEVICES_MAK),)
+config-all-devices.mak:
+	$(call quiet-command,echo '# no devices' > $@,"  GEN   $@")
+else
 config-all-devices.mak: $(SUBDIR_DEVICES_MAK)
 	$(call quiet-command,cat $(SUBDIR_DEVICES_MAK) | grep =y | sort -u > $@,"  GEN   $@")
+endif

 -include $(SUBDIR_DEVICES_MAK_DEP)

@ -81,6 +102,7 @@ defconfig:
 	rm -f config-all-devices.mak $(SUBDIR_DEVICES_MAK)

 -include config-all-devices.mak
+-include config-all-disas.mak

 all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all

@ -100,9 +122,20 @@ endif

 subdir-libcacard: $(oslib-obj-y) $(trace-obj-y) qemu-timer-common.o

-$(filter %-softmmu,$(SUBDIR_RULES)): $(universal-obj-y) $(trace-obj-y) $(common-obj-y) $(extra-obj-y) subdir-libdis
+subdir-pixman: pixman/Makefile
+	$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C pixman V="$(V)" all,)

-$(filter %-user,$(SUBDIR_RULES)): $(universal-obj-y) $(trace-obj-y) subdir-libdis-user subdir-libuser
+pixman/Makefile: $(SRC_PATH)/pixman/configure
+	(cd pixman; CFLAGS="$(CFLAGS) -fPIC" $(SRC_PATH)/pixman/configure $(AUTOCONF_HOST) --disable-gtk --disable-shared --enable-static)
+
+$(SRC_PATH)/pixman/configure:
+	(cd $(SRC_PATH)/pixman; autoreconf -v --install)
+
+$(SUBDIR_RULES): libqemustub.a
+
+$(filter %-softmmu,$(SUBDIR_RULES)): $(universal-obj-y) $(trace-obj-y) $(common-obj-y) $(extra-obj-y)
+
+$(filter %-user,$(SUBDIR_RULES)): $(universal-obj-y) $(trace-obj-y) $(user-obj-y)

 ROMSUBDIR_RULES=$(patsubst %,romsubdir-%, $(ROMS))
 romsubdir-%:
@ -112,59 +145,45 @@ ALL_SUBDIRS=$(TARGET_DIRS) $(patsubst %,pc-bios/%, $(ROMS))

 recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES)

-audio/audio.o audio/fmodaudio.o: QEMU_CFLAGS += $(FMOD_CFLAGS)
-
-QEMU_CFLAGS+=$(CURL_CFLAGS)
-
-QEMU_CFLAGS += -I$(SRC_PATH)/include
-
-ui/cocoa.o: ui/cocoa.m
-
-ui/sdl.o audio/sdlaudio.o ui/sdl_zoom.o hw/baum.o: QEMU_CFLAGS += $(SDL_CFLAGS)
-
-ui/vnc.o: QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
-
 bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS)

 version.o: $(SRC_PATH)/version.rc config-host.h
 	$(call quiet-command,$(WINDRES) -I. -o $@ $<,"  RC    $(TARGET_DIR)$@")

 version-obj-$(CONFIG_WIN32) += version.o
+
+######################################################################
+# Build library with stubs
+
+libqemustub.a: $(stub-obj-y)
+
 ######################################################################
 # Support building shared library libcacard

 .PHONY: libcacard.la install-libcacard
-ifeq ($(LIBTOOL),)
-libcacard.la:
-	@echo "libtool is missing, please install and rerun configure"; exit 1
-
-install-libcacard:
-	@echo "libtool is missing, please install and rerun configure"; exit 1
-else
-libcacard.la: $(oslib-obj-y) qemu-timer-common.o $(addsuffix .lo, $(basename $(trace-obj-y)))
+libcacard.la: $(oslib-obj-y) qemu-timer-common.o $(trace-obj-y)
 	$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C libcacard V="$(V)" TARGET_DIR="$*/" libcacard.la,)

 install-libcacard: libcacard.la
 	$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C libcacard V="$(V)" TARGET_DIR="$*/" install-libcacard,)
-endif

 ######################################################################

 qemu-img.o: qemu-img-cmds.h

 tools-obj-y = $(oslib-obj-y) $(trace-obj-y) qemu-tool.o qemu-timer.o \
-	qemu-timer-common.o main-loop.o notify.o \
-	iohandler.o cutils.o iov.o async.o
+	main-loop.o iohandler.o error.o
 tools-obj-$(CONFIG_POSIX) += compatfd.o

-qemu-img$(EXESUF): qemu-img.o $(tools-obj-y) $(block-obj-y)
-qemu-nbd$(EXESUF): qemu-nbd.o $(tools-obj-y) $(block-obj-y)
-qemu-io$(EXESUF): qemu-io.o cmd.o $(tools-obj-y) $(block-obj-y)
+qemu-img$(EXESUF): qemu-img.o $(tools-obj-y) $(block-obj-y) libqemustub.a
+qemu-nbd$(EXESUF): qemu-nbd.o $(tools-obj-y) $(block-obj-y) libqemustub.a
+qemu-io$(EXESUF): qemu-io.o cmd.o $(tools-obj-y) $(block-obj-y) libqemustub.a

 qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o

-vscclient$(EXESUF): $(libcacard-y) $(oslib-obj-y) $(trace-obj-y) $(tools-obj-y) qemu-timer-common.o libcacard/vscclient.o
-	$(call quiet-command,$(CC) $(LDFLAGS) -o $@ $^ $(libcacard_libs) $(LIBS),"  LINK  $@")
+vscclient$(EXESUF): LIBS += $(libcacard_libs)
+vscclient$(EXESUF): $(libcacard-y) $(oslib-obj-y) $(trace-obj-y) libcacard/vscclient.o libqemustub.a
+	$(call LINK, $^)

 fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/virtio-9p-marshal.o oslib-posix.o $(trace-obj-y)
 fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap
@ -184,13 +203,13 @@ endif
 qapi-py = $(SRC_PATH)/scripts/qapi.py $(SRC_PATH)/scripts/ordereddict.py

 qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h :\
-$(SRC_PATH)/qapi-schema-guest.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
+$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, "  GEN   $@")
 qga/qapi-generated/qga-qapi-visit.c qga/qapi-generated/qga-qapi-visit.h :\
-$(SRC_PATH)/qapi-schema-guest.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
+$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, "  GEN   $@")
 qga/qapi-generated/qga-qmp-commands.h qga/qapi-generated/qga-qmp-marshal.c :\
-$(SRC_PATH)/qapi-schema-guest.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
+$(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 	$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, "  GEN   $@")

 qapi-types.c qapi-types.h :\
@ -206,27 +225,26 @@ $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py)
 QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h)
 $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)

-qemu-ga$(EXESUF): qemu-ga.o $(qga-obj-y) $(tools-obj-y) $(qapi-obj-y) $(qobject-obj-y) $(version-obj-y)
-
-QEMULIBS=libhw32 libhw64 libuser libdis libdis-user
+qemu-ga$(EXESUF): $(qga-obj-y) $(oslib-obj-y) $(trace-obj-y) $(qapi-obj-y) $(qobject-obj-y) $(version-obj-y) libqemustub.a
+	$(call LINK, $^)

 clean:
 # avoid old build problems by removing potentially incorrect old files
 	rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h
 	rm -f qemu-options.def
-	find . -name '*.[od]' -exec rm -f {} +
+	find . -name '*.[od]' -type f -exec rm -f {} +
 	rm -f *.a *.lo $(TOOLS) $(HELPERS-y) qemu-ga TAGS cscope.* *.pod *~ */*~
 	rm -Rf .libs
 	rm -f qemu-img-cmds.h
-	rm -f trace-dtrace.dtrace trace-dtrace.dtrace-timestamp
 	@# May not be present in GENERATED_HEADERS
-	rm -f trace-dtrace.h trace-dtrace.h-timestamp
+	rm -f trace/generated-tracers-dtrace.dtrace*
+	rm -f trace/generated-tracers-dtrace.h*
 	rm -f $(foreach f,$(GENERATED_HEADERS),$(f) $(f)-timestamp)
 	rm -f $(foreach f,$(GENERATED_SOURCES),$(f) $(f)-timestamp)
 	rm -rf qapi-generated
 	rm -rf qga/qapi-generated
 	$(MAKE) -C tests/tcg clean
-	for d in $(ALL_SUBDIRS) $(QEMULIBS) libcacard; do \
+	for d in $(ALL_SUBDIRS) libcacard; do \
 	if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
 	rm -f $$d/qemu-options.def; \
        done
@ -240,7 +258,7 @@ qemu-%.tar.bz2:

 distclean: clean
 	rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi
-	rm -f config-all-devices.mak
+	rm -f config-all-devices.mak config-all-disas.mak
 	rm -f roms/seabios/config.mak roms/vgabios/config.mak
 	rm -f qemu-doc.info qemu-doc.aux qemu-doc.cp qemu-doc.cps qemu-doc.dvi
 	rm -f qemu-doc.fn qemu-doc.fns qemu-doc.info qemu-doc.ky qemu-doc.kys
@ -249,9 +267,10 @@ distclean: clean
 	rm -f config.log
 	rm -f linux-headers/asm
 	rm -f qemu-tech.info qemu-tech.aux qemu-tech.cp qemu-tech.dvi qemu-tech.fn qemu-tech.info qemu-tech.ky qemu-tech.log qemu-tech.pdf qemu-tech.pg qemu-tech.toc qemu-tech.tp qemu-tech.vr
-	for d in $(TARGET_DIRS) $(QEMULIBS); do \
+	for d in $(TARGET_DIRS); do \
 	rm -rf $$d || exit 1 ; \
        done
+	if test -f pixman/config.log; then make -C pixman distclean; fi

 KEYMAPS=da     en-gb  et  fr     fr-ch  is  lt  modifiers  no  pt-br  sv \
 ar      de     en-us  fi  fr-be  hr     it  lv  nl         pl  ru     th \
@ -297,7 +316,6 @@ install-confdir:

 install-sysconfig: install-datadir install-confdir
 	$(INSTALL_DATA) $(SRC_PATH)/sysconfigs/target/target-x86_64.conf "$(DESTDIR)$(qemu_confdir)"
-	$(INSTALL_DATA) $(SRC_PATH)/sysconfigs/target/cpus-x86_64.conf "$(DESTDIR)$(qemu_datadir)"

 install: all $(if $(BUILD_DOCS),install-doc) install-sysconfig install-datadir
 	$(INSTALL_DIR) "$(DESTDIR)$(bindir)"
@ -398,7 +416,9 @@ qemu-doc.dvi qemu-doc.html qemu-doc.info qemu-doc.pdf: \

 # Add a dependency on the generated files, so that they are always
 # rebuilt before other object files
+ifneq ($(filter-out %clean,$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
 Makefile: $(GENERATED_HEADERS)
+endif

 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
--- a/Makefile.dis
+++ b/Makefile.dis
@ -1,20 +0,0 @@
-# Makefile for disassemblers.
-
-include ../config-host.mak
-include config.mak
-include $(SRC_PATH)/rules.mak
-
-.PHONY: all
-
-$(call set-vpath, $(SRC_PATH))
-
-QEMU_CFLAGS+=-I..
-
-include $(SRC_PATH)/Makefile.objs
-
-all: $(libdis-y)
-# Dummy command so that make thinks it has done something
-	@true
-
-clean:
-	rm -f *.o *.d *.a *~
--- a/Makefile.hw
+++ b/Makefile.hw
@ -1,23 +0,0 @@
-# Makefile for qemu target independent devices.
-
-include ../config-host.mak
-include ../config-all-devices.mak
-include config.mak
-include $(SRC_PATH)/rules.mak
-
-.PHONY: all
-
-$(call set-vpath, $(SRC_PATH))
-
-QEMU_CFLAGS+=-I..
-QEMU_CFLAGS += -I$(SRC_PATH)/include
-
-include $(SRC_PATH)/Makefile.objs
-
-all: $(hw-obj-y)
-# Dummy command so that make thinks it has done something
-	@true
-
-clean:
-	rm -f $(addsuffix *.o, $(sort $(dir $(hw-obj-y))))
-	rm -f $(addsuffix *.d, $(sort $(dir $(hw-obj-y))))
--- a/Makefile.objs
+++ b/Makefile.objs
@ -1,3 +1,7 @@
+#######################################################################
+# Stub library, linked in tools
+stub-obj-y = stubs/
+
 #######################################################################
 # Target-independent parts used in system and user emulation
 universal-obj-y =
@ -17,9 +21,16 @@ qom-obj-y = qom/

 universal-obj-y += $(qom-obj-y)

+#######################################################################
+# Core hw code (qdev core)
+hw-core-obj-y += hw/
+hw-core-obj-y += qemu-option.o
+
+universal-obj-y += $(hw-core-obj-y)
+
 #######################################################################
 # oslib-obj-y is code depending on the OS (win32 vs posix)
-oslib-obj-y = osdep.o
+oslib-obj-y = osdep.o cutils.o qemu-timer-common.o
 oslib-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o
 oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o

@ -27,6 +38,8 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
 # coroutines
 coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 coroutine-obj-y += qemu-coroutine-sleep.o
+
+# If you change this logic, please also check tests/Makefile
 ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
 coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
 else
@ -41,12 +54,14 @@ coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img

-block-obj-y = cutils.o iov.o cache-utils.o qemu-option.o module.o async.o
-block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o
+block-obj-y = iov.o cache-utils.o qemu-option.o module.o async.o
+block-obj-y += nbd.o block.o blockjob.o aes.o qemu-config.o
+block-obj-y += thread-pool.o qemu-progress.o qemu-sockets.o uri.o notify.o
 block-obj-y += $(coroutine-obj-y) $(qobject-obj-y) $(version-obj-y)
-block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
-block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+block-obj-$(CONFIG_POSIX) += event_notifier-posix.o aio-posix.o
+block-obj-$(CONFIG_WIN32) += event_notifier-win32.o aio-win32.o
 block-obj-y += block/
+block-obj-y += $(qapi-obj-y) qapi-types.o qapi-visit.o

 ifeq ($(CONFIG_VIRTIO)$(CONFIG_VIRTFS)$(CONFIG_PCI),yyy)
 # Lots of the fsdev/9pcode is pulled in by vl.c via qemu_fsdev_add.
@ -59,10 +74,10 @@ endif
 # suppress *all* target specific code in case of system emulation, i.e. a
 # single QEMU executable should support all CPUs and machines.

-common-obj-y = $(block-obj-y) blockdev.o
-common-obj-y += net.o net/
+common-obj-y = $(block-obj-y) blockdev.o blockdev-nbd.o block/
+common-obj-y += net/
 common-obj-y += qom/
-common-obj-y += readline.o console.o cursor.o
+common-obj-y += readline.o
 common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
@ -71,11 +86,10 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 extra-obj-$(CONFIG_LINUX) += fsdev/

 common-obj-y += tcg-runtime.o host-utils.o main-loop.o
-common-obj-y += input.o
-common-obj-y += buffered_file.o migration.o migration-tcp.o
+common-obj-y += migration.o migration-tcp.o
+common-obj-y += migration.o migration-tcp.o
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += block-migration.o iohandler.o
-common-obj-y += pflib.o
 common-obj-y += bitmap.o bitops.o
 common-obj-y += page_cache.o

@ -86,116 +100,51 @@ common-obj-$(CONFIG_SPICE) += spice-qemu-char.o

 common-obj-y += audio/
 common-obj-y += hw/
+extra-obj-y += hw/
+
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o

-common-obj-y += iov.o acl.o
+common-obj-y += dma-helpers.o
+common-obj-y += acl.o
 common-obj-$(CONFIG_POSIX) += compatfd.o
-common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o qemu-timer-common.o
+common-obj-y += qtest.o
+common-obj-y += vl.o

 common-obj-$(CONFIG_SLIRP) += slirp/

+common-obj-y += backends/
+
+######################################################################
+# libseccomp
+ifeq ($(CONFIG_SECCOMP),y)
+common-obj-y += qemu-seccomp.o
+endif
+
 ######################################################################
 # libuser

 user-obj-y =
 user-obj-y += envlist.o path.o
 user-obj-y += tcg-runtime.o host-utils.o
-user-obj-y += cutils.o iov.o cache-utils.o
+user-obj-y += cache-utils.o
 user-obj-y += module.o
 user-obj-y += qemu-user.o
-user-obj-y += $(trace-obj-y)
 user-obj-y += qom/

 ######################################################################
-# libhw
-
-hw-obj-y = vl.o dma-helpers.o qtest.o hw/
-
-######################################################################
-# libdis
+# disassemblers
 # NOTE: the disassembler code is only needed for debugging

-libdis-y =
-libdis-$(CONFIG_ALPHA_DIS) += alpha-dis.o
-libdis-$(CONFIG_ARM_DIS) += arm-dis.o
-libdis-$(CONFIG_CRIS_DIS) += cris-dis.o
-libdis-$(CONFIG_HPPA_DIS) += hppa-dis.o
-libdis-$(CONFIG_I386_DIS) += i386-dis.o
-libdis-$(CONFIG_IA64_DIS) += ia64-dis.o
-libdis-$(CONFIG_M68K_DIS) += m68k-dis.o
-libdis-$(CONFIG_MICROBLAZE_DIS) += microblaze-dis.o
-libdis-$(CONFIG_MIPS_DIS) += mips-dis.o
-libdis-$(CONFIG_PPC_DIS) += ppc-dis.o
-libdis-$(CONFIG_S390_DIS) += s390-dis.o
-libdis-$(CONFIG_SH4_DIS) += sh4-dis.o
-libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o
-libdis-$(CONFIG_LM32_DIS) += lm32-dis.o
+universal-obj-y += disas/

 ######################################################################
 # trace

-ifeq ($(TRACE_BACKEND),dtrace)
-TRACE_H_EXTRA_DEPS=trace-dtrace.h
-endif
-trace.h: trace.h-timestamp $(TRACE_H_EXTRA_DEPS)
-trace.h-timestamp: $(SRC_PATH)/trace-events $(BUILD_DIR)/config-host.mak
-	$(call quiet-command,$(TRACETOOL) \
-		--format=h \
-		--backend=$(TRACE_BACKEND) \
-		< $< > $@,"  GEN   trace.h")
-	@cmp -s $@ trace.h || cp $@ trace.h
+trace-obj-y += trace/

-trace.c: trace.c-timestamp
-trace.c-timestamp: $(SRC_PATH)/trace-events $(BUILD_DIR)/config-host.mak
-	$(call quiet-command,$(TRACETOOL) \
-		--format=c \
-		--backend=$(TRACE_BACKEND) \
-		< $< > $@,"  GEN   trace.c")
-	@cmp -s $@ trace.c || cp $@ trace.c
-
-trace.o: trace.c $(GENERATED_HEADERS)
-
-trace-dtrace.h: trace-dtrace.dtrace
-	$(call quiet-command,dtrace -o $@ -h -s $<, "  GEN   trace-dtrace.h")
-
-# Normal practice is to name DTrace probe file with a '.d' extension
-# but that gets picked up by QEMU's Makefile as an external dependency
-# rule file. So we use '.dtrace' instead
-trace-dtrace.dtrace: trace-dtrace.dtrace-timestamp
-trace-dtrace.dtrace-timestamp: $(SRC_PATH)/trace-events $(BUILD_DIR)/config-host.mak
-	$(call quiet-command,$(TRACETOOL) \
-		--format=d \
-		--backend=$(TRACE_BACKEND) \
-		< $< > $@,"  GEN   trace-dtrace.dtrace")
-	@cmp -s $@ trace-dtrace.dtrace || cp $@ trace-dtrace.dtrace
-
-trace-dtrace.o: trace-dtrace.dtrace $(GENERATED_HEADERS)
-	$(call quiet-command,dtrace -o $@ -G -s $<, "  GEN   trace-dtrace.o")
-
-ifeq ($(LIBTOOL),)
-trace-dtrace.lo: trace-dtrace.dtrace
-	@echo "missing libtool. please install and rerun configure."; exit 1
-else
-trace-dtrace.lo: trace-dtrace.dtrace
-	$(call quiet-command,$(LIBTOOL) --mode=compile --tag=CC dtrace -o $@ -G -s $<, "  lt GEN trace-dtrace.o")
-endif
-
-trace/simple.o: trace/simple.c $(GENERATED_HEADERS)
-
-trace-obj-$(CONFIG_TRACE_DTRACE) += trace-dtrace.o
-ifneq ($(TRACE_BACKEND),dtrace)
-trace-obj-y = trace.o
-endif
-
-trace-obj-$(CONFIG_TRACE_DEFAULT) += trace/default.o
-trace-obj-$(CONFIG_TRACE_SIMPLE) += trace/simple.o
-trace-obj-$(CONFIG_TRACE_SIMPLE) += qemu-timer-common.o
-trace-obj-$(CONFIG_TRACE_STDERR) += trace/stderr.o
-trace-obj-y += trace/control.o
-
-$(trace-obj-y): $(GENERATED_HEADERS)
+universal-obj-y += $(trace-obj-y)

 ######################################################################
 # smartcard
@ -222,9 +171,8 @@ universal-obj-y += $(qapi-obj-y)
 ######################################################################
 # guest agent

-qga-obj-y = qga/ qemu-ga.o module.o
-qga-obj-$(CONFIG_WIN32) += oslib-win32.o
-qga-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-sockets.o qemu-option.o
+qga-obj-y = qga/ module.o qemu-tool.o
+qga-obj-$(CONFIG_POSIX) += qemu-sockets.o qemu-option.o

 vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)

@ -233,12 +181,15 @@ vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
 QEMU_CFLAGS+=$(GLIB_CFLAGS)

 nested-vars += \
-	hw-obj-y \
+	stub-obj-y \
 	qga-obj-y \
-	block-obj-y \
 	qom-obj-y \
 	qapi-obj-y \
+	block-obj-y \
 	user-obj-y \
 	common-obj-y \
-	extra-obj-y
+	universal-obj-y \
+	hw-core-obj-y \
+	extra-obj-y \
+	trace-obj-y
 dummy := $(call unnest-vars)
--- a/Makefile.target
+++ b/Makefile.target
@ -4,9 +4,6 @@ include ../config-host.mak
 include config-devices.mak
 include config-target.mak
 include $(SRC_PATH)/rules.mak
-ifneq ($(HWDIR),)
-include $(HWDIR)/config.mak
-endif

 $(call set-vpath, $(SRC_PATH))
 ifdef CONFIG_LINUX
@ -72,22 +69,12 @@ all: $(PROGS) stap
 obj-y = exec.o translate-all.o cpu-exec.o
 obj-y += tcg/tcg.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tci.o
+obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
-obj-y += disas.o
-obj-$(CONFIG_TCI_DIS) += tci-dis.o
 obj-y += target-$(TARGET_BASE_ARCH)/
+obj-y += disas.o
 obj-$(CONFIG_GDBSTUB_XML) += gdbstub-xml.o

-tci-dis.o: QEMU_CFLAGS += -I$(SRC_PATH)/tcg -I$(SRC_PATH)/tcg/tci
-
-# HELPER_CFLAGS is used for all the legacy code compiled with static register
-# variables
-user-exec.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
-
-# Note: this is a workaround. The real fix is to avoid compiling
-# cpu_signal_handler() in user-exec.c.
-%/signal.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
-
 #########################################################
 # Linux user emulator target

@ -132,11 +119,6 @@ obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
 obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
 LIBS+=-lz

-QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
-QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
-QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
-QEMU_CFLAGS += $(VNC_PNG_CFLAGS)
-
 # xen support
 obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o
 obj-$(CONFIG_NO_XEN) += xen-stub.o
@ -154,6 +136,9 @@ GENERATED_HEADERS += hmp-commands.h qmp-commands-old.h

 endif # CONFIG_SOFTMMU

+# Workaround for http://gcc.gnu.org/PR55489, see configure.
+%/translate.o: QEMU_CFLAGS += $(TRANSLATE_OPT_CFLAGS)
+
 nested-vars += obj-y

 # This resolves all nested paths, so it must come last
@ -164,22 +149,18 @@ all-obj-y += $(addprefix ../, $(universal-obj-y))

 ifdef CONFIG_SOFTMMU
 all-obj-y += $(addprefix ../, $(common-obj-y))
-all-obj-y += $(addprefix ../libdis/, $(libdis-y))
-all-obj-y += $(addprefix $(HWDIR)/, $(hw-obj-y))
-all-obj-y += $(addprefix ../, $(trace-obj-y))
 else
-all-obj-y += $(addprefix ../libuser/, $(user-obj-y))
-all-obj-y += $(addprefix ../libdis-user/, $(libdis-y))
+all-obj-y += $(addprefix ../, $(user-obj-y))
 endif #CONFIG_LINUX_USER

 ifdef QEMU_PROGW
 # The linker builds a windows executable. Make also a console executable.
-$(QEMU_PROGW): $(all-obj-y)
+$(QEMU_PROGW): $(all-obj-y) ../libqemustub.a
 	$(call LINK,$^)
 $(QEMU_PROG): $(QEMU_PROGW)
 	$(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG),"  GEN   $(TARGET_DIR)$(QEMU_PROG)")
 else
-$(QEMU_PROG): $(all-obj-y)
+$(QEMU_PROG): $(all-obj-y) ../libqemustub.a
 	$(call LINK,$^)
 endif

--- a/Makefile.user
+++ b/Makefile.user
@ -1,24 +0,0 @@
-# Makefile for qemu target independent user files.
-
-include ../config-host.mak
-include $(SRC_PATH)/rules.mak
-include config.mak
-
-.PHONY: all
-
-$(call set-vpath, $(SRC_PATH))
-
-QEMU_CFLAGS+=-I..
-QEMU_CFLAGS += -I$(SRC_PATH)/include
-QEMU_CFLAGS += -DCONFIG_USER_ONLY
-
-include $(SRC_PATH)/Makefile.objs
-
-all: $(user-obj-y)
-# Dummy command so that make thinks it has done something
-	@true
-
-clean:
-	for d in . trace; do \
-	rm -f $$d/*.o $$d/*.d $$d/*.a $$d/*~; \
-	done
--- a/QMP/qemu-ga-client
+++ b/QMP/qemu-ga-client
@ -0,0 +1,299 @@
+#!/usr/bin/python
+
+# QEMU Guest Agent Client
+#
+# Copyright (C) 2012 Ryota Ozaki <ozaki.ryota@gmail.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2.  See
+# the COPYING file in the top-level directory.
+#
+# Usage:
+#
+# Start QEMU with:
+#
+# # qemu [...] -chardev socket,path=/tmp/qga.sock,server,nowait,id=qga0 \
+#   -device virtio-serial -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0
+#
+# Run the script:
+#
+# $ qemu-ga-client --address=/tmp/qga.sock <command> [args...]
+#
+# or
+#
+# $ export QGA_CLIENT_ADDRESS=/tmp/qga.sock
+# $ qemu-ga-client <command> [args...]
+#
+# For example:
+#
+# $ qemu-ga-client cat /etc/resolv.conf
+# # Generated by NetworkManager
+# nameserver 10.0.2.3
+# $ qemu-ga-client fsfreeze status
+# thawed
+# $ qemu-ga-client fsfreeze freeze
+# 2 filesystems frozen
+#
+# See also: http://wiki.qemu.org/Features/QAPI/GuestAgent
+#
+
+import base64
+import random
+
+import qmp
+
+
+class QemuGuestAgent(qmp.QEMUMonitorProtocol):
+    def __getattr__(self, name):
+        def wrapper(**kwds):
+            return self.command('guest-' + name.replace('_', '-'), **kwds)
+        return wrapper
+
+
+class QemuGuestAgentClient:
+    error = QemuGuestAgent.error
+
+    def __init__(self, address):
+        self.qga = QemuGuestAgent(address)
+        self.qga.connect(negotiate=False)
+
+    def sync(self, timeout=3):
+        # Avoid being blocked forever
+        if not self.ping(timeout):
+            raise EnvironmentError('Agent seems not alive')
+        uid = random.randint(0, (1 << 32) - 1)
+        while True:
+            ret = self.qga.sync(id=uid)
+            if isinstance(ret, int) and int(ret) == uid:
+                break
+
+    def __file_read_all(self, handle):
+        eof = False
+        data = ''
+        while not eof:
+            ret = self.qga.file_read(handle=handle, count=1024)
+            _data = base64.b64decode(ret['buf-b64'])
+            data += _data
+            eof = ret['eof']
+        return data
+
+    def read(self, path):
+        handle = self.qga.file_open(path=path)
+        try:
+            data = self.__file_read_all(handle)
+        finally:
+            self.qga.file_close(handle=handle)
+        return data
+
+    def info(self):
+        info = self.qga.info()
+
+        msgs = []
+        msgs.append('version: ' + info['version'])
+        msgs.append('supported_commands:')
+        enabled = [c['name'] for c in info['supported_commands'] if c['enabled']]
+        msgs.append('\tenabled: ' + ', '.join(enabled))
+        disabled = [c['name'] for c in info['supported_commands'] if not c['enabled']]
+        msgs.append('\tdisabled: ' + ', '.join(disabled))
+
+        return '\n'.join(msgs)
+
+    def __gen_ipv4_netmask(self, prefixlen):
+        mask = int('1' * prefixlen + '0' * (32 - prefixlen), 2)
+        return '.'.join([str(mask >> 24),
+                         str((mask >> 16) & 0xff),
+                         str((mask >> 8) & 0xff),
+                         str(mask & 0xff)])
+
+    def ifconfig(self):
+        nifs = self.qga.network_get_interfaces()
+
+        msgs = []
+        for nif in nifs:
+            msgs.append(nif['name'] + ':')
+            if 'ip-addresses' in nif:
+                for ipaddr in nif['ip-addresses']:
+                    if ipaddr['ip-address-type'] == 'ipv4':
+                        addr = ipaddr['ip-address']
+                        mask = self.__gen_ipv4_netmask(int(ipaddr['prefix']))
+                        msgs.append("\tinet %s  netmask %s" % (addr, mask))
+                    elif ipaddr['ip-address-type'] == 'ipv6':
+                        addr = ipaddr['ip-address']
+                        prefix = ipaddr['prefix']
+                        msgs.append("\tinet6 %s  prefixlen %s" % (addr, prefix))
+            if nif['hardware-address'] != '00:00:00:00:00:00':
+                msgs.append("\tether " + nif['hardware-address'])
+
+        return '\n'.join(msgs)
+
+    def ping(self, timeout):
+        self.qga.settimeout(timeout)
+        try:
+            self.qga.ping()
+        except self.qga.timeout:
+            return False
+        return True
+
+    def fsfreeze(self, cmd):
+        if cmd not in ['status', 'freeze', 'thaw']:
+            raise StandardError('Invalid command: ' + cmd)
+
+        return getattr(self.qga, 'fsfreeze' + '_' + cmd)()
+
+    def fstrim(self, minimum=0):
+        return getattr(self.qga, 'fstrim')(minimum=minimum)
+
+    def suspend(self, mode):
+        if mode not in ['disk', 'ram', 'hybrid']:
+            raise StandardError('Invalid mode: ' + mode)
+
+        try:
+            getattr(self.qga, 'suspend' + '_' + mode)()
+            # On error an exception is raised and propagates to the caller
+        except self.qga.timeout:
+            # On success the command is expected to time out
+            return
+
+    def shutdown(self, mode='powerdown'):
+        if mode not in ['powerdown', 'halt', 'reboot']:
+            raise StandardError('Invalid mode: ' + mode)
+
+        try:
+            self.qga.shutdown(mode=mode)
+        except self.qga.timeout:
+            return
+
+
+def _cmd_cat(client, args):
+    if len(args) != 1:
+        print('Invalid argument')
+        print('Usage: cat <file>')
+        sys.exit(1)
+    print(client.read(args[0]))
+
+
+def _cmd_fsfreeze(client, args):
+    usage = 'Usage: fsfreeze status|freeze|thaw'
+    if len(args) != 1:
+        print('Invalid argument')
+        print(usage)
+        sys.exit(1)
+    if args[0] not in ['status', 'freeze', 'thaw']:
+        print('Invalid command: ' + args[0])
+        print(usage)
+        sys.exit(1)
+    cmd = args[0]
+    ret = client.fsfreeze(cmd)
+    if cmd == 'status':
+        print(ret)
+    elif cmd == 'freeze':
+        print("%d filesystems frozen" % ret)
+    else:
+        print("%d filesystems thawed" % ret)
+
+
+def _cmd_fstrim(client, args):
+    if len(args) == 0:
+        minimum = 0
+    else:
+        minimum = int(args[0])
+    print(client.fstrim(minimum))
+
+
+def _cmd_ifconfig(client, args):
+    print(client.ifconfig())
+
+
+def _cmd_info(client, args):
+    print(client.info())
+
+
+def _cmd_ping(client, args):  # ping [timeout-sec]; exits 1 if no reply
+    if len(args) == 0:
+        timeout = 3  # default when no timeout argument is given
+    else:
+        timeout = float(args[0])
+    alive = client.ping(timeout)
+    if not alive:
+        print("Not responded in %s sec" % timeout)  # args may be empty here
+        sys.exit(1)
+
+
+def _cmd_suspend(client, args):  # suspend disk|ram|hybrid
+    usage = 'Usage: suspend disk|ram|hybrid'
+    if len(args) != 1:  # too few *or* too many arguments
+        print('Invalid argument')
+        print(usage)
+        sys.exit(1)
+    if args[0] not in ['disk', 'ram', 'hybrid']:
+        print('Invalid command: ' + args[0])
+        print(usage)
+        sys.exit(1)
+    client.suspend(args[0])
+
+
+def _cmd_shutdown(client, args):
+    client.shutdown()
+_cmd_powerdown = _cmd_shutdown
+
+
+def _cmd_halt(client, args):
+    client.shutdown('halt')
+
+
+def _cmd_reboot(client, args):
+    client.shutdown('reboot')
+
+
+commands = [m.replace('_cmd_', '') for m in dir() if '_cmd_' in m]
+
+
+def main(address, cmd, args):
+    if not os.path.exists(address):
+        print('%s not found' % address)
+        sys.exit(1)
+
+    if cmd not in commands:
+        print('Invalid command: ' + cmd)
+        print('Available commands: ' + ', '.join(commands))
+        sys.exit(1)
+
+    try:
+        client = QemuGuestAgentClient(address)
+    except QemuGuestAgent.error, e:
+        import errno
+
+        print(e)
+        if e.errno == errno.ECONNREFUSED:
+            print('Hint: qemu is not running?')
+        sys.exit(1)
+
+    if cmd != 'ping':
+        client.sync()
+
+    globals()['_cmd_' + cmd](client, args)
+
+
+if __name__ == '__main__':
+    import sys
+    import os
+    import optparse
+
+    address = os.environ['QGA_CLIENT_ADDRESS'] if 'QGA_CLIENT_ADDRESS' in os.environ else None
+
+    usage = "%prog [--address=<unix_path>|<ipv4_address>] <command> [args...]\n"
+    usage += '<command>: ' + ', '.join(commands)
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option('--address', action='store', type='string',
+                      default=address, help='Specify a ip:port pair or a unix socket path')
+    options, args = parser.parse_args()
+
+    address = options.address
+    if address is None:
+        parser.error('address is not specified')
+        sys.exit(1)
+
+    if len(args) == 0:
+        parser.error('Less argument')
+        sys.exit(1)
+
+    main(address, args[0], args[1:])
--- a/QMP/qmp-events.txt
+++ b/QMP/qmp-events.txt
@ -50,7 +50,8 @@ Emitted when a block job has been cancelled.

 Data:

- "type":     Job type ("stream" for image streaming, json-string)
+- "type":     Job type (json-string; "stream" for image streaming,
+                                     "commit" for block commit)
 - "device":   Device name (json-string)
 - "len":      Maximum progress value (json-int)
 - "offset":   Current progress value (json-int)
@ -73,7 +74,8 @@ Emitted when a block job has completed.

 Data:

- "type":     Job type ("stream" for image streaming, json-string)
+- "type":     Job type (json-string; "stream" for image streaming,
+                                     "commit" for block commit)
 - "device":   Device name (json-string)
 - "len":      Maximum progress value (json-int)
 - "offset":   Current progress value (json-int)
@ -94,6 +96,46 @@ Example:
               "speed": 0 },
     "timestamp": { "seconds": 1267061043, "microseconds": 959568 } }

+BLOCK_JOB_ERROR
+---------------
+
+Emitted when a block job encounters an error.
+
+Data:
+
+- "device": device name (json-string)
+- "operation": I/O operation (json-string, "read" or "write")
+- "action": action that has been taken, it's one of the following (json-string):
+    "ignore": error has been ignored, the job may fail later
+    "report": error will be reported and the job canceled
+    "stop": error caused job to be paused
+
+Example:
+
+{ "event": "BLOCK_JOB_ERROR",
+    "data": { "device": "ide0-hd1",
+              "operation": "write",
+              "action": "stop" },
+    "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
+
+BLOCK_JOB_READY
+---------------
+
+Emitted when a block job is ready to complete.
+
+Data:
+
+- "device": device name (json-string)
+
+Example:
+
+{ "event": "BLOCK_JOB_READY",
+    "data": { "device": "ide0-hd1" },
+    "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
+
+Note: The "ready to complete" status is always reset by a BLOCK_JOB_ERROR
+event.
+
 DEVICE_TRAY_MOVED
 -----------------

--- a/QMP/qmp-shell
+++ b/QMP/qmp-shell
@ -33,6 +33,7 @@
 import qmp
 import readline
 import sys
+import pprint

 class QMPCompleter(list):
    def complete(self, text, state):
@ -52,10 +53,11 @@ class QMPShellBadPort(QMPShellError):
 # TODO: QMPShell's interface is a bit ugly (eg. _fill_completion() and
 #       _execute_cmd()). Let's design a better one.
 class QMPShell(qmp.QEMUMonitorProtocol):
-    def __init__(self, address):
+    def __init__(self, address, pp=None):
        qmp.QEMUMonitorProtocol.__init__(self, self.__get_address(address))
        self._greeting = None
        self._completer = None
+        self._pp = pp

    def __get_address(self, arg):
        """
@ -114,7 +116,11 @@ class QMPShell(qmp.QEMUMonitorProtocol):
        if resp is None:
            print 'Disconnected'
            return False
-        print resp
+
+        if self._pp is not None:
+            self._pp.pprint(resp)
+        else:
+            print resp
        return True

    def connect(self):
@ -222,22 +228,36 @@ def die(msg):
 def fail_cmdline(option=None):
    if option:
        sys.stderr.write('ERROR: bad command-line option \'%s\'\n' % option)
-    sys.stderr.write('qemu-shell [ -H ] < UNIX socket path> | < TCP address:port >\n')
+    sys.stderr.write('qmp-shell [ -p ] [ -H ] < UNIX socket path> | < TCP address:port >\n')
    sys.exit(1)

 def main():
    addr = ''
+    qemu = None
+    hmp = False
+    pp = None
+
    try:
-        if len(sys.argv) == 2:
-            qemu = QMPShell(sys.argv[1])
-            addr = sys.argv[1]
-        elif len(sys.argv) == 3:
-            if sys.argv[1] != '-H':
-                fail_cmdline(sys.argv[1])
-            qemu = HMPShell(sys.argv[2])
-            addr = sys.argv[2]
-        else:
-                fail_cmdline()
+        for arg in sys.argv[1:]:
+            if arg == "-H":
+                if qemu is not None:
+                    fail_cmdline(arg)
+                hmp = True
+            elif arg == "-p":
+                if pp is not None:
+                    fail_cmdline(arg)
+                pp = pprint.PrettyPrinter(indent=4)
+            else:
+                if qemu is not None:
+                    fail_cmdline(arg)
+                if hmp:
+                    qemu = HMPShell(arg)
+                else:
+                    qemu = QMPShell(arg, pp)
+                addr = arg
+
+        if qemu is None:
+            fail_cmdline()
    except QMPShellBadPort:
        die('bad port number in command-line')

--- a/QMP/qmp.py
+++ b/QMP/qmp.py
@ -49,7 +49,6 @@ class QEMUMonitorProtocol:
        return socket.socket(family, socket.SOCK_STREAM)

    def __negotiate_capabilities(self):
-        self.__sockfile = self.__sock.makefile()
        greeting = self.__json_read()
        if greeting is None or not greeting.has_key('QMP'):
            raise QMPConnectError
@ -73,7 +72,7 @@ class QEMUMonitorProtocol:

    error = socket.error

-    def connect(self):
+    def connect(self, negotiate=True):
        """
        Connect to the QMP Monitor and perform capabilities negotiation.

@ -83,7 +82,9 @@ class QEMUMonitorProtocol:
        @raise QMPCapabilitiesError if fails to negotiate capabilities
        """
        self.__sock.connect(self.__address)
-        return self.__negotiate_capabilities()
+        self.__sockfile = self.__sock.makefile()
+        if negotiate:
+            return self.__negotiate_capabilities()

    def accept(self):
        """
@ -95,6 +96,7 @@ class QEMUMonitorProtocol:
        @raise QMPCapabilitiesError if fails to negotiate capabilities
        """
        self.__sock, _ = self.__sock.accept()
+        self.__sockfile = self.__sock.makefile()
        return self.__negotiate_capabilities()

    def cmd_obj(self, qmp_cmd):
@ -134,6 +136,26 @@ class QEMUMonitorProtocol:
            raise Exception(ret['error']['desc'])
        return ret['return']

+    def pull_event(self, wait=False):
+        """
+        Get and delete the first available QMP event (None if there is none).
+
+        @param wait: block until an event is available (bool)
+        """
+        self.__sock.setblocking(0)
+        try:
+            self.__json_read()
+        except socket.error, err:
+            if err[0] == errno.EAGAIN:
+                # No data available
+                pass
+        self.__sock.setblocking(1)
+        if not self.__events and wait:
+            self.__json_read(only_event=True)
+        if not self.__events:
+            return None  # no event queued; indexing [0] would raise IndexError
+        return self.__events.pop(0)
+
    def get_events(self, wait=False):
        """
        Get a list of available QMP events.
@ -161,3 +183,8 @@ class QEMUMonitorProtocol:
    def close(self):
        self.__sock.close()
        self.__sockfile.close()
+
+    timeout = socket.timeout
+
+    def settimeout(self, timeout):
+        self.__sock.settimeout(timeout)
--- a/2
+++ b/2
@ -1 +1 @@
-1.1.50
+1.3.50
--- a/a.out.h
+++ b/a.out.h
@ -1,430 +0,0 @@
-/* a.out.h
-
-   Copyright 1997, 1998, 1999, 2001 Red Hat, Inc.
-
-This file is part of Cygwin.
-
-This software is a copyrighted work licensed under the terms of the
-Cygwin license.  Please consult the file "CYGWIN_LICENSE" for
-details. */
-
-#ifndef _A_OUT_H_
-#define _A_OUT_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#define COFF_IMAGE_WITH_PE
-#define COFF_LONG_SECTION_NAMES
-
-/*** coff information for Intel 386/486.  */
-
-
-/********************** FILE HEADER **********************/
-
-struct external_filehdr {
-  short f_magic;	/* magic number			*/
-  short f_nscns;	/* number of sections		*/
-  host_ulong f_timdat;	/* time & date stamp		*/
-  host_ulong f_symptr;	/* file pointer to symtab	*/
-  host_ulong f_nsyms;	/* number of symtab entries	*/
-  short f_opthdr;	/* sizeof(optional hdr)		*/
-  short f_flags;	/* flags			*/
-};
-
-/* Bits for f_flags:
- *	F_RELFLG	relocation info stripped from file
- *	F_EXEC		file is executable (no unresolved external references)
- *	F_LNNO		line numbers stripped from file
- *	F_LSYMS		local symbols stripped from file
- *	F_AR32WR	file has byte ordering of an AR32WR machine (e.g. vax)
- */
-
-#define F_RELFLG	(0x0001)
-#define F_EXEC		(0x0002)
-#define F_LNNO		(0x0004)
-#define F_LSYMS		(0x0008)
-
-
-
-#define	I386MAGIC	0x14c
-#define I386PTXMAGIC	0x154
-#define I386AIXMAGIC	0x175
-
-/* This is Lynx's all-platform magic number for executables. */
-
-#define LYNXCOFFMAGIC	0415
-
-#define I386BADMAG(x) (((x).f_magic != I386MAGIC) \
-		       && (x).f_magic != I386AIXMAGIC \
-		       && (x).f_magic != I386PTXMAGIC \
-		       && (x).f_magic != LYNXCOFFMAGIC)
-
-#define	FILHDR	struct external_filehdr
-#define	FILHSZ	20
-
-
-/********************** AOUT "OPTIONAL HEADER"=
- **********************/
-
-
-typedef struct
-{
-  unsigned short magic;		/* type of file				*/
-  unsigned short vstamp;	/* version stamp			*/
-  host_ulong	tsize;		/* text size in bytes, padded to FW bdry*/
-  host_ulong	dsize;		/* initialized data "  "		*/
-  host_ulong	bsize;		/* uninitialized data "   "		*/
-  host_ulong	entry;		/* entry pt.				*/
-  host_ulong text_start;	/* base of text used for this file */
-  host_ulong data_start;	/* base of data used for this file=
- */
-}
-AOUTHDR;
-
-#define AOUTSZ 28
-#define AOUTHDRSZ 28
-
-#define OMAGIC          0404    /* object files, eg as output */
-#define ZMAGIC          0413    /* demand load format, eg normal ld output */
-#define STMAGIC		0401	/* target shlib */
-#define SHMAGIC		0443	/* host   shlib */
-
-
-/* define some NT default values */
-/*  #define NT_IMAGE_BASE        0x400000 moved to internal.h */
-#define NT_SECTION_ALIGNMENT 0x1000
-#define NT_FILE_ALIGNMENT    0x200
-#define NT_DEF_RESERVE       0x100000
-#define NT_DEF_COMMIT        0x1000
-
-/********************** SECTION HEADER **********************/
-
-
-struct external_scnhdr {
-  char		s_name[8];	/* section name			*/
-  host_ulong	s_paddr;	/* physical address, offset
-				   of last addr in scn */
-  host_ulong	s_vaddr;	/* virtual address		*/
-  host_ulong	s_size;		/* section size			*/
-  host_ulong	s_scnptr;	/* file ptr to raw data for section */
-  host_ulong	s_relptr;	/* file ptr to relocation	*/
-  host_ulong	s_lnnoptr;	/* file ptr to line numbers	*/
-  unsigned short s_nreloc;	/* number of relocation entries	*/
-  unsigned short s_nlnno;	/* number of line number entries*/
-  host_ulong	s_flags;	/* flags			*/
-};
-
-#define	SCNHDR	struct external_scnhdr
-#define	SCNHSZ	40
-
-/*
- * names of "special" sections
- */
-#define _TEXT	".text"
-#define _DATA	".data"
-#define _BSS	".bss"
-#define _COMMENT ".comment"
-#define _LIB ".lib"
-
-/********************** LINE NUMBERS **********************/
-
-/* 1 line number entry for every "breakpointable" source line in a section.
- * Line numbers are grouped on a per function basis; first entry in a function
- * grouping will have l_lnno = 0 and in place of physical address will be the
- * symbol table index of the function name.
- */
-struct external_lineno {
-  union {
-    host_ulong l_symndx; /* function name symbol index, iff l_lnno 0 */
-    host_ulong l_paddr;	/* (physical) address of line number	*/
-  } l_addr;
-  unsigned short l_lnno;	/* line number		*/
-};
-
-#define	LINENO	struct external_lineno
-#define	LINESZ	6
-
-/********************** SYMBOLS **********************/
-
-#define E_SYMNMLEN	8	/* # characters in a symbol name	*/
-#define E_FILNMLEN	14	/* # characters in a file name		*/
-#define E_DIMNUM	4	/* # array dimensions in auxiliary entry */
-
-struct QEMU_PACKED external_syment
-{
-  union {
-    char e_name[E_SYMNMLEN];
-    struct {
-      host_ulong e_zeroes;
-      host_ulong e_offset;
-    } e;
-  } e;
-  host_ulong e_value;
-  unsigned short e_scnum;
-  unsigned short e_type;
-  char e_sclass[1];
-  char e_numaux[1];
-};
-
-#define N_BTMASK	(0xf)
-#define N_TMASK		(0x30)
-#define N_BTSHFT	(4)
-#define N_TSHIFT	(2)
-
-union external_auxent {
-  struct {
-    host_ulong x_tagndx;	/* str, un, or enum tag indx */
-    union {
-      struct {
-	unsigned short  x_lnno; /* declaration line number */
-	unsigned short  x_size; /* str/union/array size */
-      } x_lnsz;
-      host_ulong x_fsize;	/* size of function */
-    } x_misc;
-    union {
-      struct {			/* if ISFCN, tag, or .bb */
-	host_ulong x_lnnoptr;/* ptr to fcn line # */
-	host_ulong x_endndx;	/* entry ndx past block end */
-      } x_fcn;
-      struct {			/* if ISARY, up to 4 dimen. */
-	char x_dimen[E_DIMNUM][2];
-      } x_ary;
-    } x_fcnary;
-    unsigned short x_tvndx;	/* tv index */
-  } x_sym;
-
-  union {
-    char x_fname[E_FILNMLEN];
-    struct {
-      host_ulong x_zeroes;
-      host_ulong x_offset;
-    } x_n;
-  } x_file;
-
-  struct {
-    host_ulong x_scnlen;	/* section length */
-    unsigned short x_nreloc;	/* # relocation entries */
-    unsigned short x_nlinno;	/* # line numbers */
-    host_ulong x_checksum;	/* section COMDAT checksum */
-    unsigned short x_associated;/* COMDAT associated section index */
-    char x_comdat[1];		/* COMDAT selection number */
-  } x_scn;
-
-  struct {
-    host_ulong x_tvfill;	/* tv fill value */
-    unsigned short x_tvlen;	/* length of .tv */
-    char x_tvran[2][2];		/* tv range */
-  } x_tv;	/* info about .tv section (in auxent of symbol .tv)) */
-
-};
-
-#define	SYMENT	struct external_syment
-#define	SYMESZ	18
-#define	AUXENT	union external_auxent
-#define	AUXESZ	18
-
-#define _ETEXT	"etext"
-
-/********************** RELOCATION DIRECTIVES **********************/
-
-struct external_reloc {
-  char r_vaddr[4];
-  char r_symndx[4];
-  char r_type[2];
-};
-
-#define RELOC struct external_reloc
-#define RELSZ 10
-
-/* end of coff/i386.h */
-
-/* PE COFF header information */
-
-#ifndef _PE_H
-#define _PE_H
-
-/* NT specific file attributes */
-#define IMAGE_FILE_RELOCS_STRIPPED           0x0001
-#define IMAGE_FILE_EXECUTABLE_IMAGE          0x0002
-#define IMAGE_FILE_LINE_NUMS_STRIPPED        0x0004
-#define IMAGE_FILE_LOCAL_SYMS_STRIPPED       0x0008
-#define IMAGE_FILE_BYTES_REVERSED_LO         0x0080
-#define IMAGE_FILE_32BIT_MACHINE             0x0100
-#define IMAGE_FILE_DEBUG_STRIPPED            0x0200
-#define IMAGE_FILE_SYSTEM                    0x1000
-#define IMAGE_FILE_DLL                       0x2000
-#define IMAGE_FILE_BYTES_REVERSED_HI         0x8000
-
-/* additional flags to be set for section headers to allow the NT loader to
-   read and write to the section data (to replace the addresses of data in
-   dlls for one thing); also to execute the section in .text's case=
- */
-#define IMAGE_SCN_MEM_DISCARDABLE 0x02000000
-#define IMAGE_SCN_MEM_EXECUTE     0x20000000
-#define IMAGE_SCN_MEM_READ        0x40000000
-#define IMAGE_SCN_MEM_WRITE       0x80000000
-
-/*
- * Section characteristics added for ppc-nt
- */
-
-#define IMAGE_SCN_TYPE_NO_PAD                0x00000008  /* Reserved.  */
-
-#define IMAGE_SCN_CNT_CODE                   0x00000020  /* Section contains code. */
-#define IMAGE_SCN_CNT_INITIALIZED_DATA       0x00000040  /* Section contains initialized data. */
-#define IMAGE_SCN_CNT_UNINITIALIZED_DATA     0x00000080  /* Section contains uninitialized data. */
-
-#define IMAGE_SCN_LNK_OTHER                  0x00000100  /* Reserved.  */
-#define IMAGE_SCN_LNK_INFO                   0x00000200  /* Section contains comments or some other type of information. */
-#define IMAGE_SCN_LNK_REMOVE                 0x00000800  /* Section contents will not become part of image. */
-#define IMAGE_SCN_LNK_COMDAT                 0x00001000  /* Section contents comdat. */
-
-#define IMAGE_SCN_MEM_FARDATA                0x00008000
-
-#define IMAGE_SCN_MEM_PURGEABLE              0x00020000
-#define IMAGE_SCN_MEM_16BIT                  0x00020000
-#define IMAGE_SCN_MEM_LOCKED                 0x00040000
-#define IMAGE_SCN_MEM_PRELOAD                0x00080000
-
-#define IMAGE_SCN_ALIGN_1BYTES               0x00100000
-#define IMAGE_SCN_ALIGN_2BYTES               0x00200000
-#define IMAGE_SCN_ALIGN_4BYTES               0x00300000
-#define IMAGE_SCN_ALIGN_8BYTES               0x00400000
-#define IMAGE_SCN_ALIGN_16BYTES              0x00500000  /* Default alignment if no others are specified. */
-#define IMAGE_SCN_ALIGN_32BYTES              0x00600000
-#define IMAGE_SCN_ALIGN_64BYTES              0x00700000
-
-
-#define IMAGE_SCN_LNK_NRELOC_OVFL            0x01000000  /* Section contains extended relocations. */
-#define IMAGE_SCN_MEM_NOT_CACHED             0x04000000  /* Section is not cachable.               */
-#define IMAGE_SCN_MEM_NOT_PAGED              0x08000000  /* Section is not pageable.               */
-#define IMAGE_SCN_MEM_SHARED                 0x10000000  /* Section is shareable.                  */
-
-/* COMDAT selection codes.  */
-
-#define IMAGE_COMDAT_SELECT_NODUPLICATES     (1) /* Warn if duplicates.  */
-#define IMAGE_COMDAT_SELECT_ANY		     (2) /* No warning.  */
-#define IMAGE_COMDAT_SELECT_SAME_SIZE	     (3) /* Warn if different size.  */
-#define IMAGE_COMDAT_SELECT_EXACT_MATCH	     (4) /* Warn if different.  */
-#define IMAGE_COMDAT_SELECT_ASSOCIATIVE	     (5) /* Base on other section.  */
-
-/* Magic values that are true for all dos/nt implementations */
-#define DOSMAGIC       0x5a4d
-#define NT_SIGNATURE   0x00004550
-
-/* NT allows long filenames, we want to accommodate this.  This may break
-     some of the bfd functions */
-#undef  FILNMLEN
-#define FILNMLEN	18	/* # characters in a file name		*/
-
-
-#ifdef COFF_IMAGE_WITH_PE
-/* The filehdr is only weired in images */
-
-#undef FILHDR
-struct external_PE_filehdr
-{
-  /* DOS header fields */
-  unsigned short e_magic;	/* Magic number, 0x5a4d */
-  unsigned short e_cblp;	/* Bytes on last page of file, 0x90 */
-  unsigned short e_cp;		/* Pages in file, 0x3 */
-  unsigned short e_crlc;	/* Relocations, 0x0 */
-  unsigned short e_cparhdr;	/* Size of header in paragraphs, 0x4 */
-  unsigned short e_minalloc;	/* Minimum extra paragraphs needed, 0x0 */
-  unsigned short e_maxalloc;	/* Maximum extra paragraphs needed, 0xFFFF */
-  unsigned short e_ss;		/* Initial (relative) SS value, 0x0 */
-  unsigned short e_sp;		/* Initial SP value, 0xb8 */
-  unsigned short e_csum;	/* Checksum, 0x0 */
-  unsigned short e_ip;		/* Initial IP value, 0x0 */
-  unsigned short e_cs;		/* Initial (relative) CS value, 0x0 */
-  unsigned short e_lfarlc;	/* File address of relocation table, 0x40 */
-  unsigned short e_ovno;	/* Overlay number, 0x0 */
-  char e_res[4][2];		/* Reserved words, all 0x0 */
-  unsigned short e_oemid;	/* OEM identifier (for e_oeminfo), 0x0 */
-  unsigned short e_oeminfo;	/* OEM information; e_oemid specific, 0x0 */
-  char e_res2[10][2];		/* Reserved words, all 0x0 */
-  host_ulong e_lfanew;	/* File address of new exe header, 0x80 */
-  char dos_message[16][4];	/* other stuff, always follow DOS header */
-  unsigned int nt_signature;	/* required NT signature, 0x4550 */
-
-  /* From standard header */
-
-  unsigned short f_magic;	/* magic number			*/
-  unsigned short f_nscns;	/* number of sections		*/
-  host_ulong f_timdat;	/* time & date stamp		*/
-  host_ulong f_symptr;	/* file pointer to symtab	*/
-  host_ulong f_nsyms;	/* number of symtab entries	*/
-  unsigned short f_opthdr;	/* sizeof(optional hdr)		*/
-  unsigned short f_flags;	/* flags			*/
-};
-
-
-#define FILHDR struct external_PE_filehdr
-#undef FILHSZ
-#define FILHSZ 152
-
-#endif
-
-typedef struct
-{
-  unsigned short magic;		/* type of file				*/
-  unsigned short vstamp;	/* version stamp			*/
-  host_ulong	tsize;		/* text size in bytes, padded to FW bdry*/
-  host_ulong	dsize;		/* initialized data "  "		*/
-  host_ulong	bsize;		/* uninitialized data "   "		*/
-  host_ulong	entry;		/* entry pt.				*/
-  host_ulong text_start;	/* base of text used for this file */
-  host_ulong data_start;	/* base of all data used for this file */
-
-  /* NT extra fields; see internal.h for descriptions */
-  host_ulong  ImageBase;
-  host_ulong  SectionAlignment;
-  host_ulong  FileAlignment;
-  unsigned short  MajorOperatingSystemVersion;
-  unsigned short  MinorOperatingSystemVersion;
-  unsigned short  MajorImageVersion;
-  unsigned short  MinorImageVersion;
-  unsigned short  MajorSubsystemVersion;
-  unsigned short  MinorSubsystemVersion;
-  char  Reserved1[4];
-  host_ulong  SizeOfImage;
-  host_ulong  SizeOfHeaders;
-  host_ulong  CheckSum;
-  unsigned short Subsystem;
-  unsigned short DllCharacteristics;
-  host_ulong  SizeOfStackReserve;
-  host_ulong  SizeOfStackCommit;
-  host_ulong  SizeOfHeapReserve;
-  host_ulong  SizeOfHeapCommit;
-  host_ulong  LoaderFlags;
-  host_ulong  NumberOfRvaAndSizes;
-  /* IMAGE_DATA_DIRECTORY DataDirectory[IMAGE_NUMBEROF_DIRECTORY_ENTRIES]; */
-  char  DataDirectory[16][2][4]; /* 16 entries, 2 elements/entry, 4 chars */
-
-} PEAOUTHDR;
-
-
-#undef AOUTSZ
-#define AOUTSZ (AOUTHDRSZ + 196)
-
-#undef  E_FILNMLEN
-#define E_FILNMLEN	18	/* # characters in a file name		*/
-#endif
-
-/* end of coff/pe.h */
-
-#define DT_NON		(0)	/* no derived type */
-#define DT_PTR		(1)	/* pointer */
-#define DT_FCN		(2)	/* function */
-#define DT_ARY		(3)	/* array */
-
-#define ISPTR(x)	(((x) & N_TMASK) == (DT_PTR << N_BTSHFT))
-#define ISFCN(x)	(((x) & N_TMASK) == (DT_FCN << N_BTSHFT))
-#define ISARY(x)	(((x) & N_TMASK) == (DT_ARY << N_BTSHFT))
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _A_OUT_H_ */
--- a/acl.c
+++ b/acl.c
@ -24,7 +24,7 @@


 #include "qemu-common.h"
-#include "acl.h"
+#include "qemu/acl.h"

 #ifdef CONFIG_FNMATCH
 #include <fnmatch.h>
--- a/aes.c
+++ b/aes.c
@ -28,7 +28,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include "qemu-common.h"
-#include "aes.h"
+#include "block/aes.h"

 #ifndef NDEBUG
 #define NDEBUG
--- a/aio-posix.c
+++ b/aio-posix.c
@ -0,0 +1,268 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "block/block.h"
+#include "qemu/queue.h"
+#include "qemu/sockets.h"
+
+struct AioHandler
+{
+    GPollFD pfd;
+    IOHandler *io_read;
+    IOHandler *io_write;
+    AioFlushHandler *io_flush;
+    int deleted;
+    void *opaque;
+    QLIST_ENTRY(AioHandler) node;
+};
+
+static AioHandler *find_aio_handler(AioContext *ctx, int fd)
+{
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->pfd.fd == fd)
+            if (!node->deleted)
+                return node;
+    }
+
+    return NULL;
+}
+
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        AioFlushHandler *io_flush,
+                        void *opaque)
+{
+    AioHandler *node;
+
+    node = find_aio_handler(ctx, fd);
+
+    /* Are we deleting the fd handler? */
+    if (!io_read && !io_write) {
+        if (node) {
+            g_source_remove_poll(&ctx->source, &node->pfd);
+
+            /* If the lock is held, just mark the node as deleted */
+            if (ctx->walking_handlers) {
+                node->deleted = 1;
+                node->pfd.revents = 0;
+            } else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock.
+                 */
+                QLIST_REMOVE(node, node);
+                g_free(node);
+            }
+        }
+    } else {
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = g_malloc0(sizeof(AioHandler));
+            node->pfd.fd = fd;
+            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
+
+            g_source_add_poll(&ctx->source, &node->pfd);
+        }
+        /* Update handler with latest information */
+        node->io_read = io_read;
+        node->io_write = io_write;
+        node->io_flush = io_flush;
+        node->opaque = opaque;
+
+        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP : 0);
+        node->pfd.events |= (io_write ? G_IO_OUT : 0);
+    }
+
+    aio_notify(ctx);
+}
+
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *notifier,
+                            EventNotifierHandler *io_read,
+                            AioFlushEventNotifierHandler *io_flush)
+{
+    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
+                       (IOHandler *)io_read, NULL,
+                       (AioFlushHandler *)io_flush, notifier);
+}
+
+bool aio_pending(AioContext *ctx)
+{
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int revents;
+
+        /*
+         * FIXME: right now we cannot get G_IO_HUP and G_IO_ERR because
+         * main-loop.c is still select based (due to the slirp legacy).
+         * If main-loop.c ever switches to poll, G_IO_ERR should be
+         * tested too.  Dispatching G_IO_ERR to both handlers should be
+         * okay, since handlers need to be ready for spurious wakeups.
+         */
+        revents = node->pfd.revents & node->pfd.events;
+        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
+            return true;
+        }
+        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+    static struct timeval tv0;
+    AioHandler *node;
+    fd_set rdfds, wrfds;
+    int max_fd = -1;
+    int ret;
+    bool busy, progress;
+
+    progress = false;
+
+    /*
+     * If there are callbacks left that have been queued, we need to call them.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for qemu_aio_wait loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        blocking = false;
+        progress = true;
+    }
+
+    /*
+     * Then dispatch any pending callbacks from the GSource.
+     *
+     * We have to walk very carefully in case qemu_aio_set_fd_handler is
+     * called while we're walking.
+     */
+    node = QLIST_FIRST(&ctx->aio_handlers);
+    while (node) {
+        AioHandler *tmp;
+        int revents;
+
+        ctx->walking_handlers++;
+
+        revents = node->pfd.revents & node->pfd.events;
+        node->pfd.revents = 0;
+
+        /* See comment in aio_pending.  */
+        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
+            node->io_read(node->opaque);
+            progress = true;
+        }
+        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
+            node->io_write(node->opaque);
+            progress = true;
+        }
+
+        tmp = node;
+        node = QLIST_NEXT(node, node);
+
+        ctx->walking_handlers--;
+
+        if (!ctx->walking_handlers && tmp->deleted) {
+            QLIST_REMOVE(tmp, node);
+            g_free(tmp);
+        }
+    }
+
+    if (progress && !blocking) {
+        return true;
+    }
+
+    ctx->walking_handlers++;
+
+    FD_ZERO(&rdfds);
+    FD_ZERO(&wrfds);
+
+    /* fill fd sets */
+    busy = false;
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        /* If there aren't pending AIO operations, don't invoke callbacks.
+         * Otherwise, if there are no AIO requests, qemu_aio_wait() would
+         * wait indefinitely.
+         */
+        if (!node->deleted && node->io_flush) {
+            if (node->io_flush(node->opaque) == 0) {
+                continue;
+            }
+            busy = true;
+        }
+        if (!node->deleted && node->io_read) {
+            FD_SET(node->pfd.fd, &rdfds);
+            max_fd = MAX(max_fd, node->pfd.fd + 1);
+        }
+        if (!node->deleted && node->io_write) {
+            FD_SET(node->pfd.fd, &wrfds);
+            max_fd = MAX(max_fd, node->pfd.fd + 1);
+        }
+    }
+
+    ctx->walking_handlers--;
+
+    /* No AIO operations?  Get us out of here */
+    if (!busy) {
+        return progress;
+    }
+
+    /* wait until next event */
+    ret = select(max_fd, &rdfds, &wrfds, NULL, blocking ? NULL : &tv0);
+
+    /* if we have any readable fds, dispatch event */
+    if (ret > 0) {
+        /* we have to walk very carefully in case
+         * qemu_aio_set_fd_handler is called while we're walking */
+        node = QLIST_FIRST(&ctx->aio_handlers);
+        while (node) {
+            AioHandler *tmp;
+
+            ctx->walking_handlers++;
+
+            if (!node->deleted &&
+                FD_ISSET(node->pfd.fd, &rdfds) &&
+                node->io_read) {
+                node->io_read(node->opaque);
+                progress = true;
+            }
+            if (!node->deleted &&
+                FD_ISSET(node->pfd.fd, &wrfds) &&
+                node->io_write) {
+                node->io_write(node->opaque);
+                progress = true;
+            }
+
+            tmp = node;
+            node = QLIST_NEXT(node, node);
+
+            ctx->walking_handlers--;
+
+            if (!ctx->walking_handlers && tmp->deleted) {
+                QLIST_REMOVE(tmp, node);
+                g_free(tmp);
+            }
+        }
+    }
+
+    return progress;
+}
--- a/aio-win32.c
+++ b/aio-win32.c
@ -0,0 +1,218 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM Corp., 2008
+ * Copyright Red Hat Inc., 2012
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Paolo Bonzini     <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "block/block.h"
+#include "qemu/queue.h"
+#include "qemu/sockets.h"
+
+struct AioHandler {
+    EventNotifier *e;
+    EventNotifierHandler *io_notify;
+    AioFlushEventNotifierHandler *io_flush;
+    GPollFD pfd;
+    int deleted;
+    QLIST_ENTRY(AioHandler) node;
+};
+
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *e,
+                            EventNotifierHandler *io_notify,
+                            AioFlushEventNotifierHandler *io_flush)
+{
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->e == e && !node->deleted) {
+            break;
+        }
+    }
+
+    /* Are we deleting the fd handler? */
+    if (!io_notify) {
+        if (node) {
+            g_source_remove_poll(&ctx->source, &node->pfd);
+
+            /* If the lock is held, just mark the node as deleted */
+            if (ctx->walking_handlers) {
+                node->deleted = 1;
+                node->pfd.revents = 0;
+            } else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock.
+                 */
+                QLIST_REMOVE(node, node);
+                g_free(node);
+            }
+        }
+    } else {
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = g_malloc0(sizeof(AioHandler));
+            node->e = e;
+            node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
+            node->pfd.events = G_IO_IN;
+            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
+
+            g_source_add_poll(&ctx->source, &node->pfd);
+        }
+        /* Update handler with latest information */
+        node->io_notify = io_notify;
+        node->io_flush = io_flush;
+    }
+
+    aio_notify(ctx);
+}
+
+bool aio_pending(AioContext *ctx)
+{
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->pfd.revents && node->io_notify) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    bool busy, progress;
+    int count;
+
+    progress = false;
+
+    /*
+     * If there are callbacks left that have been queued, we need to call them.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for qemu_aio_wait loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        blocking = false;
+        progress = true;
+    }
+
+    /*
+     * Then dispatch any pending callbacks from the GSource.
+     *
+     * We have to walk very carefully in case qemu_aio_set_fd_handler is
+     * called while we're walking.
+     */
+    node = QLIST_FIRST(&ctx->aio_handlers);
+    while (node) {
+        AioHandler *tmp;
+
+        ctx->walking_handlers++;
+
+        if (node->pfd.revents && node->io_notify) {
+            node->pfd.revents = 0;
+            node->io_notify(node->e);
+            progress = true;
+        }
+
+        tmp = node;
+        node = QLIST_NEXT(node, node);
+
+        ctx->walking_handlers--;
+
+        if (!ctx->walking_handlers && tmp->deleted) {
+            QLIST_REMOVE(tmp, node);
+            g_free(tmp);
+        }
+    }
+
+    if (progress && !blocking) {
+        return true;
+    }
+
+    ctx->walking_handlers++;
+
+    /* fill the array of event handles to wait on */
+    busy = false;
+    count = 0;
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        /* If there aren't pending AIO operations, don't invoke callbacks.
+         * Otherwise, if there are no AIO requests, qemu_aio_wait() would
+         * wait indefinitely.
+         */
+        if (!node->deleted && node->io_flush) {
+            if (node->io_flush(node->e) == 0) {
+                continue;
+            }
+            busy = true;
+        }
+        if (!node->deleted && node->io_notify) {
+            events[count++] = event_notifier_get_handle(node->e);
+        }
+    }
+
+    ctx->walking_handlers--;
+
+    /* No AIO operations?  Get us out of here */
+    if (!busy) {
+        return progress;
+    }
+
+    /* wait until next event */
+    while (count > 0) {
+        int timeout = blocking ? INFINITE : 0;
+        int ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+
+        /* if we have any signaled events, dispatch event */
+        if ((DWORD) (ret - WAIT_OBJECT_0) >= count) {
+            break;
+        }
+
+        blocking = false;
+
+        /* we have to walk very carefully in case
+         * qemu_aio_set_fd_handler is called while we're walking */
+        node = QLIST_FIRST(&ctx->aio_handlers);
+        while (node) {
+            AioHandler *tmp;
+
+            ctx->walking_handlers++;
+
+            if (!node->deleted &&
+                event_notifier_get_handle(node->e) == events[ret - WAIT_OBJECT_0] &&
+                node->io_notify) {
+                node->io_notify(node->e);
+                progress = true;
+            }
+
+            tmp = node;
+            node = QLIST_NEXT(node, node);
+
+            ctx->walking_handlers--;
+
+            if (!ctx->walking_handlers && tmp->deleted) {
+                QLIST_REMOVE(tmp, node);
+                g_free(tmp);
+            }
+        }
+
+        /* Try again, but only call each handler once.  */
+        events[ret - WAIT_OBJECT_0] = events[--count];
+    }
+
+    return progress;
+}
--- a/aio.c
+++ b/aio.c
@ -1,194 +0,0 @@
-/*
- * QEMU aio implementation
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-
-#include "qemu-common.h"
-#include "block.h"
-#include "qemu-queue.h"
-#include "qemu_socket.h"
-
-typedef struct AioHandler AioHandler;
-
-/* The list of registered AIO handlers */
-static QLIST_HEAD(, AioHandler) aio_handlers;
-
-/* This is a simple lock used to protect the aio_handlers list.  Specifically,
- * it's used to ensure that no callbacks are removed while we're walking and
- * dispatching callbacks.
- */
-static int walking_handlers;
-
-struct AioHandler
-{
-    int fd;
-    IOHandler *io_read;
-    IOHandler *io_write;
-    AioFlushHandler *io_flush;
-    int deleted;
-    void *opaque;
-    QLIST_ENTRY(AioHandler) node;
-};
-
-static AioHandler *find_aio_handler(int fd)
-{
-    AioHandler *node;
-
-    QLIST_FOREACH(node, &aio_handlers, node) {
-        if (node->fd == fd)
-            if (!node->deleted)
-                return node;
-    }
-
-    return NULL;
-}
-
-int qemu_aio_set_fd_handler(int fd,
-                            IOHandler *io_read,
-                            IOHandler *io_write,
-                            AioFlushHandler *io_flush,
-                            void *opaque)
-{
-    AioHandler *node;
-
-    node = find_aio_handler(fd);
-
-    /* Are we deleting the fd handler? */
-    if (!io_read && !io_write) {
-        if (node) {
-            /* If the lock is held, just mark the node as deleted */
-            if (walking_handlers)
-                node->deleted = 1;
-            else {
-                /* Otherwise, delete it for real.  We can't just mark it as
-                 * deleted because deleted nodes are only cleaned up after
-                 * releasing the walking_handlers lock.
-                 */
-                QLIST_REMOVE(node, node);
-                g_free(node);
-            }
-        }
-    } else {
-        if (node == NULL) {
-            /* Alloc and insert if it's not already there */
-            node = g_malloc0(sizeof(AioHandler));
-            node->fd = fd;
-            QLIST_INSERT_HEAD(&aio_handlers, node, node);
-        }
-        /* Update handler with latest information */
-        node->io_read = io_read;
-        node->io_write = io_write;
-        node->io_flush = io_flush;
-        node->opaque = opaque;
-    }
-
-    qemu_set_fd_handler2(fd, NULL, io_read, io_write, opaque);
-
-    return 0;
-}
-
-void qemu_aio_flush(void)
-{
-    while (qemu_aio_wait());
-}
-
-bool qemu_aio_wait(void)
-{
-    AioHandler *node;
-    fd_set rdfds, wrfds;
-    int max_fd = -1;
-    int ret;
-    bool busy;
-
-    /*
-     * If there are callbacks left that have been queued, we need to call then.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for qemu_aio_wait loops).
-     */
-    if (qemu_bh_poll()) {
-        return true;
-    }
-
-    walking_handlers = 1;
-
-    FD_ZERO(&rdfds);
-    FD_ZERO(&wrfds);
-
-    /* fill fd sets */
-    busy = false;
-    QLIST_FOREACH(node, &aio_handlers, node) {
-        /* If there aren't pending AIO operations, don't invoke callbacks.
-         * Otherwise, if there are no AIO requests, qemu_aio_wait() would
-         * wait indefinitely.
-         */
-        if (node->io_flush) {
-            if (node->io_flush(node->opaque) == 0) {
-                continue;
-            }
-            busy = true;
-        }
-        if (!node->deleted && node->io_read) {
-            FD_SET(node->fd, &rdfds);
-            max_fd = MAX(max_fd, node->fd + 1);
-        }
-        if (!node->deleted && node->io_write) {
-            FD_SET(node->fd, &wrfds);
-            max_fd = MAX(max_fd, node->fd + 1);
-        }
-    }
-
-    walking_handlers = 0;
-
-    /* No AIO operations?  Get us out of here */
-    if (!busy) {
-        return false;
-    }
-
-    /* wait until next event */
-    ret = select(max_fd, &rdfds, &wrfds, NULL, NULL);
-
-    /* if we have any readable fds, dispatch event */
-    if (ret > 0) {
-        walking_handlers = 1;
-
-        /* we have to walk very carefully in case
-         * qemu_aio_set_fd_handler is called while we're walking */
-        node = QLIST_FIRST(&aio_handlers);
-        while (node) {
-            AioHandler *tmp;
-
-            if (!node->deleted &&
-                FD_ISSET(node->fd, &rdfds) &&
-                node->io_read) {
-                node->io_read(node->opaque);
-            }
-            if (!node->deleted &&
-                FD_ISSET(node->fd, &wrfds) &&
-                node->io_write) {
-                node->io_write(node->opaque);
-            }
-
-            tmp = node;
-            node = QLIST_NEXT(node, node);
-
-            if (tmp->deleted) {
-                QLIST_REMOVE(tmp, node);
-                g_free(tmp);
-            }
-        }
-
-        walking_handlers = 0;
-    }
-
-    return true;
-}
--- a/arch_init.c
+++ b/arch_init.c
@ -29,21 +29,26 @@
 #include <sys/mman.h>
 #endif
 #include "config.h"
-#include "monitor.h"
-#include "sysemu.h"
-#include "arch_init.h"
+#include "monitor/monitor.h"
+#include "sysemu/sysemu.h"
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"
+#include "sysemu/arch_init.h"
 #include "audio/audio.h"
 #include "hw/pc.h"
-#include "hw/pci.h"
+#include "hw/pci/pci.h"
 #include "hw/audiodev.h"
-#include "kvm.h"
-#include "migration.h"
-#include "net.h"
-#include "gdbstub.h"
+#include "sysemu/kvm.h"
+#include "migration/migration.h"
+#include "exec/gdbstub.h"
 #include "hw/smbios.h"
-#include "exec-memory.h"
+#include "exec/address-spaces.h"
 #include "hw/pcspk.h"
-#include "qemu/page_cache.h"
+#include "migration/page_cache.h"
+#include "qemu/config-file.h"
+#include "qmp-commands.h"
+#include "trace.h"
+#include "exec/cpu-all.h"

 #ifdef DEBUG_ARCH_INIT
 #define DPRINTF(fmt, ...) \
@ -135,7 +140,6 @@ static struct defconfig_file {
    /* Indicates it is an user config file (disabled by -no-user-config) */
    bool userconfig;
 } default_config_files[] = {
-    { CONFIG_QEMU_DATADIR "/cpus-" TARGET_ARCH ".conf",  false },
    { CONFIG_QEMU_CONFDIR "/qemu.conf",                   true },
    { CONFIG_QEMU_CONFDIR "/target-" TARGET_ARCH ".conf", true },
    { NULL }, /* end of list */
@ -261,16 +265,21 @@ uint64_t xbzrle_mig_pages_overflow(void)
    return acct_info.xbzrle_overflows;
 }

-static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
-        int cont, int flag)
+static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                             int cont, int flag)
 {
-        qemu_put_be64(f, offset | cont | flag);
-        if (!cont) {
-                qemu_put_byte(f, strlen(block->idstr));
-                qemu_put_buffer(f, (uint8_t *)block->idstr,
-                                strlen(block->idstr));
-        }
+    size_t size;

+    qemu_put_be64(f, offset | cont | flag);
+    size = 8;
+
+    if (!cont) {
+        qemu_put_byte(f, strlen(block->idstr));
+        qemu_put_buffer(f, (uint8_t *)block->idstr,
+                        strlen(block->idstr));
+        size += 1 + strlen(block->idstr);
+    }
+    return size;
 }

 #define ENCODING_FLAG_XBZRLE 0x1
@ -317,56 +326,147 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
    }

    /* Send XBZRLE based compressed page */
-    save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
+    bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
-    bytes_sent = encoded_len + 1 + 2;
+    bytes_sent += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_sent;

    return bytes_sent;
 }

-static RAMBlock *last_block;
+
+/* This is the last block that we have visited searching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
 static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+
+static inline
+ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
+                                                 ram_addr_t start)
+{
+    unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
+    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
+    unsigned long size = base + (int128_get64(mr->size) >> TARGET_PAGE_BITS);
+
+    unsigned long next = find_next_bit(migration_bitmap, size, nr);
+
+    if (next < size) {
+        clear_bit(next, migration_bitmap);
+        migration_dirty_pages--;
+    }
+    return (next - base) << TARGET_PAGE_BITS;
+}
+
+static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
+                                              ram_addr_t offset)
+{
+    bool ret;
+    int nr = (mr->ram_addr + offset) >> TARGET_PAGE_BITS;
+
+    ret = test_and_set_bit(nr, migration_bitmap);
+
+    if (!ret) {
+        migration_dirty_pages++;
+    }
+    return ret;
+}
+
+static void migration_bitmap_sync(void)
+{
+    RAMBlock *block;
+    ram_addr_t addr;
+    uint64_t num_dirty_pages_init = migration_dirty_pages;
+    MigrationState *s = migrate_get_current();
+    static int64_t start_time;
+    static int64_t num_dirty_pages_period;
+    int64_t end_time;
+
+    if (!start_time) {
+        start_time = qemu_get_clock_ms(rt_clock);
+    }
+
+    trace_migration_bitmap_sync_start();
+    memory_global_sync_dirty_bitmap(get_system_memory());
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
+            if (memory_region_test_and_clear_dirty(block->mr,
+                                                   addr, TARGET_PAGE_SIZE,
+                                                   DIRTY_MEMORY_MIGRATION)) {
+                migration_bitmap_set_dirty(block->mr, addr);
+            }
+        }
+    }
+    trace_migration_bitmap_sync_end(migration_dirty_pages
+                                    - num_dirty_pages_init);
+    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
+    end_time = qemu_get_clock_ms(rt_clock);
+
+    /* more than 1 second = 1000 milliseconds */
+    if (end_time > start_time + 1000) {
+        s->dirty_pages_rate = num_dirty_pages_period * 1000
+            / (end_time - start_time);
+        start_time = end_time;
+        num_dirty_pages_period = 0;
+    }
+}

 /*
 * ram_save_block: Writes a page of memory to the stream f
 *
- * Returns:  0: if the page hasn't changed
- *          -1: if there are no more dirty pages
- *           n: the amount of bytes written in other case
+ * Returns:  The number of bytes written.
+ *           0 means no dirty pages
 */

 static int ram_save_block(QEMUFile *f, bool last_stage)
 {
-    RAMBlock *block = last_block;
+    RAMBlock *block = last_seen_block;
    ram_addr_t offset = last_offset;
-    int bytes_sent = -1;
+    bool complete_round = false;
+    int bytes_sent = 0;
    MemoryRegion *mr;
    ram_addr_t current_addr;

    if (!block)
-        block = QLIST_FIRST(&ram_list.blocks);
+        block = QTAILQ_FIRST(&ram_list.blocks);

-    do {
+    while (true) {
        mr = block->mr;
-        if (memory_region_get_dirty(mr, offset, TARGET_PAGE_SIZE,
-                                    DIRTY_MEMORY_MIGRATION)) {
+        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
+        if (complete_round && block == last_seen_block &&
+            offset >= last_offset) {
+            break;
+        }
+        if (offset >= block->length) {
+            offset = 0;
+            block = QTAILQ_NEXT(block, next);
+            if (!block) {
+                block = QTAILQ_FIRST(&ram_list.blocks);
+                complete_round = true;
+            }
+        } else {
            uint8_t *p;
-            int cont = (block == last_block) ? RAM_SAVE_FLAG_CONTINUE : 0;
-
-            memory_region_reset_dirty(mr, offset, TARGET_PAGE_SIZE,
-                                      DIRTY_MEMORY_MIGRATION);
+            int cont = (block == last_sent_block) ?
+                RAM_SAVE_FLAG_CONTINUE : 0;

            p = memory_region_get_ram_ptr(mr) + offset;

+            /* When in doubt, send the page as normal */
+            bytes_sent = -1;
            if (is_dup_page(p)) {
                acct_info.dup_pages++;
-                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
+                bytes_sent = save_block_hdr(f, block, offset, cont,
+                                            RAM_SAVE_FLAG_COMPRESS);
                qemu_put_byte(f, *p);
-                bytes_sent = 1;
+                bytes_sent += 1;
            } else if (migrate_use_xbzrle()) {
                current_addr = block->offset + offset;
                bytes_sent = save_xbzrle_page(f, p, current_addr, block,
@ -376,30 +476,22 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                }
            }

-            /* either we didn't send yet (we may have had XBZRLE overflow) */
+            /* XBZRLE overflow or normal page */
            if (bytes_sent == -1) {
-                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
+                bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
                qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
-                bytes_sent = TARGET_PAGE_SIZE;
+                bytes_sent += TARGET_PAGE_SIZE;
                acct_info.norm_pages++;
            }

            /* if page is unmodified, continue to the next */
-            if (bytes_sent != 0) {
+            if (bytes_sent > 0) {
+                last_sent_block = block;
                break;
            }
        }
-
-        offset += TARGET_PAGE_SIZE;
-        if (offset >= block->length) {
-            offset = 0;
-            block = QLIST_NEXT(block, next);
-            if (!block)
-                block = QLIST_FIRST(&ram_list.blocks);
-        }
-    } while (block != last_block || offset != last_offset);
-
-    last_block = block;
+    }
+    last_seen_block = block;
    last_offset = offset;

    return bytes_sent;
@ -409,7 +501,7 @@ static uint64_t bytes_transferred;

 static ram_addr_t ram_save_remaining(void)
 {
-    return ram_list.dirty_pages;
+    return migration_dirty_pages;
 }

 uint64_t ram_bytes_remaining(void)
@ -427,46 +519,21 @@ uint64_t ram_bytes_total(void)
    RAMBlock *block;
    uint64_t total = 0;

-    QLIST_FOREACH(block, &ram_list.blocks, next)
+    QTAILQ_FOREACH(block, &ram_list.blocks, next)
        total += block->length;

    return total;
 }

-static int block_compar(const void *a, const void *b)
-{
-    RAMBlock * const *ablock = a;
-    RAMBlock * const *bblock = b;
-
-    return strcmp((*ablock)->idstr, (*bblock)->idstr);
-}
-
-static void sort_ram_list(void)
-{
-    RAMBlock *block, *nblock, **blocks;
-    int n;
-    n = 0;
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
-        ++n;
-    }
-    blocks = g_malloc(n * sizeof *blocks);
-    n = 0;
-    QLIST_FOREACH_SAFE(block, &ram_list.blocks, next, nblock) {
-        blocks[n++] = block;
-        QLIST_REMOVE(block, next);
-    }
-    qsort(blocks, n, sizeof *blocks, block_compar);
-    while (--n >= 0) {
-        QLIST_INSERT_HEAD(&ram_list.blocks, blocks[n], next);
-    }
-    g_free(blocks);
-}
-
 static void migration_end(void)
 {
-    memory_global_dirty_log_stop();
+    if (migration_bitmap) {
+        memory_global_dirty_log_stop();
+        g_free(migration_bitmap);
+        migration_bitmap = NULL;
+    }

-    if (migrate_use_xbzrle()) {
+    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
@ -481,17 +548,28 @@ static void ram_migration_cancel(void *opaque)
    migration_end();
 }

+static void reset_ram_globals(void)
+{
+    last_seen_block = NULL;
+    last_sent_block = NULL;
+    last_offset = 0;
+    last_version = ram_list.version;
+}
+
 #define MAX_WAIT 50 /* ms, half buffered_file limit */

 static int ram_save_setup(QEMUFile *f, void *opaque)
 {
-    ram_addr_t addr;
    RAMBlock *block;
+    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

+    migration_bitmap = bitmap_new(ram_pages);
+    bitmap_set(migration_bitmap, 0, ram_pages);
+    migration_dirty_pages = ram_pages;
+
+    qemu_mutex_lock_ramlist();
    bytes_transferred = 0;
-    last_block = NULL;
-    last_offset = 0;
-    sort_ram_list();
+    reset_ram_globals();

    if (migrate_use_xbzrle()) {
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@ -506,26 +584,18 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
        acct_clear();
    }

-    /* Make sure all dirty bits are set */
-    QLIST_FOREACH(block, &ram_list.blocks, next) {
-        for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
-            if (!memory_region_get_dirty(block->mr, addr, TARGET_PAGE_SIZE,
-                                         DIRTY_MEMORY_MIGRATION)) {
-                memory_region_set_dirty(block->mr, addr, TARGET_PAGE_SIZE);
-            }
-        }
-    }
-
    memory_global_dirty_log_start();
+    migration_bitmap_sync();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->length);
    }

+    qemu_mutex_unlock_ramlist();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
@ -533,25 +603,28 @@ static int ram_save_setup(QEMUFile *f, void *opaque)

 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-    uint64_t bytes_transferred_last;
-    double bwidth = 0;
    int ret;
    int i;
-    uint64_t expected_time;
+    int64_t t0;
+    int total_sent = 0;

-    bytes_transferred_last = bytes_transferred;
-    bwidth = qemu_get_clock_ns(rt_clock);
+    qemu_mutex_lock_ramlist();

+    if (ram_list.version != last_version) {
+        reset_ram_globals();
+    }
+
+    t0 = qemu_get_clock_ns(rt_clock);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int bytes_sent;

        bytes_sent = ram_save_block(f, false);
        /* no more blocks to sent */
-        if (bytes_sent < 0) {
+        if (bytes_sent == 0) {
            break;
        }
-        bytes_transferred += bytes_sent;
+        total_sent += bytes_sent;
        acct_info.iterations++;
        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
@ -559,9 +632,9 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
           iterations
        */
        if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
+            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
-                DPRINTF("big wait: " PRIu64 " milliseconds, %d iterations\n",
+                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
@ -570,37 +643,23 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
    }

    if (ret < 0) {
+        bytes_transferred += total_sent;
        return ret;
    }

-    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-    bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-    /* if we haven't transferred anything this round, force expected_time to a
-     * a very high value, but without crashing */
-    if (bwidth == 0) {
-        bwidth = 0.000001;
-    }
-
+    qemu_mutex_unlock_ramlist();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+    total_sent += 8;
+    bytes_transferred += total_sent;

-    expected_time = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-
-    DPRINTF("ram_save_live: expected(" PRIu64 ") <= max(" PRIu64 ")?\n",
-            expected_time, migrate_max_downtime());
-
-    if (expected_time <= migrate_max_downtime()) {
-        memory_global_sync_dirty_bitmap(get_system_memory());
-        expected_time = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-
-        return expected_time <= migrate_max_downtime();
-    }
-    return 0;
+    return total_sent;
 }

 static int ram_save_complete(QEMUFile *f, void *opaque)
 {
-    memory_global_sync_dirty_bitmap(get_system_memory());
+    migration_bitmap_sync();
+
+    qemu_mutex_lock_ramlist();

    /* try transferring iterative blocks of memory */

@ -610,18 +669,32 @@ static int ram_save_complete(QEMUFile *f, void *opaque)

        bytes_sent = ram_save_block(f, true);
        /* no more blocks to sent */
-        if (bytes_sent < 0) {
+        if (bytes_sent == 0) {
            break;
        }
        bytes_transferred += bytes_sent;
    }
-    memory_global_dirty_log_stop();
+    migration_end();

+    qemu_mutex_unlock_ramlist();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
 }

+/*
+ * Report how many bytes of RAM are still waiting to be sent.
+ *
+ * When the first estimate is already below max_size, the migration bitmap
+ * is synced and the estimate is recomputed before being returned.
+ */
+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t pending_bytes = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (pending_bytes >= max_size) {
+        return pending_bytes;
+    }
+
+    migration_bitmap_sync();
+    pending_bytes = ram_save_remaining() * TARGET_PAGE_SIZE;
+    return pending_bytes;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
    int ret, rc = 0;
@ -684,7 +757,7 @@ static inline void *host_from_stream_offset(QEMUFile *f,
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

-    QLIST_FOREACH(block, &ram_list.blocks, next) {
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
        if (!strncmp(id, block->idstr, sizeof(id)))
            return memory_region_get_ram_ptr(block->mr) + offset;
    }
@ -728,7 +801,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                    id[len] = 0;
                    length = qemu_get_be64(f);

-                    QLIST_FOREACH(block, &ram_list.blocks, next) {
+                    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
                        if (!strncmp(id, block->idstr, sizeof(id))) {
                            if (block->length != length) {
                                ret =  -EINVAL;
@ -763,7 +836,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            memset(host, ch, TARGET_PAGE_SIZE);
 #ifndef _WIN32
            if (ch == 0 &&
-                (!kvm_enabled() || kvm_has_sync_mmu())) {
+                (!kvm_enabled() || kvm_has_sync_mmu()) &&
+                getpagesize() <= TARGET_PAGE_SIZE) {
                qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
            }
 #endif
@ -798,8 +872,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
    } while (!(flags & RAM_SAVE_FLAG_EOS));

 done:
-    DPRINTF("Completed load of VM with exit code %d seq iteration " PRIu64 "\n",
-            ret, seq_iter);
+    DPRINTF("Completed load of VM with exit code %d seq iteration "
+            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
 }

@ -807,6 +881,7 @@ SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete = ram_save_complete,
+    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cancel = ram_migration_cancel,
 };
@ -921,11 +996,16 @@ void select_soundhw(const char *optarg)
    if (is_help_option(optarg)) {
    show_valid_cards:

+#ifdef HAS_AUDIO_CHOICE
        printf("Valid sound card names (comma separated):\n");
        for (c = soundhw; c->name; ++c) {
            printf ("%-11s %s\n", c->name, c->descr);
        }
        printf("\n-soundhw all will enable all of the above\n");
+#else
+        printf("Machine has no user-selectable audio hardware "
+               "(it may or may not have always-present audio hardware).\n");
+#endif
        exit(!is_help_option(optarg));
    }
    else {
@ -1080,3 +1160,13 @@ int xen_available(void)
    return 0;
 #endif
 }
+
+
+/* Build a zero-initialised TargetInfo whose arch field is TARGET_TYPE. */
+TargetInfo *qmp_query_target(Error **errp)
+{
+    TargetInfo *target = g_malloc0(sizeof(TargetInfo));
+
+    target->arch = TARGET_TYPE;
+    return target;
+}
--- a/async.c
+++ b/async.c
@ -23,16 +23,14 @@
 */

 #include "qemu-common.h"
-#include "qemu-aio.h"
-#include "main-loop.h"
-
-/* Anchor of the list of Bottom Halves belonging to the context */
-static struct QEMUBH *first_bh;
+#include "block/aio.h"
+#include "qemu/main-loop.h"

 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */

 struct QEMUBH {
+    AioContext *ctx;
    QEMUBHFunc *cb;
    void *opaque;
    QEMUBH *next;
@ -41,27 +39,27 @@ struct QEMUBH {
    bool deleted;
 };

-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
+QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
 {
    QEMUBH *bh;
    bh = g_malloc0(sizeof(QEMUBH));
+    bh->ctx = ctx;
    bh->cb = cb;
    bh->opaque = opaque;
-    bh->next = first_bh;
-    first_bh = bh;
+    bh->next = ctx->first_bh;
+    ctx->first_bh = bh;
    return bh;
 }

-int qemu_bh_poll(void)
+int aio_bh_poll(AioContext *ctx)
 {
    QEMUBH *bh, **bhp, *next;
    int ret;
-    static int nesting = 0;

-    nesting++;
+    ctx->walking_bh++;

    ret = 0;
-    for (bh = first_bh; bh; bh = next) {
+    for (bh = ctx->first_bh; bh; bh = next) {
        next = bh->next;
        if (!bh->deleted && bh->scheduled) {
            bh->scheduled = 0;
@ -72,11 +70,11 @@ int qemu_bh_poll(void)
        }
    }

-    nesting--;
+    ctx->walking_bh--;

    /* remove deleted bhs */
-    if (!nesting) {
-        bhp = &first_bh;
+    if (!ctx->walking_bh) {
+        bhp = &ctx->first_bh;
        while (*bhp) {
            bh = *bhp;
            if (bh->deleted) {
@ -105,8 +103,7 @@ void qemu_bh_schedule(QEMUBH *bh)
        return;
    bh->scheduled = 1;
    bh->idle = 0;
-    /* stop the currently executing CPU to execute the BH ASAP */
-    qemu_notify_event();
+    aio_notify(bh->ctx);
 }

 void qemu_bh_cancel(QEMUBH *bh)
@ -120,23 +117,101 @@ void qemu_bh_delete(QEMUBH *bh)
    bh->deleted = 1;
 }

-void qemu_bh_update_timeout(uint32_t *timeout)
+static gboolean
+aio_ctx_prepare(GSource *source, gint    *timeout)
 {
+    AioContext *ctx = (AioContext *) source;
    QEMUBH *bh;

-    for (bh = first_bh; bh; bh = bh->next) {
+    for (bh = ctx->first_bh; bh; bh = bh->next) {
        if (!bh->deleted && bh->scheduled) {
            if (bh->idle) {
                /* idle bottom halves will be polled at least
                 * every 10ms */
-                *timeout = MIN(10, *timeout);
+                *timeout = 10;
            } else {
                /* non-idle bottom halves will be executed
                 * immediately */
                *timeout = 0;
-                break;
+                return true;
            }
        }
    }
+
+    return false;
 }

+/*
+ * GSource check callback: report the source as ready when any bottom half
+ * is scheduled and not deleted; otherwise return what aio_pending() says.
+ */
+static gboolean
+aio_ctx_check(GSource *source)
+{
+    AioContext *ctx = (AioContext *) source;
+    QEMUBH *bh = ctx->first_bh;
+
+    while (bh) {
+        if (bh->scheduled && !bh->deleted) {
+            return true;
+        }
+        bh = bh->next;
+    }
+
+    return aio_pending(ctx);
+}
+
+/*
+ * GSource dispatch callback: run one non-blocking aio_poll() pass on the
+ * context.  No user callback is expected to be attached to this source.
+ * Always returns true.
+ */
+static gboolean
+aio_ctx_dispatch(GSource     *source,
+                 GSourceFunc  callback,
+                 gpointer     user_data)
+{
+    assert(callback == NULL);
+
+    aio_poll((AioContext *) source, false);
+    return true;
+}
+
+/*
+ * GSource finalize callback: hand the context's event notifier to
+ * aio_set_event_notifier() with NULL handlers (presumably unregistering
+ * it -- confirm against aio_set_event_notifier), then clean the notifier up.
+ */
+static void
+aio_ctx_finalize(GSource     *source)
+{
+    AioContext *ctx = (AioContext *) source;
+
+    aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL);
+    event_notifier_cleanup(&ctx->notifier);
+}
+
+/* GSource callbacks that let an AioContext be driven by a GLib main loop. */
+static GSourceFuncs aio_source_funcs = {
+    .prepare  = aio_ctx_prepare,
+    .check    = aio_ctx_check,
+    .dispatch = aio_ctx_dispatch,
+    .finalize = aio_ctx_finalize,
+};
+
+/*
+ * Return the GSource embedded in ctx after taking a new reference on it.
+ */
+GSource *aio_get_g_source(AioContext *ctx)
+{
+    GSource *source = &ctx->source;
+
+    g_source_ref(source);
+    return source;
+}
+
+/* Signal the context's event notifier. */
+void aio_notify(AioContext *ctx)
+{
+    event_notifier_set(&ctx->notifier);
+}
+
+/*
+ * Allocate a new AioContext as a GSource using aio_source_funcs, initialise
+ * its event notifier and register that notifier with the context, using
+ * event_notifier_test_and_clear as the handler.
+ */
+AioContext *aio_context_new(void)
+{
+    GSource *source = g_source_new(&aio_source_funcs, sizeof(AioContext));
+    AioContext *ctx = (AioContext *) source;
+
+    event_notifier_init(&ctx->notifier, false);
+    aio_set_event_notifier(ctx, &ctx->notifier,
+                           (EventNotifierHandler *)
+                           event_notifier_test_and_clear, NULL);
+
+    return ctx;
+}
+
+/* Take a reference on the context by referencing its embedded GSource. */
+void aio_context_ref(AioContext *ctx)
+{
+    g_source_ref(&ctx->source);
+}
+
+/* Drop a reference on the context by unreferencing its embedded GSource. */
+void aio_context_unref(AioContext *ctx)
+{
+    g_source_unref(&ctx->source);
+}
--- a/audio/Makefile.objs
+++ b/audio/Makefile.objs
@ -12,3 +12,6 @@ common-obj-$(CONFIG_WINWAVE) += winwaveaudio.o
 common-obj-$(CONFIG_AUDIO_PT_INT) += audio_pt_int.o
 common-obj-$(CONFIG_AUDIO_WIN_INT) += audio_win_int.o
 common-obj-y += wavcapture.o
+
+$(obj)/audio.o $(obj)/fmodaudio.o: QEMU_CFLAGS += $(FMOD_CFLAGS)
+$(obj)/sdlaudio.o: QEMU_CFLAGS += $(SDL_CFLAGS)
--- a/audio/alsaaudio.c
+++ b/audio/alsaaudio.c
@ -23,7 +23,7 @@
 */
 #include <alsa/asoundlib.h>
 #include "qemu-common.h"
-#include "qemu-char.h"
+#include "qemu/main-loop.h"
 #include "audio.h"

 #if QEMU_GNUC_PREREQ(4, 3)
--- a/audio/audio.c
+++ b/audio/audio.c
@ -23,9 +23,9 @@
 */
 #include "hw/hw.h"
 #include "audio.h"
-#include "monitor.h"
-#include "qemu-timer.h"
-#include "sysemu.h"
+#include "monitor/monitor.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"

 #define AUDIO_CAP "audio"
 #include "audio_int.h"
--- a/audio/audio.h
+++ b/audio/audio.h
@ -25,7 +25,7 @@
 #define QEMU_AUDIO_H

 #include "config-host.h"
-#include "qemu-queue.h"
+#include "qemu/queue.h"

 typedef void (*audio_callback_fn) (void *opaque, int avail);

--- a/audio/audio_template.h
+++ b/audio/audio_template.h
@ -410,15 +410,15 @@ SW *glue (AUD_open_, TYPE) (
    SW *old_sw = NULL;
 #endif

-    ldebug ("open %s, freq %d, nchannels %d, fmt %d\n",
-            name, as->freq, as->nchannels, as->fmt);
-
    if (audio_bug (AUDIO_FUNC, !card || !name || !callback_fn || !as)) {
        dolog ("card=%p name=%p callback_fn=%p as=%p\n",
               card, name, callback_fn, as);
        goto fail;
    }

+    ldebug ("open %s, freq %d, nchannels %d, fmt %d\n",
+            name, as->freq, as->nchannels, as->fmt);
+
    if (audio_bug (AUDIO_FUNC, audio_validate_settings (as))) {
        audio_print_settings (as);
        goto fail;
--- a/audio/noaudio.c
+++ b/audio/noaudio.c
@ -23,7 +23,7 @@
 */
 #include "qemu-common.h"
 #include "audio.h"
-#include "qemu-timer.h"
+#include "qemu/timer.h"

 #define AUDIO_CAP "noaudio"
 #include "audio_int.h"
--- a/audio/ossaudio.c
+++ b/audio/ossaudio.c
@ -31,8 +31,8 @@
 #include <sys/soundcard.h>
 #endif
 #include "qemu-common.h"
-#include "host-utils.h"
-#include "qemu-char.h"
+#include "qemu/main-loop.h"
+#include "qemu/host-utils.h"
 #include "audio.h"

 #define AUDIO_CAP "oss"
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@ -18,7 +18,7 @@
 */

 #include "hw/hw.h"
-#include "qemu-timer.h"
+#include "qemu/timer.h"
 #include "ui/qemu-spice.h"

 #define AUDIO_CAP "spice"
--- a/audio/wavaudio.c
+++ b/audio/wavaudio.c
@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include "hw/hw.h"
-#include "qemu-timer.h"
+#include "qemu/timer.h"
 #include "audio.h"

 #define AUDIO_CAP "wav"
--- a/audio/wavcapture.c
+++ b/audio/wavcapture.c
@ -1,5 +1,5 @@
 #include "hw/hw.h"
-#include "monitor.h"
+#include "monitor/monitor.h"
 #include "audio.h"

 typedef struct {
--- a/audio/winwaveaudio.c
+++ b/audio/winwaveaudio.c
@ -1,7 +1,7 @@
 /* public domain */

 #include "qemu-common.h"
-#include "sysemu.h"
+#include "sysemu/sysemu.h"
 #include "audio.h"

 #define AUDIO_CAP "winwave"
@ -349,21 +349,15 @@ static int winwave_ctl_out (HWVoiceOut *hw, int cmd, ...)
            else {
                hw->poll_mode = 0;
            }
-            if (wave->paused) {
-                mr = waveOutRestart (wave->hwo);
-                if (mr != MMSYSERR_NOERROR) {
-                    winwave_logerr (mr, "waveOutRestart");
-                }
-                wave->paused = 0;
-            }
+            wave->paused = 0;
        }
        return 0;

    case VOICE_DISABLE:
        if (!wave->paused) {
-            mr = waveOutPause (wave->hwo);
+            mr = waveOutReset (wave->hwo);
            if (mr != MMSYSERR_NOERROR) {
-                winwave_logerr (mr, "waveOutPause");
+                winwave_logerr (mr, "waveOutReset");
            }
            else {
                wave->paused = 1;
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@ -0,0 +1,2 @@
+common-obj-y += rng.o rng-egd.o
+common-obj-$(CONFIG_POSIX) += rng-random.o
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@ -0,0 +1,224 @@
+/*
+ * QEMU Random Number Generator Backend
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/rng.h"
+#include "char/char.h"
+#include "qapi/qmp/qerror.h"
+#include "hw/qdev.h" /* just for DEFINE_PROP_CHR */
+
+#define TYPE_RNG_EGD "rng-egd"
+#define RNG_EGD(obj) OBJECT_CHECK(RngEgd, (obj), TYPE_RNG_EGD)
+
+/* Per-instance state of the "rng-egd" backend. */
+typedef struct RngEgd
+{
+    RngBackend parent;
+
+    CharDriverState *chr;   /* looked up from chr_name in rng_egd_opened() */
+    char *chr_name;         /* copy of the "chardev" property value */
+
+    GSList *requests;       /* pending RngRequest entries, oldest first */
+} RngEgd;
+
+/* One outstanding entropy request queued on RngEgd.requests. */
+typedef struct RngRequest
+{
+    EntropyReceiveFunc *receive_entropy; /* called once data is full */
+    uint8_t *data;      /* buffer of 'size' bytes, allocated per request */
+    void *opaque;       /* handed back to receive_entropy */
+    size_t offset;      /* number of bytes of data filled so far */
+    size_t size;        /* total number of bytes requested */
+} RngRequest;
+
+/*
+ * Queue a request for 'size' bytes of entropy.
+ *
+ * One two-byte command is written to the character device for every chunk
+ * of at most 255 bytes; the request is then appended to the pending list so
+ * that rng_egd_chr_read() can fill it as data arrives.
+ */
+static void rng_egd_request_entropy(RngBackend *b, size_t size,
+                                    EntropyReceiveFunc *receive_entropy,
+                                    void *opaque)
+{
+    RngEgd *s = RNG_EGD(b);
+    RngRequest *req = g_malloc(sizeof(*req));
+    size_t remaining = size;
+
+    req->receive_entropy = receive_entropy;
+    req->opaque = opaque;
+    req->size = size;
+    req->offset = 0;
+    req->data = g_malloc(req->size);
+
+    while (remaining > 0) {
+        uint8_t chunk = MIN(remaining, 255);
+        /* synchronous entropy request */
+        uint8_t header[2] = { 0x02, chunk };
+
+        qemu_chr_fe_write(s->chr, header, sizeof(header));
+
+        remaining -= chunk;
+    }
+
+    s->requests = g_slist_append(s->requests, req);
+}
+
+/* Release a request together with its data buffer. */
+static void rng_egd_free_request(RngRequest *req)
+{
+    g_free(req->data);
+    g_free(req);
+}
+
+/*
+ * Character-device "can read" callback: return the number of bytes still
+ * missing across all pending requests.
+ */
+static int rng_egd_chr_can_read(void *opaque)
+{
+    RngEgd *s = RNG_EGD(opaque);
+    GSList *node = s->requests;
+    int wanted = 0;
+
+    while (node) {
+        RngRequest *req = node->data;
+
+        wanted += req->size - req->offset;
+        node = node->next;
+    }
+
+    return wanted;
+}
+
+/*
+ * Character-device read callback: distribute 'size' bytes from 'buf' over
+ * the pending requests, oldest first.
+ *
+ * Each request that becomes complete is removed from the list, reported
+ * through its receive_entropy callback and then freed.  Bytes left over
+ * once no request is pending are dropped.
+ */
+static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size)
+{
+    RngEgd *s = RNG_EGD(opaque);
+
+    while (size > 0 && s->requests) {
+        RngRequest *req = s->requests->data;
+        int len = MIN(size, req->size - req->offset);
+
+        memcpy(req->data + req->offset, buf, len);
+        /* Advance the source too, so that the next request is not handed
+         * the bytes that were just consumed. */
+        buf += len;
+        req->offset += len;
+        size -= len;
+
+        if (req->offset == req->size) {
+            /* g_slist_delete_link() also frees the list node, which
+             * g_slist_remove_link() would leave allocated. */
+            s->requests = g_slist_delete_link(s->requests, s->requests);
+
+            req->receive_entropy(req->opaque, req->data, req->size);
+
+            rng_egd_free_request(req);
+        }
+    }
+}
+
+/* Free every pending request, then the list itself, leaving it empty. */
+static void rng_egd_free_requests(RngEgd *s)
+{
+    GSList *node = s->requests;
+
+    while (node) {
+        rng_egd_free_request(node->data);
+        node = node->next;
+    }
+
+    g_slist_free(s->requests);
+    s->requests = NULL;
+}
+
+/* cancel_requests hook: discard all pending requests without completing them. */
+static void rng_egd_cancel_requests(RngBackend *b)
+{
+    RngEgd *s = RNG_EGD(b);
+
+    /* We simply delete the list of pending requests.  If there is data in the
+     * queue waiting to be read, this is okay, because there will always be
+     * more data than we requested originally
+     */
+    rng_egd_free_requests(s);
+}
+
+/*
+ * "opened" hook: resolve the configured character device by name and
+ * install the read handlers on it.
+ *
+ * Sets an error in errp when no chardev name was configured or when no
+ * character device with that name exists.
+ */
+static void rng_egd_opened(RngBackend *b, Error **errp)
+{
+    RngEgd *s = RNG_EGD(b);
+
+    if (!s->chr_name) {
+        error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+                  "chardev", "a valid character device");
+        return;
+    }
+
+    s->chr = qemu_chr_find(s->chr_name);
+    if (!s->chr) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, s->chr_name);
+        return;
+    }
+
+    /* FIXME we should resubmit pending requests when the CDS reconnects. */
+    qemu_chr_add_handlers(s->chr, rng_egd_chr_can_read, rng_egd_chr_read,
+                          NULL, s);
+}
+
+/*
+ * Setter for the "chardev" property: store a copy of the name.  Refused
+ * with a permission error once the backend has been opened.
+ */
+static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
+{
+    RngBackend *b = RNG_BACKEND(obj);
+    RngEgd *s = RNG_EGD(b);
+
+    if (b->opened) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    g_free(s->chr_name);
+    s->chr_name = g_strdup(value);
+}
+
+/*
+ * Getter for the "chardev" property: return a newly allocated copy of the
+ * resolved character device's label, or NULL when no device has been
+ * resolved yet or it has no label.
+ */
+static char *rng_egd_get_chardev(Object *obj, Error **errp)
+{
+    RngEgd *s = RNG_EGD(obj);
+
+    if (!s->chr || !s->chr->label) {
+        return NULL;
+    }
+
+    return g_strdup(s->chr->label);
+}
+
+/* Instance initialiser: register the string property "chardev". */
+static void rng_egd_init(Object *obj)
+{
+    object_property_add_str(obj, "chardev",
+                            rng_egd_get_chardev, rng_egd_set_chardev,
+                            NULL);
+}
+
+/*
+ * Instance finaliser: clear the handlers on the character device (if one
+ * was resolved), then free the pending requests and the stored name.
+ */
+static void rng_egd_finalize(Object *obj)
+{
+    RngEgd *s = RNG_EGD(obj);
+
+    if (s->chr != NULL) {
+        qemu_chr_add_handlers(s->chr, NULL, NULL, NULL, NULL);
+    }
+
+    rng_egd_free_requests(s);
+    g_free(s->chr_name);
+}
+
+/* Class initialiser: plug the EGD implementations into the RngBackendClass hooks. */
+static void rng_egd_class_init(ObjectClass *klass, void *data)
+{
+    RngBackendClass *rbc = RNG_BACKEND_CLASS(klass);
+
+    rbc->request_entropy = rng_egd_request_entropy;
+    rbc->cancel_requests = rng_egd_cancel_requests;
+    rbc->opened = rng_egd_opened;
+}
+
+/* Type description of "rng-egd", derived from TYPE_RNG_BACKEND. */
+static TypeInfo rng_egd_info = {
+    .name = TYPE_RNG_EGD,
+    .parent = TYPE_RNG_BACKEND,
+    .instance_size = sizeof(RngEgd),
+    .class_init = rng_egd_class_init,
+    .instance_init = rng_egd_init,
+    .instance_finalize = rng_egd_finalize,
+};
+
+/* Register the "rng-egd" type; hooked up through type_init() below. */
+static void register_types(void)
+{
+    type_register_static(&rng_egd_info);
+}
+
+type_init(register_types);
--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@ -0,0 +1,161 @@
+/*
+ * QEMU Random Number Generator Backend
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <errno.h>
+
+#include "qemu/rng-random.h"
+#include "qemu/rng.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/main-loop.h"
+
+/* Per-instance state of the file-backed random number backend. */
+struct RndRandom
+{
+    RngBackend parent;
+
+    int fd;             /* descriptor opened on 'filename' in rng_random_opened() */
+    char *filename;     /* value of the "filename" property */
+
+    EntropyReceiveFunc *receive_func;   /* callback of the outstanding request, or NULL */
+    void *opaque;       /* handed back to receive_func */
+    size_t size;        /* number of bytes asked for by the outstanding request */
+};
+
+/**
+ * A simple and incomplete backend to request entropy from /dev/random.
+ *
+ * This backend exposes an additional "filename" property that can be used to
+ * set the filename to use to open the backend.
+ */
+
+/*
+ * fd read handler: read up to s->size bytes from the backend file and pass
+ * whatever was read to the outstanding request's callback, then clear the
+ * request and remove the handler.
+ *
+ * The descriptor is opened with O_NONBLOCK, so read() may fail with EAGAIN
+ * (or be interrupted with EINTR) without this being a real error; in that
+ * case the handler stays installed and the read is retried on the next
+ * readiness notification instead of tripping the assertion.
+ *
+ * NOTE(review): 'buffer' is a variable-length array sized by the requested
+ * byte count -- confirm that callers bound the request size.
+ */
+static void entropy_available(void *opaque)
+{
+    RndRandom *s = RNG_RANDOM(opaque);
+    uint8_t buffer[s->size];
+    ssize_t len;
+
+    len = read(s->fd, buffer, s->size);
+    if (len == -1 && (errno == EAGAIN || errno == EINTR)) {
+        return;
+    }
+    g_assert(len != -1);
+
+    s->receive_func(s->opaque, buffer, len);
+    s->receive_func = NULL;
+
+    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+}
+
+/*
+ * Start a new entropy request for 'size' bytes.
+ *
+ * A request that is still outstanding is completed first with a NULL buffer
+ * and a length of 0.  The new callback is then recorded and a read handler
+ * is installed on the backend's file descriptor.
+ */
+static void rng_random_request_entropy(RngBackend *b, size_t size,
+                                        EntropyReceiveFunc *receive_entropy,
+                                        void *opaque)
+{
+    RndRandom *s = RNG_RANDOM(b);
+    EntropyReceiveFunc *previous = s->receive_func;
+
+    if (previous) {
+        previous(s->opaque, NULL, 0);
+    }
+
+    s->size = size;
+    s->opaque = opaque;
+    s->receive_func = receive_entropy;
+
+    qemu_set_fd_handler(s->fd, entropy_available, NULL, s);
+}
+
+/*
+ * "opened" hook: open the configured file read-only and non-blocking.
+ *
+ * Sets an error in errp when no filename is configured or when the file
+ * cannot be opened.
+ */
+static void rng_random_opened(RngBackend *b, Error **errp)
+{
+    RndRandom *s = RNG_RANDOM(b);
+
+    if (!s->filename) {
+        error_set(errp, QERR_INVALID_PARAMETER_VALUE,
+                  "filename", "a valid filename");
+        return;
+    }
+
+    s->fd = open(s->filename, O_RDONLY | O_NONBLOCK);
+    if (s->fd == -1) {
+        error_set(errp, QERR_OPEN_FILE_FAILED, s->filename);
+    }
+}
+
+/*
+ * Getter for the "filename" property: return a newly allocated copy of the
+ * configured filename, or NULL when none is set.
+ */
+static char *rng_random_get_filename(Object *obj, Error **errp)
+{
+    RndRandom *s = RNG_RANDOM(obj);
+
+    return s->filename ? g_strdup(s->filename) : NULL;
+}
+
+/*
+ * Setter for the "filename" property: replace the stored filename with a
+ * copy of the new one.  Refused with a permission error once the backend
+ * has been opened.
+ */
+static void rng_random_set_filename(Object *obj, const char *filename,
+                                 Error **errp)
+{
+    RndRandom *s = RNG_RANDOM(obj);
+
+    if (RNG_BACKEND(obj)->opened) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    /* g_free() accepts NULL, so no guard is needed. */
+    g_free(s->filename);
+    s->filename = g_strdup(filename);
+}
+
+/*
+ * Instance initialiser: register the "filename" property, default it to
+ * "/dev/random" and mark the descriptor as not open.
+ */
+static void rng_random_init(Object *obj)
+{
+    RndRandom *s = RNG_RANDOM(obj);
+
+    object_property_add_str(obj, "filename",
+                            rng_random_get_filename,
+                            rng_random_set_filename,
+                            NULL);
+
+    s->filename = g_strdup("/dev/random");
+    /* The finaliser treats -1 as "nothing to close"; without this a backend
+     * that was never opened would keep whatever value fd started with. */
+    s->fd = -1;
+}
+
+/*
+ * Instance finaliser: when a descriptor is open, remove its fd handler and
+ * close it; then free the stored filename.
+ */
+static void rng_random_finalize(Object *obj)
+{
+    RndRandom *s = RNG_RANDOM(obj);
+
+    if (s->fd != -1) {
+        /* Only touch the handler table for a real descriptor. */
+        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+        close(s->fd);
+    }
+
+    g_free(s->filename);
+}
+
+/* Class initialiser: install the request_entropy and opened hooks. */
+static void rng_random_class_init(ObjectClass *klass, void *data)
+{
+    RngBackendClass *rbc = RNG_BACKEND_CLASS(klass);
+
+    rbc->request_entropy = rng_random_request_entropy;
+    rbc->opened = rng_random_opened;
+}
+
+/* Type description of TYPE_RNG_RANDOM, derived from TYPE_RNG_BACKEND. */
+static TypeInfo rng_random_info = {
+    .name = TYPE_RNG_RANDOM,
+    .parent = TYPE_RNG_BACKEND,
+    .instance_size = sizeof(RndRandom),
+    .class_init = rng_random_class_init,
+    .instance_init = rng_random_init,
+    .instance_finalize = rng_random_finalize,
+};
+
+/* Register the TYPE_RNG_RANDOM type; hooked up through type_init() below. */
+static void register_types(void)
+{
+    type_register_static(&rng_random_info);
+}
+
+type_init(register_types);
--- a/backends/rng.c
+++ b/backends/rng.c
@ -0,0 +1,93 @@
+/*
+ * QEMU Random Number Generator Backend
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/rng.h"
+#include "qapi/qmp/qerror.h"
+
+/*
+ * Forward an entropy request to the backend class's request_entropy hook.
+ * Does nothing when the class does not provide one.
+ */
+void rng_backend_request_entropy(RngBackend *s, size_t size,
+                                 EntropyReceiveFunc *receive_entropy,
+                                 void *opaque)
+{
+    RngBackendClass *klass = RNG_BACKEND_GET_CLASS(s);
+
+    if (!klass->request_entropy) {
+        return;
+    }
+
+    klass->request_entropy(s, size, receive_entropy, opaque);
+}
+
+/*
+ * Forward to the backend class's cancel_requests hook.  Does nothing when
+ * the class does not provide one.
+ */
+void rng_backend_cancel_requests(RngBackend *s)
+{
+    RngBackendClass *klass = RNG_BACKEND_GET_CLASS(s);
+
+    if (!klass->cancel_requests) {
+        return;
+    }
+
+    klass->cancel_requests(s);
+}
+
+/* Getter for the "opened" property: return the backend's opened flag. */
+static bool rng_backend_prop_get_opened(Object *obj, Error **errp)
+{
+    RngBackend *s = RNG_BACKEND(obj);
+
+    return s->opened;
+}
+
+/* Open the backend by setting its "opened" property to true. */
+void rng_backend_open(RngBackend *s, Error **errp)
+{
+    object_property_set_bool(OBJECT(s), true, "opened", errp);
+}
+
+/*
+ * Setter for the "opened" property.
+ *
+ * Setting the current value again is a no-op.  Going from opened back to
+ * not opened is refused with a permission error.  Otherwise the class's
+ * "opened" hook (if any) is run, and the flag is updated only when that
+ * hook did not report an error.
+ */
+static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp)
+{
+    RngBackend *s = RNG_BACKEND(obj);
+    RngBackendClass *k = RNG_BACKEND_GET_CLASS(s);
+    Error *local_err = NULL;
+
+    if (value == s->opened) {
+        return;
+    }
+
+    if (!value && s->opened) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    if (k->opened) {
+        /* Collect the error locally: checking errp directly cannot detect
+         * a failure when the caller passed errp == NULL. */
+        k->opened(s, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    s->opened = value;
+}
+
+/* Instance initialiser: register the boolean property "opened". */
+static void rng_backend_init(Object *obj)
+{
+    object_property_add_bool(obj, "opened",
+                             rng_backend_prop_get_opened,
+                             rng_backend_prop_set_opened,
+                             NULL);
+}
+
+/* Type description of the abstract base type TYPE_RNG_BACKEND. */
+static TypeInfo rng_backend_info = {
+    .name = TYPE_RNG_BACKEND,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(RngBackend),
+    .instance_init = rng_backend_init,
+    .class_size = sizeof(RngBackendClass),
+    .abstract = true,
+};
+
+/* Register the TYPE_RNG_BACKEND type; hooked up through type_init() below. */
+static void register_types(void)
+{
+    type_register_static(&rng_backend_info);
+}
+
+type_init(register_types);
--- a/balloon.c
+++ b/balloon.c
@ -24,13 +24,13 @@
 * THE SOFTWARE.
 */

-#include "monitor.h"
-#include "cpu-common.h"
-#include "kvm.h"
-#include "balloon.h"
+#include "monitor/monitor.h"
+#include "exec/cpu-common.h"
+#include "sysemu/kvm.h"
+#include "sysemu/balloon.h"
 #include "trace.h"
 #include "qmp-commands.h"
-#include "qjson.h"
+#include "qapi/qmp/qjson.h"

 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
--- a/bitmap.c
+++ b/bitmap.c
@ -9,8 +9,8 @@
 * Version 2.
 */

-#include "bitops.h"
-#include "bitmap.h"
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"

 /*
 * bitmaps provide an array of bits, implemented using an an
--- a/bitops.c
+++ b/bitops.c
@ -11,7 +11,7 @@
 * 2 of the License, or (at your option) any later version.
 */

-#include "bitops.h"
+#include "qemu/bitops.h"

 #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)

--- a/block-migration.c
+++ b/block-migration.c
@ -14,13 +14,13 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "hw/hw.h"
-#include "qemu-queue.h"
-#include "qemu-timer.h"
-#include "block-migration.h"
-#include "migration.h"
-#include "blockdev.h"
+#include "qemu/queue.h"
+#include "qemu/timer.h"
+#include "migration/block.h"
+#include "migration/migration.h"
+#include "sysemu/blockdev.h"
 #include <assert.h>

 #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@ -77,9 +77,7 @@ typedef struct BlkMigState {
    int64_t total_sector_sum;
    int prev_progress;
    int bulk_completed;
-    long double total_time;
    long double prev_time_offset;
-    int reads;
 } BlkMigState;

 static BlkMigState block_mig_state;
@ -132,12 +130,6 @@ uint64_t blk_mig_bytes_total(void)
    return sum << BDRV_SECTOR_BITS;
 }

-static inline long double compute_read_bwidth(void)
-{
-    assert(block_mig_state.total_time != 0);
-    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
-}
-
 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
@ -191,8 +183,6 @@ static void blk_mig_read_cb(void *opaque, int ret)

    blk->ret = ret;

-    block_mig_state.reads++;
-    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
    block_mig_state.prev_time_offset = curr_time;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
@ -310,8 +300,6 @@ static void init_blk_migration(QEMUFile *f)
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
-    block_mig_state.total_time = 0;
-    block_mig_state.reads = 0;

    bdrv_iterate(init_blk_migration_it, NULL);
 }
@ -423,20 +411,23 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,

 error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
-    qemu_file_set_error(f, ret);
    g_free(blk->buf);
    g_free(blk);
-    return 0;
+    return ret;
 }

+/* return value:
+ * 0: too much data for max_downtime
+ * 1: few enough data for max_downtime
+*/
 static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
 {
    BlkMigDevState *bmds;
-    int ret = 0;
+    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        if (mig_save_device_dirty(f, bmds, is_async) == 0) {
-            ret = 1;
+        ret = mig_save_device_dirty(f, bmds, is_async);
+        if (ret <= 0) {
            break;
        }
    }
@ -444,9 +435,10 @@ static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
    return ret;
 }

-static void flush_blks(QEMUFile* f)
+static int flush_blks(QEMUFile *f)
 {
    BlkMigBlock *blk;
+    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
@ -457,7 +449,7 @@ static void flush_blks(QEMUFile* f)
            break;
        }
        if (blk->ret < 0) {
-            qemu_file_set_error(f, blk->ret);
+            ret = blk->ret;
            break;
        }
        blk_send(f, blk);
@ -474,6 +466,7 @@ static void flush_blks(QEMUFile* f)
    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
+    return ret;
 }

 static int64_t get_remaining_dirty(void)
@ -488,37 +481,13 @@ static int64_t get_remaining_dirty(void)
    return dirty * BLOCK_SIZE;
 }

-static int is_stage2_completed(void)
-{
-    int64_t remaining_dirty;
-    long double bwidth;
-
-    if (block_mig_state.bulk_completed == 1) {
-
-        remaining_dirty = get_remaining_dirty();
-        if (remaining_dirty == 0) {
-            return 1;
-        }
-
-        bwidth = compute_read_bwidth();
-
-        if ((remaining_dirty / bwidth) <=
-            migrate_max_downtime()) {
-            /* finish stage2 because we think that we can finish remaining work
-               below max_downtime */
-
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
 static void blk_mig_cleanup(void)
 {
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

+    bdrv_drain_all();
+
    set_dirty_tracking(0);

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
@ -553,9 +522,7 @@ static int block_save_setup(QEMUFile *f, void *opaque)
    /* start track dirty blocks */
    set_dirty_tracking(1);

-    flush_blks(f);
-
-    ret = qemu_file_get_error(f);
+    ret = flush_blks(f);
    if (ret) {
        blk_mig_cleanup();
        return ret;
@ -575,9 +542,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

-    flush_blks(f);
-
-    ret = qemu_file_get_error(f);
+    ret = flush_blks(f);
    if (ret) {
        blk_mig_cleanup();
        return ret;
@ -596,16 +561,19 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
                block_mig_state.bulk_completed = 1;
            }
        } else {
-            if (blk_mig_save_dirty_block(f, 1) == 0) {
+            ret = blk_mig_save_dirty_block(f, 1);
+            if (ret != 0) {
                /* no more dirty blocks */
                break;
            }
        }
    }
+    if (ret) {
+        blk_mig_cleanup();
+        return ret;
+    }

-    flush_blks(f);
-
-    ret = qemu_file_get_error(f);
+    ret = flush_blks(f);
    if (ret) {
        blk_mig_cleanup();
        return ret;
@ -613,7 +581,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

-    return is_stage2_completed();
+    return 0;
 }

 static int block_save_complete(QEMUFile *f, void *opaque)
@ -623,9 +591,7 @@ static int block_save_complete(QEMUFile *f, void *opaque)
    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

-    flush_blks(f);
-
-    ret = qemu_file_get_error(f);
+    ret = flush_blks(f);
    if (ret) {
        blk_mig_cleanup();
        return ret;
@ -637,18 +603,16 @@ static int block_save_complete(QEMUFile *f, void *opaque)
       all async read completed */
    assert(block_mig_state.submitted == 0);

-    while (blk_mig_save_dirty_block(f, 0) != 0) {
-        /* Do nothing */
-    }
+    do {
+        ret = blk_mig_save_dirty_block(f, 0);
+    } while (ret == 0);
+
    blk_mig_cleanup();
-
-    /* report completion */
-    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);
-
-    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }
+    /* report completion */
+    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

@ -657,6 +621,14 @@ static int block_save_complete(QEMUFile *f, void *opaque)
    return 0;
 }

+/*
+ * save_live_pending handler for block migration.
+ *
+ * Returns the number of bytes reported by get_remaining_dirty().  The f,
+ * opaque and max_size arguments are not consulted.
+ */
+static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    /* Query once so the traced value and the returned value always agree */
+    int64_t pending = get_remaining_dirty();
+
+    /* int64_t is not 'long' on every host; print it through long long */
+    DPRINTF("Enter save live pending  %lld\n", (long long)pending);
+
+    return pending;
+}
+
 static int block_load(QEMUFile *f, void *opaque, int version_id)
 {
    static int banner_printed;
@ -753,6 +725,7 @@ SaveVMHandlers savevm_block_handlers = {
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
+    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
--- a/block.c
+++ b/block.c
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@ -2,10 +2,21 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
-block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-obj-y += stream.o
-block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+
+ifeq ($(CONFIG_POSIX),y)
+block-obj-y += nbd.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+endif
+
+common-obj-y += stream.o
+common-obj-y += commit.o
+common-obj-y += mirror.o
+
+$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@ -23,13 +23,17 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/config-file.h"
+#include "block/block_int.h"
+#include "qemu/module.h"

 typedef struct BDRVBlkdebugState {
    int state;
+    int new_state;
+
    QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
    QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
+    QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs;
 } BDRVBlkdebugState;

 typedef struct BlkdebugAIOCB {
@ -38,9 +42,15 @@ typedef struct BlkdebugAIOCB {
    int ret;
 } BlkdebugAIOCB;

+/* A request that is parked in suspend_request() until resumed by its tag */
+typedef struct BlkdebugSuspendedReq {
+    Coroutine *co;  /* coroutine that yielded; re-entered on resume */
+    char *tag;      /* g_strdup()ed copy of the suspend rule's tag */
+    QLIST_ENTRY(BlkdebugSuspendedReq) next; /* link in suspended_reqs */
+} BlkdebugSuspendedReq;
+
 static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb);

-static AIOPool blkdebug_aio_pool = {
+static const AIOCBInfo blkdebug_aiocb_info = {
    .aiocb_size = sizeof(BlkdebugAIOCB),
    .cancel     = blkdebug_aio_cancel,
 };
@ -48,6 +58,7 @@ static AIOPool blkdebug_aio_pool = {
 enum {
    ACTION_INJECT_ERROR,
    ACTION_SET_STATE,
+    ACTION_SUSPEND,
 };

 typedef struct BlkdebugRule {
@ -64,6 +75,9 @@ typedef struct BlkdebugRule {
        struct {
            int new_state;
        } set_state;
+        struct {
+            char *tag;
+        } suspend;
    } options;
    QLIST_ENTRY(BlkdebugRule) next;
    QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
@ -225,6 +239,11 @@ static int add_rule(QemuOpts *opts, void *opaque)
        rule->options.set_state.new_state =
            qemu_opt_get_number(opts, "new_state", 0);
        break;
+
+    case ACTION_SUSPEND:
+        rule->options.suspend.tag =
+            g_strdup(qemu_opt_get(opts, "tag"));
+        break;
    };

    /* Add the rule */
@ -233,12 +252,32 @@ static int add_rule(QemuOpts *opts, void *opaque)
    return 0;
 }

+/*
+ * Unlink a rule from the event list it is on and free it together with
+ * any action-specific heap data it owns.
+ */
+static void remove_rule(BlkdebugRule *rule)
+{
+    /* Only suspend rules carry an allocation besides the rule itself */
+    if (rule->action == ACTION_SUSPEND) {
+        g_free(rule->options.suspend.tag);
+    }
+
+    QLIST_REMOVE(rule, next);
+    g_free(rule);
+}
+
 static int read_config(BDRVBlkdebugState *s, const char *filename)
 {
    FILE *f;
    int ret;
    struct add_rule_data d;

+    /* Allow usage without config file */
+    if (!*filename) {
+        return 0;
+    }
+
    f = fopen(filename, "r");
    if (f == NULL) {
        return -errno;
@ -334,7 +373,7 @@ static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
        return NULL;
    }

-    acb = qemu_aio_get(&blkdebug_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque);
    acb->ret = -error;

    bh = qemu_bh_new(error_callback_bh, acb);
@ -388,6 +427,7 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }

+
 static void blkdebug_close(BlockDriverState *bs)
 {
    BDRVBlkdebugState *s = bs->opaque;
@ -396,19 +436,39 @@ static void blkdebug_close(BlockDriverState *bs)

    for (i = 0; i < BLKDBG_EVENT_MAX; i++) {
        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) {
-            QLIST_REMOVE(rule, next);
-            g_free(rule);
+            remove_rule(rule);
        }
    }
 }

+/*
+ * Park the current coroutine until blkdebug_debug_resume() re-enters it.
+ *
+ * The rule that triggered the suspension is consumed (removed and freed);
+ * its tag is copied first so the request can still be found by name.
+ */
+static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    /* Lives on this coroutine's stack for the whole time it is listed */
+    BlkdebugSuspendedReq req = {
+        .co  = qemu_coroutine_self(),
+        .tag = g_strdup(rule->options.suspend.tag),
+    };
+
+    remove_rule(rule);
+    QLIST_INSERT_HEAD(&s->suspended_reqs, &req, next);
+
+    printf("blkdebug: Suspended request '%s'\n", req.tag);
+    qemu_coroutine_yield();
+    printf("blkdebug: Resuming request '%s'\n", req.tag);
+
+    /* Unlink before returning: req goes out of scope with this frame */
+    QLIST_REMOVE(&req, next);
+    g_free(req.tag);
+}
+
 static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
-    int old_state, bool injected)
+    bool injected)
 {
    BDRVBlkdebugState *s = bs->opaque;

    /* Only process rules for the current state */
-    if (rule->state && rule->state != old_state) {
+    if (rule->state && rule->state != s->state) {
        return injected;
    }

@ -423,7 +483,11 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
        break;

    case ACTION_SET_STATE:
-        s->state = rule->options.set_state.new_state;
+        s->new_state = rule->options.set_state.new_state;
+        break;
+
+    case ACTION_SUSPEND:
+        suspend_request(bs, rule);
        break;
    }
    return injected;
@ -432,16 +496,70 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
 static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
 {
    BDRVBlkdebugState *s = bs->opaque;
-    struct BlkdebugRule *rule;
-    int old_state = s->state;
+    struct BlkdebugRule *rule, *next;
    bool injected;

    assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);

    injected = false;
-    QLIST_FOREACH(rule, &s->rules[event], next) {
-        injected = process_rule(bs, rule, old_state, injected);
+    s->new_state = s->state;
+    QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) {
+        injected = process_rule(bs, rule, injected);
    }
+    s->state = s->new_state;
+}
+
+/*
+ * Install a one-shot suspend rule ("breakpoint") for the named event.
+ *
+ * Returns 0 on success, or -ENOENT when 'event' is not a known event name.
+ * The tag is copied; the rule applies in every state (state == 0).
+ */
+static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event,
+                                     const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    struct BlkdebugRule *bp;
+    BlkDebugEvent ev;
+
+    if (get_event_by_name(event, &ev) < 0) {
+        return -ENOENT;
+    }
+
+    /* Zero-filled allocation, then set the fields that matter */
+    bp = g_malloc0(sizeof(*bp));
+    bp->event = ev;
+    bp->action = ACTION_SUSPEND;
+    bp->state = 0;
+    bp->options.suspend.tag = g_strdup(tag);
+
+    QLIST_INSERT_HEAD(&s->rules[ev], bp, next);
+
+    return 0;
+}
+
+/*
+ * Resume the first suspended request whose tag equals 'tag'.
+ *
+ * Returns 0 after re-entering the request's coroutine, or -ENOENT when no
+ * suspended request carries that tag.
+ */
+static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *req;
+
+    QLIST_FOREACH(req, &s->suspended_reqs, next) {
+        if (strcmp(req->tag, tag) != 0) {
+            continue;
+        }
+        qemu_coroutine_enter(req->co, NULL);
+        return 0;
+    }
+    return -ENOENT;
+}
+
+
+/* Report whether any currently suspended request carries the given tag */
+static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag)
+{
+    BDRVBlkdebugState *s = bs->opaque;
+    BlkdebugSuspendedReq *req;
+    bool found = false;
+
+    QLIST_FOREACH(req, &s->suspended_reqs, next) {
+        if (strcmp(req->tag, tag) == 0) {
+            found = true;
+            break;
+        }
+    }
+    return found;
 }

 static int64_t blkdebug_getlength(BlockDriverState *bs)
@ -462,7 +580,10 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_aio_readv     = blkdebug_aio_readv,
    .bdrv_aio_writev    = blkdebug_aio_writev,

-    .bdrv_debug_event   = blkdebug_debug_event,
+    .bdrv_debug_event           = blkdebug_debug_event,
+    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
+    .bdrv_debug_resume          = blkdebug_debug_resume,
+    .bdrv_debug_is_suspended    = blkdebug_debug_is_suspended,
 };

 static void bdrv_blkdebug_init(void)
--- a/block/blkverify.c
+++ b/block/blkverify.c
@ -8,8 +8,8 @@
 */

 #include <stdarg.h>
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
-#include "block_int.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"

 typedef struct {
    BlockDriverState *test_file;
@ -48,7 +48,7 @@ static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb)
    }
 }

-static AIOPool blkverify_aio_pool = {
+static const AIOCBInfo blkverify_aiocb_info = {
    .aiocb_size         = sizeof(BlkverifyAIOCB),
    .cancel             = blkverify_aio_cancel,
 };
@ -233,7 +233,7 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
                                         BlockDriverCompletionFunc *cb,
                                         void *opaque)
 {
-    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aio_pool, bs, cb, opaque);
+    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque);

    acb->bh = NULL;
    acb->is_write = is_write;
--- a/block/bochs.c
+++ b/block/bochs.c
@ -23,8 +23,8 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"

 /**************************************************************/

--- a/block/cloop.c
+++ b/block/cloop.c
@ -22,8 +22,8 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>

 typedef struct BDRVCloopState {
--- a/block/commit.c
+++ b/block/commit.c
@ -0,0 +1,259 @@
+/*
+ * Live block commit
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Jeff Cody   <jcody@redhat.com>
+ *  Based on stream.c by Stefan Hajnoczi
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     *
+     * commit_run() allocates one buffer of this size and queries at most this
+     * many bytes' worth of sectors per loop iteration.
+     */
+    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+/* State of one live commit job; created and filled in by commit_start() */
+typedef struct CommitBlockJob {
+    BlockJob common;            /* generic job state (len, offset, speed, co) */
+    RateLimit limit;            /* throttle configured by commit_set_speed() */
+    BlockDriverState *active;   /* the 'bs' argument of commit_start() */
+    BlockDriverState *top;      /* image whose data is copied down */
+    BlockDriverState *base;     /* image that receives the data */
+    BlockdevOnError on_error;   /* policy checked when a copy step fails */
+    int base_flags;             /* open flags of base before the job began */
+    int orig_overlay_flags;     /* open flags of top's overlay before the job */
+} CommitBlockJob;
+
+/*
+ * Copy nb_sectors sectors starting at sector_num from bs into base, using
+ * buf as the bounce buffer.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the failing
+ * bdrv_read()/bdrv_write() call.  Nothing is written if the read fails.
+ */
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+                                        BlockDriverState *base,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    int ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+
+    if (ret == 0) {
+        ret = bdrv_write(base, sector_num, buf, nb_sectors);
+    }
+
+    return ret;
+}
+
+/*
+ * Coroutine body of the commit job.
+ *
+ * Grows base to the length of top if needed, then walks the image in
+ * chunks of at most COMMIT_BUFFER_SIZE and copies every range that
+ * bdrv_co_is_allocated_above() reports as allocated down into base.  When
+ * the whole image was processed without cancellation the intermediate
+ * images are dropped.  On every exit path the open flags recorded by
+ * commit_start() are restored and the result (0 or a negative errno) is
+ * reported through block_job_completed().
+ *
+ * opaque is the CommitBlockJob set up by commit_start().
+ */
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    BlockDriverState *active = s->active;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
+    BlockDriverState *overlay_bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n = 0;
+    void *buf;
+    int64_t base_len;
+
+    /* Look the overlay up before anything can fail: the early error exits
+     * below jump straight to the flag-restoring code, which needs it. */
+    overlay_bs = bdrv_find_overlay(active, top);
+
+    ret = s->common.len = bdrv_getlength(top);
+
+
+    if (s->common.len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    ret = base_len = bdrv_getlength(base);
+    if (base_len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    if (base_len < s->common.len) {
+        ret = bdrv_truncate(base, s->common.len);
+        if (ret) {
+            goto exit_restore_reopen;
+        }
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+
+    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
+        bool copy;
+
+wait:
+        /* Note that even when no rate limit is applied we need to yield
+         * with no pending I/O here so that bdrv_drain_all() returns.
+         */
+        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        if (block_job_is_cancelled(&s->common)) {
+            break;
+        }
+        /* Copy if allocated above the base */
+        ret = bdrv_co_is_allocated_above(top, base, sector_num,
+                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                         &n);
+        copy = (ret == 1);
+        trace_commit_one_iteration(s, sector_num, n, ret);
+        if (copy) {
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    /* Over budget: sleep first, then re-query this range */
+                    goto wait;
+                }
+            }
+            ret = commit_populate(top, base, sector_num, n, buf);
+        }
+        if (ret < 0) {
+            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+                s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
+                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
+                goto exit_free_buf;
+            } else {
+                /* Error is ignored by policy: retry the same range */
+                n = 0;
+                continue;
+            }
+        }
+        /* Publish progress */
+        s->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+    ret = 0;
+
+    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base);
+    }
+
+exit_free_buf:
+    qemu_vfree(buf);
+
+exit_restore_reopen:
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    /* overlay_bs is NULL if no overlay was found; never dereference that */
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+
+    block_job_completed(&s->common, ret);
+}
+
+/*
+ * set_speed callback of the commit job.
+ *
+ * speed is given in bytes and converted to sectors for the rate limiter.
+ * A negative speed is rejected through errp and leaves the limiter as is.
+ */
+static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+
+    if (speed >= 0) {
+        ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+        return;
+    }
+
+    error_set(errp, QERR_INVALID_PARAMETER, "speed");
+}
+
+/* Job type descriptor handed to block_job_create() by commit_start() */
+static BlockJobType commit_job_type = {
+    .instance_size = sizeof(CommitBlockJob),
+    .job_type      = "commit",
+    .set_speed     = commit_set_speed,
+};
+
+/*
+ * Validate the arguments, make base and top's overlay writable, and start
+ * a commit job that merges the data of 'top' down into 'base'.
+ *
+ * bs        active layer the job is attached to
+ * base      image that receives the data
+ * top       image whose data is committed; must differ from bs and base
+ * speed     passed through to block_job_create()
+ * on_error  error policy; "stop" and "enospc" require iostatus to be
+ *           enabled on bs
+ * cb/opaque completion callback and its argument, passed to
+ *           block_job_create()
+ * errp      receives the error when the job could not be started
+ *
+ * On success the job coroutine has been created and entered once before
+ * this function returns.
+ */
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+                  BlockDriverState *top, int64_t speed,
+                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    CommitBlockJob *s;
+    BlockReopenQueue *reopen_queue = NULL;
+    int orig_overlay_flags;
+    int orig_base_flags;
+    BlockDriverState *overlay_bs;
+    Error *local_err = NULL;
+
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER_COMBINATION);
+        return;
+    }
+
+    /* Once we support top == active layer, remove this check */
+    if (top == bs) {
+        error_setg(errp,
+                   "Top image as the active layer is currently unsupported");
+        return;
+    }
+
+    if (top == base) {
+        error_setg(errp, "Invalid files for merge: top and base are the same");
+        return;
+    }
+
+    overlay_bs = bdrv_find_overlay(bs, top);
+
+    if (overlay_bs == NULL) {
+        error_setg(errp, "Could not find overlay image for %s:", top->filename);
+        return;
+    }
+
+    /* Remember the current flags so commit_run() can restore them later */
+    orig_base_flags    = bdrv_get_flags(base);
+    orig_overlay_flags = bdrv_get_flags(overlay_bs);
+
+    /* convert base & overlay_bs to r/w, if necessary */
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
+    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+                                         orig_overlay_flags | BDRV_O_RDWR);
+    }
+    /* Both reopens are submitted together in a single queue */
+    if (reopen_queue) {
+        bdrv_reopen_multiple(reopen_queue, &local_err);
+        if (local_err != NULL) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+
+    /* NOTE(review): if job creation fails here, nothing in this function
+     * reverts the r/w reopen done above -- confirm whether that is intended */
+    s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
+    s->base   = base;
+    s->top    = top;
+    s->active = bs;
+
+    s->base_flags          = orig_base_flags;
+    s->orig_overlay_flags  = orig_overlay_flags;
+
+    s->on_error = on_error;
+    s->common.co = qemu_coroutine_create(commit_run);
+
+    trace_commit_start(bs, base, top, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);
+}
--- a/block/cow.c
+++ b/block/cow.c
@ -22,8 +22,8 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"

 /**************************************************************/
 /* COW block driver using file system holes */
--- a/block/curl.c
+++ b/block/curl.c
@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include <curl/curl.h>

 // #define DEBUG
@ -438,7 +438,7 @@ static void curl_aio_cancel(BlockDriverAIOCB *blockacb)
    // Do we have to implement canceling? Seems to work without...
 }

-static AIOPool curl_aio_pool = {
+static const AIOCBInfo curl_aiocb_info = {
    .aiocb_size         = sizeof(CURLAIOCB),
    .cancel             = curl_aio_cancel,
 };
@ -505,7 +505,7 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs,
 {
    CURLAIOCB *acb;

-    acb = qemu_aio_get(&curl_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque);

    acb->qiov = qiov;
    acb->sector_num = sector_num;
@ -542,8 +542,7 @@ static void curl_close(BlockDriverState *bs)
    }
    if (s->multi)
        curl_multi_cleanup(s->multi);
-    if (s->url)
-        free(s->url);
+    g_free(s->url);
 }

 static int64_t curl_getlength(BlockDriverState *bs)
--- a/block/dmg.c
+++ b/block/dmg.c
@ -22,9 +22,9 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "bswap.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/bswap.h"
+#include "qemu/module.h"
 #include <zlib.h>

 typedef struct BDRVDMGState {
--- a/block/gluster.c
+++ b/block/gluster.c
@ -0,0 +1,624 @@
+/*
+ * GlusterFS backend for QEMU
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * Pipe handling mechanism in AIO implementation is derived from
+ * block/rbd.c. Hence,
+ *
+ * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
+ *                         Josh Durgin <josh.durgin@dreamhost.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include <glusterfs/api/glfs.h>
+#include "block/block_int.h"
+#include "qemu/sockets.h"
+#include "qemu/uri.h"
+
+/* Per-request state of one asynchronous gluster I/O */
+typedef struct GlusterAIOCB {
+    BlockDriverAIOCB common;    /* generic ACB (bs, cb, opaque) */
+    int64_t size;               /* compared with ret to detect partial I/O */
+    int ret;                    /* result stored by gluster_finish_aiocb() */
+    bool *finished;             /* if non-NULL, set to true on completion */
+    QEMUBH *bh;                 /* NOTE(review): use not visible in this chunk */
+} GlusterAIOCB;
+
+/* Driver state kept in bs->opaque for an open gluster image */
+typedef struct BDRVGlusterState {
+    struct glfs *glfs;          /* connection from qemu_gluster_init() */
+    int fds[2];                 /* completion pipe, see GLUSTER_FD_* */
+    struct glfs_fd *fd;         /* image handle from glfs_open() */
+    int qemu_aio_count;         /* decremented on completion; the flush
+                                 * callback reports busy while it is > 0 */
+    int event_reader_pos;       /* bytes of event_acb read from the pipe */
+    GlusterAIOCB *event_acb;    /* ACB pointer being read from the pipe */
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ  0
+#define GLUSTER_FD_WRITE 1
+
+/*
+ * Connection parameters parsed from a gluster URI.  The strings are
+ * allocated copies that qemu_gluster_gconf_free() releases.
+ */
+typedef struct GlusterConf {
+    char *server;       /* host, or the socket path for gluster+unix */
+    int port;           /* port from the URI (not set for gluster+unix) */
+    char *volname;      /* first path component */
+    char *image;        /* remainder of the path after the volume name */
+    char *transport;    /* "tcp", "unix" or "rdma" */
+} GlusterConf;
+
+/* Release a GlusterConf together with every string it owns */
+static void qemu_gluster_gconf_free(GlusterConf *gconf)
+{
+    g_free(gconf->transport);
+    g_free(gconf->image);
+    g_free(gconf->volname);
+    g_free(gconf->server);
+    g_free(gconf);
+}
+
+/*
+ * Split a URI path of the form "/volname/image" into gconf->volname and
+ * gconf->image (both newly allocated).
+ *
+ * Leading slashes and the run of slashes between the two parts are
+ * skipped; the image part keeps any further slashes.  Returns 0 on
+ * success or -EINVAL when path is NULL or either part is missing.  On
+ * failure gconf->volname may already have been set.
+ */
+static int parse_volume_options(GlusterConf *gconf, char *path)
+{
+    const char *vol_start, *vol_end, *image;
+    size_t vol_len;
+
+    if (path == NULL) {
+        return -EINVAL;
+    }
+
+    /* volume: everything up to the next slash */
+    vol_start = path + strspn(path, "/");
+    vol_len = strcspn(vol_start, "/");
+    vol_end = vol_start + vol_len;
+    if (vol_end[0] == '\0') {
+        return -EINVAL;
+    }
+    gconf->volname = g_strndup(vol_start, vol_len);
+
+    /* image: whatever follows the separating slashes */
+    image = vol_end + strspn(vol_end, "/");
+    if (image[0] == '\0') {
+        return -EINVAL;
+    }
+    gconf->image = g_strdup(image);
+    return 0;
+}
+
+/*
+ * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'transport' specifies the transport type used to connect to gluster
+ * management daemon (glusterd). Valid transport types are
+ * tcp, unix and rdma. If a transport type isn't specified, then tcp
+ * type is assumed.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname, ipv4 address
+ * or ipv6 address. ipv6 address needs to be within square brackets [ ].
+ * If transport type is 'unix', then 'server' field should not be specified.
+ * The 'socket' field needs to be populated with the path to unix domain
+ * socket.
+ *
+ * 'port' is the port number on which glusterd is listening. This is optional
+ * and if not specified, QEMU will send 0, which makes gluster use the
+ * default port. If the transport type is unix, then 'port' should not be
+ * specified.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image that resides on gluster volume.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
+ * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
+ * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
+ *
+ * Fills gconf from filename.  Returns 0 on success or -EINVAL when the URI
+ * does not match the format above; gconf may be partially filled then.
+ */
+static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
+{
+    /* Accepted URI schemes and the transport each one selects */
+    static const struct {
+        const char *scheme;
+        const char *transport;
+        bool is_unix;
+    } schemes[] = {
+        { "gluster",      "tcp",  false },
+        { "gluster+tcp",  "tcp",  false },
+        { "gluster+unix", "unix", true  },
+        { "gluster+rdma", "rdma", false },
+    };
+    const size_t nschemes = sizeof(schemes) / sizeof(schemes[0]);
+    URI *uri;
+    QueryParams *qp = NULL;
+    bool is_unix;
+    int ret = -EINVAL;
+    size_t i;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    /* transport */
+    for (i = 0; i < nschemes; i++) {
+        if (strcmp(uri->scheme, schemes[i].scheme) == 0) {
+            break;
+        }
+    }
+    if (i == nschemes) {
+        goto out;
+    }
+    gconf->transport = g_strdup(schemes[i].transport);
+    is_unix = schemes[i].is_unix;
+
+    ret = parse_volume_options(gconf, uri->path);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* unix needs exactly one query parameter, everything else none */
+    qp = query_params_parse(uri->query);
+    if (qp->n != (is_unix ? 1 : 0)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (!is_unix) {
+        gconf->server = g_strdup(uri->server);
+        gconf->port = uri->port;
+    } else if (uri->server || uri->port ||
+               strcmp(qp->p[0].name, "socket") != 0) {
+        ret = -EINVAL;
+    } else {
+        /* the socket path takes the place of the server name */
+        gconf->server = g_strdup(qp->p[0].value);
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
+}
+
+/*
+ * Parse filename into gconf and establish a connection to the gluster
+ * volume it names.
+ *
+ * Returns the initialised glfs handle, or NULL on failure.  On failure
+ * errno is guaranteed to be non-zero: callers convert it with
+ * "ret = -errno", so an errno of 0 would be mistaken for success.
+ * gconf is filled in (possibly partially) in either case and remains owned
+ * by the caller.
+ */
+static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
+{
+    struct glfs *glfs = NULL;
+    int ret;
+    int old_errno;
+
+    ret = qemu_gluster_parseuri(gconf, filename);
+    if (ret < 0) {
+        error_report("Usage: file=gluster[+transport]://[server[:port]]/"
+            "volname/image[?socket=...]");
+        errno = -ret;
+        goto out;
+    }
+
+    glfs = glfs_new(gconf->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
+            gconf->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+     * GlusterFS makes GF_LOG_* macros available to libgfapi users.
+     */
+    ret = glfs_set_logging(glfs, "-", 4);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret) {
+        /* Keep the errno of glfs_init() across the reporting call */
+        old_errno = errno;
+        /* error_report() ends the line itself, and the server may be
+         * absent (NULL), which must not be passed to %s */
+        error_report("Gluster connection failed for server=%s port=%d "
+             "volume=%s image=%s transport=%s",
+             gconf->server ? gconf->server : "NULL", gconf->port,
+             gconf->volname, gconf->image, gconf->transport);
+        errno = old_errno;
+        goto out;
+    }
+    return glfs;
+
+out:
+    /* Never report failure with errno == 0, see the function comment */
+    old_errno = errno ? errno : EINVAL;
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    errno = old_errno;
+    return NULL;
+}
+
+/*
+ * Complete a request: map the stored gluster result to a block-layer
+ * return code, release the ACB and invoke the completion callback.
+ *
+ * A result of 0 or exactly acb->size counts as success, a negative result
+ * is passed through, and any other (short) result becomes -EIO.
+ */
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+{
+    int ret;
+    /* Copied up front: acb is not touched after qemu_aio_release() below */
+    bool *finished = acb->finished;
+    BlockDriverCompletionFunc *cb = acb->common.cb;
+    void *opaque = acb->common.opaque;
+
+    if (!acb->ret || acb->ret == acb->size) {
+        ret = 0; /* Success */
+    } else if (acb->ret < 0) {
+        ret = acb->ret; /* Read/Write failed */
+    } else {
+        ret = -EIO; /* Partial read/write - fail it */
+    }
+
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    cb(opaque, ret);
+    /* Lets a waiter in qemu_gluster_aio_cancel() leave its loop */
+    if (finished) {
+        *finished = true;
+    }
+}
+
+/*
+ * Read handler for the completion pipe.
+ *
+ * gluster_finish_aiocb() writes a GlusterAIOCB pointer to the pipe; this
+ * handler reads it into s->event_acb, possibly across several partial
+ * reads tracked by s->event_reader_pos, and completes the request once
+ * the whole pointer has arrived.  opaque is the BDRVGlusterState.
+ */
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    ssize_t ret;
+
+    do {
+        /* Byte view of the pointer so it can be filled in piecewise */
+        char *p = (char *)&s->event_acb;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->event_acb) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->event_acb)) {
+                /* Full pointer received: reset for the next one */
+                s->event_reader_pos = 0;
+                qemu_gluster_complete_aio(s->event_acb, s);
+            }
+        }
+    } while (ret < 0 && errno == EINTR); /* retry only when interrupted */
+}
+
+/*
+ * Flush callback registered with the completion pipe handler: returns 1
+ * while requests are still counted in qemu_aio_count, 0 otherwise.
+ */
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    const BDRVGlusterState *state = opaque;
+
+    return state->qemu_aio_count > 0 ? 1 : 0;
+}
+
+/*
+ * Open the image named by a gluster URI.
+ *
+ * Connects to the volume, opens the image with flags derived from
+ * bdrv_flags, and sets up the non-blocking pipe through which completions
+ * are delivered to qemu_gluster_aio_event_reader().
+ *
+ * Returns 0 on success or a negative errno; on failure the image handle
+ * and the connection are closed again.
+ */
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int open_flags = O_BINARY;
+    int ret = 0;
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+    s->glfs = qemu_gluster_init(gconf, filename);
+    if (!s->glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+    /* Only the read end is made non-blocking; fcntl's result is unchecked */
+    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:
+    /* gconf is only needed while opening; free it on every path */
+    qemu_gluster_gconf_free(gconf);
+    if (!ret) {
+        return ret;
+    }
+    /* Failure: undo whatever was acquired above */
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+/*
+ * Create (or truncate) the image named by a gluster URI and size it
+ * according to the BLOCK_OPT_SIZE option, rounded down to whole sectors.
+ *
+ * Returns 0 on success or a negative errno.  The connection is torn down
+ * again before returning.
+ */
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    int64_t total_size = 0; /* in sectors */
+    int ret = 0;
+
+    glfs = qemu_gluster_init(gconf, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    for (; options && options->name; options++) {
+        if (strcmp(options->name, BLOCK_OPT_SIZE) == 0) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+    }
+
+    fd = glfs_creat(glfs, gconf->image,
+        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (fd) {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        /* Always close; a close failure replaces an earlier error */
+        if (glfs_close(fd) != 0) {
+            ret = -errno;
+        }
+    } else {
+        ret = -errno;
+    }
+out:
+    qemu_gluster_gconf_free(gconf);
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return ret;
+}
+
+/*
+ * Cancel hook for Gluster AIOCBs.
+ *
+ * The request is not actually aborted: a stack flag is published through
+ * acb->finished and the AIO loop is run until that flag becomes true.
+ * NOTE(review): the flag is presumably set by the completion path
+ * (qemu_gluster_complete_aio) - confirm against that function.
+ */
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    GlusterAIOCB *gacb = (GlusterAIOCB *)blockacb;
+    bool done = false;
+
+    gacb->finished = &done;
+    for (;;) {
+        if (done) {
+            break;
+        }
+        qemu_aio_wait();
+    }
+}
+
+/*
+ * AIOCB descriptor passed to qemu_aio_get() by the Gluster read/write and
+ * flush paths: requests are sized as a GlusterAIOCB and cancelled through
+ * qemu_gluster_aio_cancel().
+ */
+static const AIOCBInfo gluster_aiocb_info = {
+    .aiocb_size = sizeof(GlusterAIOCB),
+    .cancel = qemu_gluster_aio_cancel,
+};
+
+/*
+ * Completion callback handed to the glfs_*_async() calls.
+ *
+ * @fd:  gluster file descriptor the request was issued on (unused here)
+ * @ret: result of the request, stored in acb->ret
+ * @arg: the GlusterAIOCB that was passed at submission time
+ *
+ * Normally the acb pointer is written to the notification pipe so that the
+ * read handler can complete the request.  If that write fails, the request
+ * is completed here with -EIO and the disk is made inaccessible.
+ */
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVGlusterState *s = bs->opaque;
+    int retval;
+
+    acb->ret = ret;
+    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
+    if (retval != sizeof(acb)) {
+        /*
+         * Gluster AIO callback thread failed to notify the waiting
+         * QEMU thread about IO completion.
+         *
+         * Complete this IO request and make the disk inaccessible for
+         * subsequent reads and writes.
+         */
+        error_report("Gluster failed to notify QEMU about IO completion");
+
+        qemu_mutex_lock_iothread(); /* We are in gluster thread context */
+        acb->common.cb(acb->common.opaque, -EIO);
+        /*
+         * A canceller may be spinning on this flag; without setting it the
+         * cancel loop would never terminate once the acb is released.
+         */
+        if (acb->finished) {
+            *acb->finished = true;
+        }
+        qemu_aio_release(acb);
+        s->qemu_aio_count--;
+        /*
+         * Remove the handler while the descriptor is still open: after
+         * close() the number may be reused for an unrelated file.
+         */
+        qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL,
+            NULL);
+        close(s->fds[GLUSTER_FD_READ]);
+        close(s->fds[GLUSTER_FD_WRITE]);
+        bs->drv = NULL; /* Make the disk inaccessible */
+        qemu_mutex_unlock_iothread();
+    }
+}
+
+/*
+ * Submit an asynchronous vectored read or write to Gluster.
+ *
+ * @bs:         block device backed by a BDRVGlusterState
+ * @sector_num: first sector of the request
+ * @qiov:       I/O vector describing the guest buffers
+ * @nb_sectors: number of sectors to transfer
+ * @cb/@opaque: completion callback and its argument
+ * @write:      non-zero for a write, zero for a read
+ *
+ * Returns the new AIOCB, or NULL if the request could not be submitted (in
+ * which case the in-flight counter is restored and the AIOCB released).
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    BDRVGlusterState *s = bs->opaque;
+    GlusterAIOCB *acb;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    int ret;
+
+    /* Account for the request before it is handed to gluster */
+    s->qemu_aio_count++;
+
+    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+    acb->size = size;
+    acb->ret = 0;
+    acb->finished = NULL;
+
+    ret = write
+        ? glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb)
+        : glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    if (ret >= 0) {
+        return &acb->common;
+    }
+
+    /* Submission failed: undo the accounting and drop the AIOCB */
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/*
+ * .bdrv_aio_readv entry point: forwards to qemu_gluster_aio_rw() with the
+ * write flag cleared.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    const int is_write = 0;
+
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                               is_write);
+}
+
+/*
+ * .bdrv_aio_writev entry point: forwards to qemu_gluster_aio_rw() with the
+ * write flag set.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    const int is_write = 1;
+
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                               is_write);
+}
+
+/*
+ * Submit an asynchronous fsync on the Gluster file.
+ *
+ * @bs:         block device backed by a BDRVGlusterState
+ * @cb/@opaque: completion callback and its argument
+ *
+ * Returns the new AIOCB, or NULL if glfs_fsync_async() refused the request
+ * (the in-flight counter is restored and the AIOCB released).
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVGlusterState *s = bs->opaque;
+    GlusterAIOCB *acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
+
+    acb->size = 0;
+    acb->ret = 0;
+    acb->finished = NULL;
+    s->qemu_aio_count++;
+
+    if (glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb) >= 0) {
+        return &acb->common;
+    }
+
+    /* Submission failed: undo the accounting and drop the AIOCB */
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/*
+ * Return the image length in bytes, found by seeking to the end of the
+ * Gluster file, or -errno if the seek fails.
+ */
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int64_t end = glfs_lseek(s->fd, 0, SEEK_END);
+
+    if (end >= 0) {
+        return end;
+    }
+    return -errno;
+}
+
+/*
+ * Return the space allocated to the image, computed as st_blocks * 512 from
+ * glfs_fstat(), or -errno if the stat call fails.
+ */
+static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+
+    if (glfs_fstat(s->fd, &st) < 0) {
+        return -errno;
+    }
+    return st.st_blocks * 512;
+}
+
+/*
+ * Tear down a Gluster block device: unregister and close the notification
+ * pipe, close the Gluster file if one is open, and release the glfs handle.
+ */
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    /*
+     * Remove the handler while the descriptor is still open: after close()
+     * the number may be reused for an unrelated file, and the handler would
+     * then be removed (or left registered) for the wrong descriptor.
+     */
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);
+    close(s->fds[GLUSTER_FD_READ]);
+    close(s->fds[GLUSTER_FD_WRITE]);
+
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;
+    }
+    glfs_fini(s->glfs);
+}
+
+/*
+ * Creation options advertised through .create_options of the gluster
+ * drivers.  Only the image size is offered; the { NULL } entry terminates
+ * the list (qemu_gluster_create() stops at the first NULL name).
+ */
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+/*
+ * Driver for the plain "gluster" protocol name.  The four gluster
+ * BlockDriver tables in this file use the same callbacks and differ only in
+ * .protocol_name.
+ */
+static BlockDriver bdrv_gluster = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+/* Same callbacks as bdrv_gluster, registered for "gluster+tcp". */
+static BlockDriver bdrv_gluster_tcp = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+tcp",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+/* Same callbacks as bdrv_gluster, registered for "gluster+unix". */
+static BlockDriver bdrv_gluster_unix = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+unix",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+/* Same callbacks as bdrv_gluster, registered for "gluster+rdma". */
+static BlockDriver bdrv_gluster_rdma = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+rdma",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+/*
+ * Register every gluster protocol variant with the block layer.  The
+ * registration order (rdma, unix, tcp, plain) is kept as a table so that it
+ * is explicit.
+ */
+static void bdrv_gluster_init(void)
+{
+    BlockDriver *const drivers[] = {
+        &bdrv_gluster_rdma,
+        &bdrv_gluster_unix,
+        &bdrv_gluster_tcp,
+        &bdrv_gluster,
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++) {
+        bdrv_register(drivers[i]);
+    }
+}
+
+block_init(bdrv_gluster_init);
--- a/block/iscsi.c
+++ b/block/iscsi.c
@ -27,8 +27,9 @@
 #include <poll.h>
 #include <arpa/inet.h>
 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"
 #include "trace.h"
 #include "hw/scsi-defs.h"

@ -65,21 +66,44 @@ typedef struct IscsiAIOCB {
 #endif
 } IscsiAIOCB;

-struct IscsiTask {
-    IscsiLun *iscsilun;
-    BlockDriverState *bs;
-    int status;
-    int complete;
-};
+/*
+ * Bottom half created by iscsi_schedule_bh().
+ *
+ * Deletes the bottom half, reports acb->status to the completion callback
+ * unless the request was cancelled, frees the SCSI task if one is still
+ * attached, and finally releases the AIOCB.
+ */
+static void
+iscsi_bh_cb(void *p)
+{
+    IscsiAIOCB *iacb = p;
+
+    qemu_bh_delete(iacb->bh);
+
+    if (!iacb->canceled) {
+        iacb->common.cb(iacb->common.opaque, iacb->status);
+    }
+
+    if (iacb->task) {
+        scsi_free_scsi_task(iacb->task);
+        iacb->task = NULL;
+    }
+
+    qemu_aio_release(iacb);
+}
+
+/*
+ * Schedule iscsi_bh_cb() for @acb.  Does nothing if the AIOCB already has a
+ * bottom half, so each request gets at most one.
+ */
+static void
+iscsi_schedule_bh(IscsiAIOCB *acb)
+{
+    if (!acb->bh) {
+        acb->bh = qemu_bh_new(iscsi_bh_cb, acb);
+        qemu_bh_schedule(acb->bh);
+    }
+}
+

 static void
 iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
                    void *private_data)
 {
-    IscsiAIOCB *acb = (IscsiAIOCB *)private_data;
+    IscsiAIOCB *acb = private_data;

-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
+    acb->status = -ECANCELED;
+    iscsi_schedule_bh(acb);
 }

 static void
@ -88,18 +112,22 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
    IscsiLun *iscsilun = acb->iscsilun;

+    if (acb->status != -EINPROGRESS) {
+        return;
+    }
+
    acb->canceled = 1;

-    acb->common.cb(acb->common.opaque, -ECANCELED);
-
-    /* send a task mgmt call to the target to cancel the task on the target
-     * this also cancels the task in libiscsi
-     */
+    /* send a task mgmt call to the target to cancel the task on the target */
    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
-                                     iscsi_abort_task_cb, &acb);
+                                     iscsi_abort_task_cb, acb);
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
 }

-static AIOPool iscsi_aio_pool = {
+static const AIOCBInfo iscsi_aiocb_info = {
    .aiocb_size         = sizeof(IscsiAIOCB),
    .cancel             = iscsi_aio_cancel,
 };
@ -133,12 +161,6 @@ iscsi_set_events(IscsiLun *iscsilun)

    }

-    /* If we just added an event, the callback might be delayed
-     * unless we call qemu_notify_event().
-     */
-    if (ev & ~iscsilun->events) {
-        qemu_notify_event();
-    }
    iscsilun->events = ev;
 }

@ -163,41 +185,6 @@ iscsi_process_write(void *arg)
 }


-static int
-iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
-{
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        error_report("oom: could not create iscsi bh");
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-    return 0;
-}
-
-static void
-iscsi_readv_writev_bh_cb(void *p)
-{
-    IscsiAIOCB *acb = p;
-
-    qemu_bh_delete(acb->bh);
-
-    if (!acb->canceled) {
-        acb->common.cb(acb->common.opaque, acb->status);
-    }
-
-    qemu_aio_release(acb);
-
-    if (acb->canceled) {
-        return;
-    }
-
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
-}
-
-
 static void
 iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
                     void *command_data, void *opaque)
@ -208,8 +195,7 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,

    g_free(acb->buf);

-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
        return;
    }

@ -220,7 +206,7 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
        acb->status = -EIO;
    }

-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }

 static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
@ -242,13 +228,15 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
    uint64_t lba;
    struct iscsi_data data;

-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
    trace_iscsi_aio_writev(iscsi, sector_num, nb_sectors, opaque, acb);

    acb->iscsilun = iscsilun;
    acb->qiov     = qiov;

    acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;

    /* XXX we should pass the iovec to write16 to avoid the extra copy */
    /* this will allow us to get rid of 'buf' completely */
@ -268,10 +256,6 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
    acb->task->xfer_dir = SCSI_XFER_WRITE;
    acb->task->cdb_size = 16;
    acb->task->cdb[0] = 0x8a;
-    if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
-        /* set FUA on writes when cache mode is write through */
-        acb->task->cdb[1] |= 0x04;
-    }
    lba = sector_qemu2lun(sector_num, iscsilun);
    *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
    *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
@ -305,8 +289,7 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,

    trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled);

-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
        return;
    }

@ -317,7 +300,7 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
        acb->status = -EIO;
    }

-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }

 static BlockDriverAIOCB *
@ -336,13 +319,15 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,

    qemu_read_size = BDRV_SECTOR_SIZE * (size_t)nb_sectors;

-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
    trace_iscsi_aio_readv(iscsi, sector_num, nb_sectors, opaque, acb);

    acb->iscsilun = iscsilun;
    acb->qiov     = qiov;

    acb->canceled    = 0;
+    acb->bh          = NULL;
+    acb->status      = -EINPROGRESS;
    acb->read_size   = qemu_read_size;
    acb->buf         = NULL;

@ -389,7 +374,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
        *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors);
        break;
    }
-    
+
    if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
                                 iscsi_aio_read16_cb,
                                 NULL,
@ -417,8 +402,7 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
 {
    IscsiAIOCB *acb = opaque;

-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
        return;
    }

@ -429,7 +413,7 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
        acb->status = -EIO;
    }

-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }

 static BlockDriverAIOCB *
@ -440,10 +424,12 @@ iscsi_aio_flush(BlockDriverState *bs,
    struct iscsi_context *iscsi = iscsilun->iscsi;
    IscsiAIOCB *acb;

-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
    acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;

    acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
                                         0, 0, 0, 0,
@ -467,8 +453,7 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
 {
    IscsiAIOCB *acb = opaque;

-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
        return;
    }

@ -479,7 +464,7 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
        acb->status = -EIO;
    }

-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }

 static BlockDriverAIOCB *
@ -492,10 +477,12 @@ iscsi_aio_discard(BlockDriverState *bs,
    IscsiAIOCB *acb;
    struct unmap_list list[1];

-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
    acb->canceled   = 0;
+    acb->bh         = NULL;
+    acb->status     = -EINPROGRESS;

    list[0].lba = sector_qemu2lun(sector_num, iscsilun);
    list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
@ -523,8 +510,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
 {
    IscsiAIOCB *acb = opaque;

-    if (acb->canceled) {
-        qemu_aio_release(acb);
+    if (acb->canceled != 0) {
        return;
    }

@ -552,7 +538,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
        memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
    }

-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
 }

 static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
@ -566,10 +552,12 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,

    assert(req == SG_IO);

-    acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);

    acb->iscsilun = iscsilun;
    acb->canceled    = 0;
+    acb->bh          = NULL;
+    acb->status      = -EINPROGRESS;
    acb->buf         = NULL;
    acb->ioh         = buf;

@ -624,9 +612,17 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
    return &acb->common;
 }

+
+/*
+ * Completion callback used by the synchronous SG_IO path: stores @status
+ * into the int that @opaque points to.
+ */
+static void ioctl_cb(void *opaque, int status)
+{
+    *(int *)opaque = status;
+}
+
 static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
 {
    IscsiLun *iscsilun = bs->opaque;
+    int status;

    switch (req) {
    case SG_GET_VERSION_NUM:
@ -635,6 +631,15 @@ static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
    case SG_GET_SCSI_ID:
        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
        break;
+    case SG_IO:
+        status = -EINPROGRESS;
+        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
+
+        while (status == -EINPROGRESS) {
+            qemu_aio_wait();
+        }
+
+        return 0;
    default:
        return -1;
    }
@ -654,158 +659,6 @@ iscsi_getlength(BlockDriverState *bs)
    return len;
 }

-static void
-iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status,
-                        void *command_data, void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_readcapacity16 *rc16;
-    struct scsi_task *task = command_data;
-
-    if (status != 0) {
-        error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    rc16 = scsi_datain_unmarshall(task);
-    if (rc16 == NULL) {
-        error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->block_size = rc16->block_length;
-    itask->iscsilun->num_blocks = rc16->returned_lba + 1;
-    itask->bs->total_sectors    = itask->iscsilun->num_blocks *
-                               itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
-    itask->status   = 0;
-    itask->complete = 1;
-    scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_readcapacity10_cb(struct iscsi_context *iscsi, int status,
-                        void *command_data, void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_readcapacity10 *rc10;
-    struct scsi_task *task = command_data;
-
-    if (status != 0) {
-        error_report("iSCSI: Failed to read capacity of iSCSI lun. %s",
-                     iscsi_get_error(iscsi));
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    rc10 = scsi_datain_unmarshall(task);
-    if (rc10 == NULL) {
-        error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->block_size = rc10->block_size;
-    itask->iscsilun->num_blocks = rc10->lba + 1;
-    itask->bs->total_sectors    = itask->iscsilun->num_blocks *
-                               itask->iscsilun->block_size / BDRV_SECTOR_SIZE ;
-
-    itask->status   = 0;
-    itask->complete = 1;
-    scsi_free_scsi_task(task);
-}
-
-static void
-iscsi_inquiry_cb(struct iscsi_context *iscsi, int status, void *command_data,
-                 void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_task *task = command_data;
-    struct scsi_inquiry_standard *inq;
-
-    if (status != 0) {
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    inq = scsi_datain_unmarshall(task);
-    if (inq == NULL) {
-        error_report("iSCSI: Failed to unmarshall inquiry data.");
-        itask->status   = 1;
-        itask->complete = 1;
-        scsi_free_scsi_task(task);
-        return;
-    }
-
-    itask->iscsilun->type = inq->periperal_device_type;
-
-    scsi_free_scsi_task(task);
-
-    switch (itask->iscsilun->type) {
-    case TYPE_DISK:
-        task = iscsi_readcapacity16_task(iscsi, itask->iscsilun->lun,
-                                   iscsi_readcapacity16_cb, opaque);
-        if (task == NULL) {
-            error_report("iSCSI: failed to send readcapacity16 command.");
-            itask->status   = 1;
-            itask->complete = 1;
-            return;
-        }
-        break;
-    case TYPE_ROM:
-        task = iscsi_readcapacity10_task(iscsi, itask->iscsilun->lun,
-                                   0, 0,
-                                   iscsi_readcapacity10_cb, opaque);
-        if (task == NULL) {
-            error_report("iSCSI: failed to send readcapacity16 command.");
-            itask->status   = 1;
-            itask->complete = 1;
-            return;
-        }
-        break;
-    default:
-        itask->status   = 0;
-        itask->complete = 1;
-    }
-}
-
-static void
-iscsi_connect_cb(struct iscsi_context *iscsi, int status, void *command_data,
-                 void *opaque)
-{
-    struct IscsiTask *itask = opaque;
-    struct scsi_task *task;
-
-    if (status != 0) {
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-
-    task = iscsi_inquiry_task(iscsi, itask->iscsilun->lun,
-                              0, 0, 36,
-                              iscsi_inquiry_cb, opaque);
-    if (task == NULL) {
-        error_report("iSCSI: failed to send inquiry command.");
-        itask->status   = 1;
-        itask->complete = 1;
-        return;
-    }
-}
-
 static int parse_chap(struct iscsi_context *iscsi, const char *target)
 {
    QemuOptsList *list;
@ -918,7 +771,10 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
    IscsiLun *iscsilun = bs->opaque;
    struct iscsi_context *iscsi = NULL;
    struct iscsi_url *iscsi_url = NULL;
-    struct IscsiTask task;
+    struct scsi_task *task = NULL;
+    struct scsi_inquiry_standard *inq = NULL;
+    struct scsi_readcapacity10 *rc10 = NULL;
+    struct scsi_readcapacity16 *rc16 = NULL;
    char *initiator_name = NULL;
    int ret;

@ -931,8 +787,7 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)

    iscsi_url = iscsi_parse_full_url(iscsi, filename);
    if (iscsi_url == NULL) {
-        error_report("Failed to parse URL : %s %s", filename,
-                     iscsi_get_error(iscsi));
+        error_report("Failed to parse URL : %s", filename);
        ret = -EINVAL;
        goto out;
    }
@ -982,33 +837,80 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
    /* check if we got HEADER_DIGEST via the options */
    parse_header_digest(iscsi, iscsi_url->target);

-    task.iscsilun = iscsilun;
-    task.status = 0;
-    task.complete = 0;
-    task.bs = bs;
+    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) {
+        error_report("iSCSI: Failed to connect to LUN : %s",
+            iscsi_get_error(iscsi));
+        ret = -EINVAL;
+        goto out;
+    }

    iscsilun->iscsi = iscsi;
    iscsilun->lun   = iscsi_url->lun;

-    if (iscsi_full_connect_async(iscsi, iscsi_url->portal, iscsi_url->lun,
-                                 iscsi_connect_cb, &task)
-        != 0) {
-        error_report("iSCSI: Failed to start async connect.");
+    task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36);
+
+    if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+        error_report("iSCSI: failed to send inquiry command.");
        ret = -EINVAL;
        goto out;
    }

-    while (!task.complete) {
-        iscsi_set_events(iscsilun);
-        qemu_aio_wait();
-    }
-    if (task.status != 0) {
-        error_report("iSCSI: Failed to connect to LUN : %s",
-                     iscsi_get_error(iscsi));
+    inq = scsi_datain_unmarshall(task);
+    if (inq == NULL) {
+        error_report("iSCSI: Failed to unmarshall inquiry data.");
        ret = -EINVAL;
        goto out;
    }

+    iscsilun->type = inq->periperal_device_type;
+
+    scsi_free_scsi_task(task);
+
+    switch (iscsilun->type) {
+    case TYPE_DISK:
+        task = iscsi_readcapacity16_sync(iscsi, iscsilun->lun);
+        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+            error_report("iSCSI: failed to send readcapacity16 command.");
+            ret = -EINVAL;
+            goto out;
+        }
+        rc16 = scsi_datain_unmarshall(task);
+        if (rc16 == NULL) {
+            error_report("iSCSI: Failed to unmarshall readcapacity16 data.");
+            ret = -EINVAL;
+            goto out;
+        }
+        iscsilun->block_size = rc16->block_length;
+        iscsilun->num_blocks = rc16->returned_lba + 1;
+        break;
+    case TYPE_ROM:
+        task = iscsi_readcapacity10_sync(iscsi, iscsilun->lun, 0, 0);
+        if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+            error_report("iSCSI: failed to send readcapacity10 command.");
+            ret = -EINVAL;
+            goto out;
+        }
+        rc10 = scsi_datain_unmarshall(task);
+        if (rc10 == NULL) {
+            error_report("iSCSI: Failed to unmarshall readcapacity10 data.");
+            ret = -EINVAL;
+            goto out;
+        }
+        iscsilun->block_size = rc10->block_size;
+        if (rc10->lba == 0) {
+            /* blank disk loaded */
+            iscsilun->num_blocks = 0;
+        } else {
+            iscsilun->num_blocks = rc10->lba + 1;
+        }
+        break;
+    default:
+        break;
+    }
+
+    bs->total_sectors    = iscsilun->num_blocks *
+                           iscsilun->block_size / BDRV_SECTOR_SIZE ;
+
    /* Medium changer or tape. We dont have any emulation for this so this must
     * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
     * to read from the device to guess the image format.
@ -1027,6 +929,9 @@ out:
    if (iscsi_url != NULL) {
        iscsi_destroy_url(iscsi_url);
    }
+    if (task != NULL) {
+        scsi_free_scsi_task(task);
+    }

    if (ret) {
        if (iscsi != NULL) {
@ -1047,6 +952,11 @@ static void iscsi_close(BlockDriverState *bs)
    memset(iscsilun, 0, sizeof(IscsiLun));
 }

+/*
+ * .bdrv_has_zero_init hook for iSCSI: always returns 0, whatever @bs is.
+ * NOTE(review): presumably this tells the block layer that a LUN cannot be
+ * assumed to read back as zeroes - confirm against the hook's contract.
+ */
+static int iscsi_has_zero_init(BlockDriverState *bs)
+{
+    return 0;
+}
+
 static BlockDriver bdrv_iscsi = {
    .format_name     = "iscsi",
    .protocol_name   = "iscsi",
@ -1062,6 +972,7 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_aio_flush  = iscsi_aio_flush,

    .bdrv_aio_discard = iscsi_aio_discard,
+    .bdrv_has_zero_init = iscsi_has_zero_init,

 #ifdef __linux__
    .bdrv_ioctl       = iscsi_ioctl,
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@ -8,10 +8,11 @@
 * See the COPYING file in the top-level directory.
 */
 #include "qemu-common.h"
-#include "qemu-aio.h"
-#include "block/raw-posix-aio.h"
+#include "block/aio.h"
+#include "qemu/queue.h"
+#include "block/raw-aio.h"
+#include "qemu/event_notifier.h"

-#include <sys/eventfd.h>
 #include <libaio.h>

 /*
@ -37,7 +38,7 @@ struct qemu_laiocb {

 struct qemu_laio_state {
    io_context_t ctx;
-    int efd;
+    EventNotifier e;
    int count;
 };

@ -76,29 +77,17 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
    qemu_aio_release(laiocb);
 }

-static void qemu_laio_completion_cb(void *opaque)
+static void qemu_laio_completion_cb(EventNotifier *e)
 {
-    struct qemu_laio_state *s = opaque;
+    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

-    while (1) {
+    while (event_notifier_test_and_clear(&s->e)) {
        struct io_event events[MAX_EVENTS];
-        uint64_t val;
-        ssize_t ret;
        struct timespec ts = { 0 };
        int nevents, i;

        do {
-            ret = read(s->efd, &val, sizeof(val));
-        } while (ret == -1 && errno == EINTR);
-
-        if (ret == -1 && errno == EAGAIN)
-            break;
-
-        if (ret != 8)
-            break;
-
-        do {
-            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
+            nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
        } while (nevents == -EINTR);

        for (i = 0; i < nevents; i++) {
@ -112,9 +101,9 @@ static void qemu_laio_completion_cb(void *opaque)
    }
 }

-static int qemu_laio_flush_cb(void *opaque)
+static int qemu_laio_flush_cb(EventNotifier *e)
 {
-    struct qemu_laio_state *s = opaque;
+    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

    return (s->count > 0) ? 1 : 0;
 }
@ -146,11 +135,12 @@ static void laio_cancel(BlockDriverAIOCB *blockacb)
     * We might be able to do this slightly more optimal by removing the
     * O_NONBLOCK flag.
     */
-    while (laiocb->ret == -EINPROGRESS)
-        qemu_laio_completion_cb(laiocb->ctx);
+    while (laiocb->ret == -EINPROGRESS) {
+        qemu_laio_completion_cb(&laiocb->ctx->e);
+    }
 }

-static AIOPool laio_pool = {
+static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel             = laio_cancel,
 };
@ -164,7 +154,7 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

-    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
+    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
@ -186,7 +176,7 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
                        __func__, type);
        goto out_free_aiocb;
    }
-    io_set_eventfd(&laiocb->iocb, s->efd);
+    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
    s->count++;

    if (io_submit(s->ctx, 1, &iocbs) < 0)
@ -205,21 +195,21 @@ void *laio_init(void)
    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
-    s->efd = eventfd(0, 0);
-    if (s->efd == -1)
+    if (event_notifier_init(&s->e, false) < 0) {
        goto out_free_state;
-    fcntl(s->efd, F_SETFL, O_NONBLOCK);
+    }

-    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
+    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        goto out_close_efd;
+    }

-    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb, NULL,
-        qemu_laio_flush_cb, s);
+    qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb,
+                                qemu_laio_flush_cb);

    return s;

 out_close_efd:
-    close(s->efd);
+    event_notifier_cleanup(&s->e);
 out_free_state:
    g_free(s);
    return NULL;
--- a/block/mirror.c
+++ b/block/mirror.c
@ -0,0 +1,322 @@
+/*
+ * Image mirroring
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Paolo Bonzini  <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     *
+     * The value is exactly one dirty-bitmap chunk
+     * (BDRV_SECTORS_PER_DIRTY_CHUNK sectors of 512 bytes each).
+     */
+    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
+};
+
+/* Slice length handed to the rate limiter; also used as the sleep interval
+ * once the job is synced and there is nothing left to copy. */
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct MirrorBlockJob {
+    BlockJob common;            /* generic job; recovered via container_of() */
+    RateLimit limit;            /* throttle set up by mirror_set_speed() */
+    BlockDriverState *target;   /* destination of the copied chunks */
+    MirrorSyncMode mode;        /* selects how the dirty bitmap is seeded */
+    BlockdevOnError on_source_error, on_target_error; /* error policies */
+    bool synced;                /* set after a clean flush with no dirty data,
+                                 * cleared again on any I/O error */
+    bool should_complete;       /* set by mirror_complete() */
+    int64_t sector_num;         /* cursor passed to bdrv_get_next_dirty() */
+    uint8_t *buf;               /* BLOCK_SIZE-byte bounce buffer */
+} MirrorBlockJob;
+
+/*
+ * Decide what to do about an I/O error hit by the mirror job.
+ *
+ * @read selects the failing side: true consults the source device with
+ * on_source_error, false consults the target with on_target_error.  @error
+ * is forwarded unchanged to block_job_error_action().  The job is marked as
+ * no longer synced in either case.
+ */
+static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
+                                            int error)
+{
+    s->synced = false;
+
+    if (!read) {
+        return block_job_error_action(&s->common, s->target,
+                                      s->on_target_error, false, error);
+    }
+
+    return block_job_error_action(&s->common, s->common.bs,
+                                  s->on_source_error, true, error);
+}
+
+/*
+ * Copy one dirty chunk from the source to the target.
+ *
+ * Advances s->sector_num to the next dirty chunk, clears its dirty bits and
+ * moves the data through s->buf.  Returns 0 on success.  On a read or write
+ * failure the chunk is marked dirty again, *p_action receives the verdict of
+ * mirror_error_action() and the negative error code is returned.
+ */
+static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
+                                         BlockErrorAction *p_action)
+{
+    BlockDriverState *src = s->common.bs;
+    BlockDriverState *dst = s->target;
+    int64_t total_sectors = s->common.len >> BDRV_SECTOR_BITS;
+    struct iovec vec;
+    QEMUIOVector qiov;
+    int chunk, ret;
+
+    /* Pick the next dirty chunk, clipped at the end of the device. */
+    s->sector_num = bdrv_get_next_dirty(src, s->sector_num);
+    chunk = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, total_sectors - s->sector_num);
+    bdrv_reset_dirty(src, s->sector_num, chunk);
+
+    /* Single-element I/O vector on top of the bounce buffer.  */
+    vec.iov_base = s->buf;
+    vec.iov_len  = chunk * 512;
+    qemu_iovec_init_external(&qiov, &vec, 1);
+
+    trace_mirror_one_iteration(s, s->sector_num, chunk);
+    ret = bdrv_co_readv(src, s->sector_num, chunk, &qiov);
+    if (ret >= 0) {
+        ret = bdrv_co_writev(dst, s->sector_num, chunk, &qiov);
+        if (ret >= 0) {
+            return 0;
+        }
+        *p_action = mirror_error_action(s, false, -ret);
+        s->synced = false;
+    } else {
+        *p_action = mirror_error_action(s, true, -ret);
+    }
+
+    /* The copy failed: put the chunk back so that it is retried later.  */
+    bdrv_set_dirty(src, s->sector_num, chunk);
+    return ret;
+}
+
+/*
+ * Coroutine entry point of the mirror job; @opaque is the MirrorBlockJob.
+ *
+ * Seeds the dirty bitmap according to s->mode, then loops copying dirty
+ * chunks until the job is cancelled or, once synced, asked to complete.
+ *
+ * Every exit goes through immediate_exit, so dirty tracking is switched off
+ * and the target is closed and deleted before block_job_completed() reports
+ * the result (0 or a negative error code).
+ */
+static void coroutine_fn mirror_run(void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+    BlockDriverState *bs = s->common.bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n;
+
+    if (block_job_is_cancelled(&s->common)) {
+        goto immediate_exit;
+    }
+
+    s->common.len = bdrv_getlength(bs);
+    if (s->common.len < 0) {
+        /* Use the common cleanup path: returning from here would leave
+         * dirty tracking enabled on bs and the target neither closed nor
+         * deleted.
+         */
+        ret = s->common.len;
+        goto immediate_exit;
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+
+    if (s->mode != MIRROR_SYNC_MODE_NONE) {
+        /* First part, loop on the sectors and initialize the dirty bitmap.  */
+        BlockDriverState *base;
+        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+        for (sector_num = 0; sector_num < end; ) {
+            /* First sector of the dirty chunk following sector_num's chunk. */
+            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+            ret = bdrv_co_is_allocated_above(bs, base,
+                                             sector_num, next - sector_num, &n);
+
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+
+            assert(n > 0);
+            if (ret == 1) {
+                /* Allocated data found: the whole chunk gets copied.  */
+                bdrv_set_dirty(bs, sector_num, n);
+                sector_num = next;
+            } else {
+                sector_num += n;
+            }
+        }
+    }
+
+    s->sector_num = -1;
+    for (;;) {
+        uint64_t delay_ns;
+        int64_t cnt;
+        bool should_complete;
+
+        cnt = bdrv_get_dirty_count(bs);
+        if (cnt != 0) {
+            BlockErrorAction action = BDRV_ACTION_REPORT;
+            ret = mirror_iteration(s, &action);
+            if (ret < 0 && action == BDRV_ACTION_REPORT) {
+                goto immediate_exit;
+            }
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        should_complete = false;
+        if (cnt == 0) {
+            trace_mirror_before_flush(s);
+            ret = bdrv_flush(s->target);
+            if (ret < 0) {
+                if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) {
+                    goto immediate_exit;
+                }
+            } else {
+                /* We're out of the streaming phase.  From now on, if the job
+                 * is cancelled we will actually complete all pending I/O and
+                 * report completion.  This way, block-job-cancel will leave
+                 * the target in a consistent state.
+                 */
+                s->common.offset = end * BDRV_SECTOR_SIZE;
+                if (!s->synced) {
+                    block_job_ready(&s->common);
+                    s->synced = true;
+                }
+
+                should_complete = s->should_complete ||
+                    block_job_is_cancelled(&s->common);
+                cnt = bdrv_get_dirty_count(bs);
+            }
+        }
+
+        if (cnt == 0 && should_complete) {
+            /* The dirty bitmap is not updated while operations are pending.
+             * If we're about to exit, wait for pending operations before
+             * calling bdrv_get_dirty_count(bs), or we may exit while the
+             * source has dirty data to copy!
+             *
+             * Note that I/O can be submitted by the guest while the mirror
+             * job runs.
+             */
+            trace_mirror_before_drain(s, cnt);
+            bdrv_drain_all();
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        /* Errors that reach this point were not fatal; start over clean.  */
+        ret = 0;
+        trace_mirror_before_sleep(s, cnt, s->synced);
+        if (!s->synced) {
+            /* Publish progress */
+            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit,
+                                                     BDRV_SECTORS_PER_DIRTY_CHUNK);
+            } else {
+                delay_ns = 0;
+            }
+
+            /* Note that even when no rate limit is applied we need to yield
+             * with no pending I/O here so that bdrv_drain_all() returns.
+             */
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+            if (block_job_is_cancelled(&s->common)) {
+                break;
+            }
+        } else if (!should_complete) {
+            /* Synced and idle: poll once per slice unless data is pending.  */
+            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        } else if (cnt == 0) {
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            break;
+        }
+    }
+
+immediate_exit:
+    /* NOTE(review): s->buf comes from qemu_blockalign(); confirm that g_free()
+     * is the matching deallocator on every host.  It is still NULL when we
+     * get here before the allocation - presumably the job is zero-allocated,
+     * TODO confirm.
+     */
+    g_free(s->buf);
+    bdrv_set_dirty_tracking(bs, false);
+    bdrv_iostatus_disable(s->target);
+    if (s->should_complete && ret == 0) {
+        /* Completion was requested: make the target take over from bs.  */
+        if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
+            bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
+        }
+        bdrv_swap(s->target, s->common.bs);
+    }
+    bdrv_close(s->target);
+    bdrv_delete(s->target);
+    block_job_completed(&s->common, ret);
+}
+
+/*
+ * BlockJobType.set_speed callback.  @speed is divided by BDRV_SECTOR_SIZE
+ * before it is handed to the rate limiter; a negative value is rejected
+ * with QERR_INVALID_PARAMETER and leaves the limiter untouched.
+ */
+static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    MirrorBlockJob *mjob = container_of(job, MirrorBlockJob, common);
+
+    if (speed >= 0) {
+        ratelimit_set_speed(&mjob->limit, speed / BDRV_SECTOR_SIZE,
+                            SLICE_TIME);
+    } else {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+    }
+}
+
+/* BlockJobType.iostatus_reset callback: forward the reset to the target. */
+static void mirror_iostatus_reset(BlockJob *job)
+{
+    MirrorBlockJob *mjob = container_of(job, MirrorBlockJob, common);
+    BlockDriverState *target = mjob->target;
+
+    bdrv_iostatus_reset(target);
+}
+
+/*
+ * BlockJobType.complete callback: ask a synced mirror job to finish.
+ *
+ * Fails with QERR_BLOCK_JOB_NOT_READY if the job has not reached the synced
+ * state, and with QERR_OPEN_FILE_FAILED if the target's backing file cannot
+ * be opened.  On success it sets should_complete and resumes the job; the
+ * switch-over itself is performed by mirror_run() when it exits.
+ */
+static void mirror_complete(BlockJob *job, Error **errp)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+    int ret;
+
+    /* Check readiness before touching the target, so that a premature
+     * request neither opens files nor reports an unrelated open failure.
+     */
+    if (!s->synced) {
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        return;
+    }
+
+    ret = bdrv_open_backing_file(s->target);
+    if (ret < 0) {
+        char backing_filename[PATH_MAX];
+        bdrv_get_full_backing_filename(s->target, backing_filename,
+                                       sizeof(backing_filename));
+        error_set(errp, QERR_OPEN_FILE_FAILED, backing_filename);
+        return;
+    }
+
+    s->should_complete = true;
+    block_job_resume(job);
+}
+
+/* Job-type descriptor that mirror_start() passes to block_job_create(). */
+static BlockJobType mirror_job_type = {
+    .instance_size  = sizeof(MirrorBlockJob),
+    .job_type       = "mirror",
+    .set_speed      = mirror_set_speed,
+    .iostatus_reset = mirror_iostatus_reset,
+    .complete       = mirror_complete,
+};
+
+/*
+ * Create a mirror job copying @bs to @target and enter its coroutine.
+ *
+ * A source error policy of "stop" or "enospc" is only accepted when I/O
+ * status tracking is enabled on @bs; otherwise QERR_INVALID_PARAMETER is set
+ * in @errp and nothing is created.  Errors from block_job_create() are
+ * reported through @errp as well.  @speed, @cb and @opaque are passed on to
+ * block_job_create().
+ */
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode mode,
+                  BlockdevOnError on_source_error,
+                  BlockdevOnError on_target_error,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    MirrorBlockJob *job;
+    bool source_policy_stops = on_source_error == BLOCKDEV_ON_ERROR_STOP ||
+                               on_source_error == BLOCKDEV_ON_ERROR_ENOSPC;
+
+    if (source_policy_stops && !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
+        return;
+    }
+
+    job = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp);
+    if (job == NULL) {
+        return;
+    }
+
+    job->target = target;
+    job->mode = mode;
+    job->on_source_error = on_source_error;
+    job->on_target_error = on_target_error;
+
+    /* Start recording guest writes and set up the target for the copy.  */
+    bdrv_set_dirty_tracking(bs, true);
+    bdrv_set_enable_write_cache(target, true);
+    bdrv_set_on_error(target, on_target_error, on_target_error);
+    bdrv_iostatus_enable(target);
+
+    job->common.co = qemu_coroutine_create(mirror_run);
+    trace_mirror_start(bs, job, job->common.co, opaque);
+    qemu_coroutine_enter(job->common.co, job);
+}
--- a/block/nbd.c
+++ b/block/nbd.c
@ -27,10 +27,11 @@
 */

 #include "qemu-common.h"
-#include "nbd.h"
-#include "block_int.h"
-#include "module.h"
-#include "qemu_socket.h"
+#include "block/nbd.h"
+#include "qemu/uri.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"

 #include <sys/types.h>
 #include <unistd.h>
@ -55,7 +56,6 @@ typedef struct BDRVNBDState {
    uint32_t nbdflags;
    off_t size;
    size_t blocksize;
-    char *export_name; /* An NBD server may export several devices */

    CoMutex send_mutex;
    CoMutex free_sema;
@ -65,13 +65,75 @@ typedef struct BDRVNBDState {
    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
    struct nbd_reply reply;

-    /* If it begins with  '/', this is a UNIX domain socket. Otherwise,
-     * it's a string of the form <hostname|ip4|\[ip6\]>:port
-     */
+    int is_unix;
    char *host_spec;
+    char *export_name; /* An NBD server may export several devices */
 } BDRVNBDState;

-static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
+/*
+ * Parse an NBD URI into @s.
+ *
+ * Accepted forms:
+ *   nbd://host[:port]/[export]          TCP
+ *   nbd+tcp://host[:port]/[export]      TCP
+ *   nbd+unix:///[export]?socket=path    UNIX domain socket
+ *
+ * On success sets s->is_unix and s->host_spec, plus s->export_name when the
+ * path names an export, and returns 0.  Returns -EINVAL if @filename cannot
+ * be parsed or does not match one of the forms above; in that case no string
+ * is stored in @s.
+ */
+static int nbd_parse_uri(BDRVNBDState *s, const char *filename)
+{
+    URI *uri;
+    const char *p;
+    QueryParams *qp = NULL;
+    int ret = 0;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    /* uri_parse() may hand back a URI without a scheme (for instance for a
+     * relative reference); strcmp() must not be called on NULL.
+     */
+    if (!uri->scheme) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* transport */
+    if (!strcmp(uri->scheme, "nbd")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "nbd+tcp")) {
+        s->is_unix = false;
+    } else if (!strcmp(uri->scheme, "nbd+unix")) {
+        s->is_unix = true;
+    } else {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Export name: the path with its leading slashes removed.  p points
+     * into uri (or at a literal) and stays valid until uri_free() below.
+     */
+    p = uri->path ? uri->path : "/";
+    p += strspn(p, "/");
+
+    /* UNIX transport takes exactly one query parameter, TCP takes none.  */
+    qp = query_params_parse(uri->query);
+    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (s->is_unix) {
+        /* nbd+unix:///export?socket=path */
+        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
+            ret = -EINVAL;
+            goto out;
+        }
+        s->host_spec = g_strdup(qp->p[0].value);
+    } else {
+        /* nbd[+tcp]://host:port/export */
+        if (!uri->server) {
+            ret = -EINVAL;
+            goto out;
+        }
+        if (!uri->port) {
+            uri->port = NBD_DEFAULT_PORT;
+        }
+        s->host_spec = g_strdup_printf("%s:%d", uri->server, uri->port);
+    }
+
+    /* Store the export name only once the URI has been accepted, so that a
+     * rejected URI does not leave an allocation behind in @s.
+     */
+    if (p[0]) {
+        s->export_name = g_strdup(p);
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
+}
+
+static int nbd_config(BDRVNBDState *s, const char *filename)
 {
    char *file;
    char *export_name;
@ -79,6 +141,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)
    const char *unixpath;
    int err = -EINVAL;

+    if (strstr(filename, "://")) {
+        return nbd_parse_uri(s, filename);
+    }
+
    file = g_strdup(filename);

    export_name = strstr(file, EN_OPTSTR);
@ -98,11 +164,10 @@ static int nbd_config(BDRVNBDState *s, const char *filename, int flags)

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
-        if (unixpath[0] != '/') { /* We demand  an absolute path*/
-            goto out;
-        }
+        s->is_unix = true;
        s->host_spec = g_strdup(unixpath);
    } else {
+        s->is_unix = false;
        s->host_spec = g_strdup(host_spec);
    }

@ -262,7 +327,7 @@ static int nbd_establish_connection(BlockDriverState *bs)
    off_t size;
    size_t blocksize;

-    if (s->host_spec[0] == '/') {
+    if (s->is_unix) {
        sock = unix_socket_outgoing(s->host_spec);
    } else {
        sock = tcp_socket_outgoing_spec(s->host_spec);
@ -320,7 +385,7 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
    qemu_co_mutex_init(&s->free_sema);

    /* Pop the config into our state object. Exit if invalid. */
-    result = nbd_config(s, filename, flags);
+    result = nbd_config(s, filename);
    if (result != 0) {
        return result;
    }
@ -498,6 +563,33 @@ static int64_t nbd_getlength(BlockDriverState *bs)

 static BlockDriver bdrv_nbd = {
    .format_name         = "nbd",
+    .protocol_name       = "nbd",
+    .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_file_open      = nbd_open,
+    .bdrv_co_readv       = nbd_co_readv,
+    .bdrv_co_writev      = nbd_co_writev,
+    .bdrv_close          = nbd_close,
+    .bdrv_co_flush_to_os = nbd_co_flush,
+    .bdrv_co_discard     = nbd_co_discard,
+    .bdrv_getlength      = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_tcp = {
+    .format_name         = "nbd",
+    .protocol_name       = "nbd+tcp",
+    .instance_size       = sizeof(BDRVNBDState),
+    .bdrv_file_open      = nbd_open,
+    .bdrv_co_readv       = nbd_co_readv,
+    .bdrv_co_writev      = nbd_co_writev,
+    .bdrv_close          = nbd_close,
+    .bdrv_co_flush_to_os = nbd_co_flush,
+    .bdrv_co_discard     = nbd_co_discard,
+    .bdrv_getlength      = nbd_getlength,
+};
+
+static BlockDriver bdrv_nbd_unix = {
+    .format_name         = "nbd",
+    .protocol_name       = "nbd+unix",
    .instance_size       = sizeof(BDRVNBDState),
    .bdrv_file_open      = nbd_open,
    .bdrv_co_readv       = nbd_co_readv,
@ -506,12 +598,13 @@ static BlockDriver bdrv_nbd = {
    .bdrv_co_flush_to_os = nbd_co_flush,
    .bdrv_co_discard     = nbd_co_discard,
    .bdrv_getlength      = nbd_getlength,
-    .protocol_name       = "nbd",
 };

 static void bdrv_nbd_init(void)
 {
    bdrv_register(&bdrv_nbd);
+    bdrv_register(&bdrv_nbd_tcp);
+    bdrv_register(&bdrv_nbd_unix);
 }

 block_init(bdrv_nbd_init);
--- a/block/parallels.c
+++ b/block/parallels.c
@ -24,8 +24,8 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"

 /**************************************************************/

--- a/block/qcow.c
+++ b/block/qcow.c
@ -22,11 +22,11 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
-#include "migration.h"
+#include "block/aes.h"
+#include "migration/migration.h"

 /**************************************************************/
 /* QEMU COW block driver with compression and encryption support */
@ -197,6 +197,15 @@ static int qcow_open(BlockDriverState *bs, int flags)
    return ret;
 }

+
+/*
+ * Reopen "prepare" callback, registered as .bdrv_reopen_prepare in bdrv_qcow.
+ * We have nothing to do for QCOW reopen, so this stub ignores its arguments
+ * and just returns success (0).
+ */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int qcow_set_key(BlockDriverState *bs, const char *key)
 {
    BDRVQcowState *s = bs->opaque;
@ -868,6 +877,7 @@ static BlockDriver bdrv_qcow = {
    .bdrv_probe		= qcow_probe,
    .bdrv_open		= qcow_open,
    .bdrv_close		= qcow_close,
+    .bdrv_reopen_prepare = qcow_reopen_prepare,
    .bdrv_create	= qcow_create,

    .bdrv_co_readv          = qcow_co_readv,
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */

-#include "block_int.h"
+#include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
 #include "trace.h"
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@ -25,7 +25,7 @@
 #include <zlib.h>

 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
 #include "trace.h"

@ -615,57 +615,67 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
    return cluster_offset;
 }

-int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 {
    BDRVQcowState *s = bs->opaque;
-    int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, *l2_table;
-    uint64_t cluster_offset = m->alloc_offset;
-    bool cow = false;
+    int ret;

-    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
-
-    if (m->nb_clusters == 0)
+    if (r->nb_sectors == 0) {
        return 0;
-
-    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
-
-    /* copy content of unmodified sectors */
-    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
-    if (m->n_start) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
    }

-    if (m->nb_available & (s->cluster_sectors - 1)) {
-        cow = true;
-        qemu_co_mutex_unlock(&s->lock);
-        ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
-                           align_offset(m->nb_available, s->cluster_sectors));
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0)
-            goto err;
+    qemu_co_mutex_unlock(&s->lock);
+    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
+                       r->offset / BDRV_SECTOR_SIZE,
+                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+    qemu_co_mutex_lock(&s->lock);
+
+    if (ret < 0) {
+        return ret;
    }

    /*
-     * Update L2 table.
-     *
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
-    if (cow) {
-        qcow2_cache_depends_on_flush(s->l2_table_cache);
+    qcow2_cache_depends_on_flush(s->l2_table_cache);
+
+    return 0;
+}
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i, j = 0, l2_index, ret;
+    uint64_t *old_cluster, *l2_table;
+    uint64_t cluster_offset = m->alloc_offset;
+
+    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
+    assert(m->nb_clusters > 0);
+
+    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
+
+    /* copy content of unmodified sectors */
+    ret = perform_cow(bs, m, &m->cow_start);
+    if (ret < 0) {
+        goto err;
    }

+    ret = perform_cow(bs, m, &m->cow_end);
+    if (ret < 0) {
+        goto err;
+    }
+
+    /* Update L2 table. */
+    if (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS) {
+        qcow2_mark_dirty(bs);
+    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }
+
    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
@ -743,38 +753,16 @@ out:
 }

 /*
- * Allocates new clusters for the given guest_offset.
- *
- * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
- * contain the number of clusters that have been allocated and are contiguous
- * in the image file.
- *
- * If *host_offset is non-zero, it specifies the offset in the image file at
- * which the new clusters must start. *nb_clusters can be 0 on return in this
- * case if the cluster at host_offset is already in use. If *host_offset is
- * zero, the clusters can be allocated anywhere in the image file.
- *
- * *host_offset is updated to contain the offset into the image file at which
- * the first allocated cluster starts.
- *
- * Return 0 on success and -errno in error cases. -EAGAIN means that the
- * function has been waiting for another request and the allocation must be
- * restarted, but the whole request should not be failed.
+ * Check if there already is an AIO write request in flight which allocates
+ * the same cluster. In this case we need to wait until the previous
+ * request has completed and updated the L2 table accordingly.
 */
-static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t *host_offset, unsigned int *nb_clusters)
+static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
+    unsigned int *nb_clusters)
 {
    BDRVQcowState *s = bs->opaque;
    QCowL2Meta *old_alloc;

-    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
-                                         *host_offset, *nb_clusters);
-
-    /*
-     * Check if there already is an AIO write request in flight which allocates
-     * the same cluster. In this case we need to wait until the previous
-     * request has completed and updated the L2 table accordingly.
-     */
    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset >> s->cluster_bits;
@ -807,6 +795,42 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
        abort();
    }

+    return 0;
+}
+
+/*
+ * Allocates new clusters for the given guest_offset.
+ *
+ * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
+ * contain the number of clusters that have been allocated and are contiguous
+ * in the image file.
+ *
+ * If *host_offset is non-zero, it specifies the offset in the image file at
+ * which the new clusters must start. *nb_clusters can be 0 on return in this
+ * case if the cluster at host_offset is already in use. If *host_offset is
+ * zero, the clusters can be allocated anywhere in the image file.
+ *
+ * *host_offset is updated to contain the offset into the image file at which
+ * the first allocated cluster starts.
+ *
+ * Return 0 on success and -errno in error cases. -EAGAIN means that the
+ * function has been waiting for another request and the allocation must be
+ * restarted, but the whole request should not be failed.
+ */
+static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
+    uint64_t *host_offset, unsigned int *nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
+                                         *host_offset, *nb_clusters);
+
+    ret = handle_dependencies(bs, guest_offset, nb_clusters);
+    if (ret < 0) {
+        return ret;
+    }
+
    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
@ -818,7 +842,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
        *host_offset = cluster_offset;
        return 0;
    } else {
-        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
+        ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
@ -847,7 +871,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
 * Return 0 on success and -errno in error cases
 */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m)
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
 {
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret, sectors;
@ -919,12 +943,6 @@ again:
    }

    /* If there is something left to allocate, do that now */
-    *m = (QCowL2Meta) {
-        .cluster_offset     = cluster_offset,
-        .nb_clusters        = 0,
-    };
-    qemu_co_queue_init(&m->dependent_requests);
-
    if (nb_clusters > 0) {
        uint64_t alloc_offset;
        uint64_t alloc_cluster_offset;
@ -957,22 +975,40 @@ again:
             *
             * avail_sectors: Number of sectors from the start of the first
             * newly allocated to the end of the last newly allocated cluster.
+             *
+             * nb_sectors: The number of sectors from the start of the first
+             * newly allocated cluster to the end of the area that the write
+             * request actually writes to (excluding COW at the end)
             */
            int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
            int avail_sectors = nb_clusters
                                << (s->cluster_bits - BDRV_SECTOR_BITS);
+            int alloc_n_start = keep_clusters == 0 ? n_start : 0;
+            int nb_sectors = MIN(requested_sectors, avail_sectors);

-            *m = (QCowL2Meta) {
-                .cluster_offset = keep_clusters == 0 ?
-                                  alloc_cluster_offset : cluster_offset,
+            if (keep_clusters == 0) {
+                cluster_offset = alloc_cluster_offset;
+            }
+
+            *m = g_malloc0(sizeof(**m));
+
+            **m = (QCowL2Meta) {
                .alloc_offset   = alloc_cluster_offset,
-                .offset         = alloc_offset,
-                .n_start        = keep_clusters == 0 ? n_start : 0,
+                .offset         = alloc_offset & ~(s->cluster_size - 1),
                .nb_clusters    = nb_clusters,
-                .nb_available   = MIN(requested_sectors, avail_sectors),
+                .nb_available   = nb_sectors,
+
+                .cow_start = {
+                    .offset     = 0,
+                    .nb_sectors = alloc_n_start,
+                },
+                .cow_end = {
+                    .offset     = nb_sectors * BDRV_SECTOR_SIZE,
+                    .nb_sectors = avail_sectors - nb_sectors,
+                },
            };
-            qemu_co_queue_init(&m->dependent_requests);
-            QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+            qemu_co_queue_init(&(*m)->dependent_requests);
+            QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
        }
    }

@ -984,12 +1020,13 @@ again:

    assert(sectors > n_start);
    *num = sectors - n_start;
+    *host_offset = cluster_offset;

    return 0;

 fail:
-    if (m->nb_clusters > 0) {
-        QLIST_REMOVE(m, next_in_flight);
+    if (*m && (*m)->nb_clusters > 0) {
+        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
 }
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@ -23,7 +23,7 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"

 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
@ -301,7 +301,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    do {
-        uint64_t table_clusters = size_to_clusters(s, table_size);
+        uint64_t table_clusters =
+            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            ((table_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters);
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@ -23,7 +23,7 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"

 typedef struct QEMU_PACKED QCowSnapshotHeader {
--- a/block/qcow2.c
+++ b/block/qcow2.c
@ -22,13 +22,13 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
 #include <zlib.h>
-#include "aes.h"
+#include "block/aes.h"
 #include "block/qcow2.h"
-#include "qemu-error.h"
-#include "qerror.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qerror.h"
 #include "trace.h"

 /*
@ -52,6 +52,7 @@ typedef struct {
    uint32_t magic;
    uint32_t len;
 } QCowExtension;
+
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
@ -221,7 +222,7 @@ static void report_unsupported_feature(BlockDriverState *bs,
 * updated successfully.  Therefore it is not required to check the return
 * value of this function.
 */
-static int qcow2_mark_dirty(BlockDriverState *bs)
+int qcow2_mark_dirty(BlockDriverState *bs)
 {
    BDRVQcowState *s = bs->opaque;
    uint64_t val;
@ -558,6 +559,14 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
    return 0;
 }

+/*
+ * Reopen "prepare" stub for QCOW2.  We have nothing to do for QCOW2 reopen,
+ * so this function ignores its arguments and just returns success (0).
+ */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
 {
@ -736,21 +745,6 @@ fail:
    return ret;
 }

-static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
-{
-    /* Take the request off the list of running requests */
-    if (m->nb_clusters != 0) {
-        QLIST_REMOVE(m, next_in_flight);
-    }
-
-    /* Restart all dependent requests */
-    if (!qemu_co_queue_empty(&m->dependent_requests)) {
-        qemu_co_mutex_unlock(&s->lock);
-        qemu_co_queue_restart_all(&m->dependent_requests);
-        qemu_co_mutex_lock(&s->lock);
-    }
-}
-
 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                           int64_t sector_num,
                           int remaining_sectors,
@ -765,15 +759,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL;
-    QCowL2Meta l2meta = {
-        .nb_clusters = 0,
-    };
+    QCowL2Meta *l2meta;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
                                 remaining_sectors);

-    qemu_co_queue_init(&l2meta.dependent_requests);
-
    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */
@ -782,6 +772,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,

    while (remaining_sectors != 0) {

+        l2meta = NULL;
+
        trace_qcow2_writev_start_part(qemu_coroutine_self());
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n_end = index_in_cluster + remaining_sectors;
@ -791,17 +783,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
        }

        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

-        if (l2meta.nb_clusters > 0 &&
-            (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
-            qcow2_mark_dirty(bs);
-        }
-
-        cluster_offset = l2meta.cluster_offset;
        assert((cluster_offset & 511) == 0);

        qemu_iovec_reset(&hd_qiov);
@ -826,8 +812,8 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                cur_nr_sectors * 512);
        }

-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        qemu_co_mutex_unlock(&s->lock);
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(),
                                (cluster_offset >> 9) + index_in_cluster);
        ret = bdrv_co_writev(bs->file,
@ -838,12 +824,24 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
            goto fail;
        }

-        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
-        if (ret < 0) {
-            goto fail;
-        }
+        if (l2meta != NULL) {
+            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+            if (ret < 0) {
+                goto fail;
+            }

-        run_dependent_requests(s, &l2meta);
+            /* Take the request off the list of running requests */
+            if (l2meta->nb_clusters != 0) {
+                QLIST_REMOVE(l2meta, next_in_flight);
+            }
+
+            qemu_co_mutex_unlock(&s->lock);
+            qemu_co_queue_restart_all(&l2meta->dependent_requests);
+            qemu_co_mutex_lock(&s->lock);
+
+            g_free(l2meta);
+            l2meta = NULL;
+        }

        remaining_sectors -= cur_nr_sectors;
        sector_num += cur_nr_sectors;
@ -853,10 +851,16 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
    ret = 0;

 fail:
-    run_dependent_requests(s, &l2meta);
-
    qemu_co_mutex_unlock(&s->lock);

+    if (l2meta != NULL) {
+        if (l2meta->nb_clusters != 0) {
+            QLIST_REMOVE(l2meta, next_in_flight);
+        }
+        qemu_co_queue_restart_all(&l2meta->dependent_requests);
+        g_free(l2meta);
+    }
+
    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
@ -1087,6 +1091,7 @@ int qcow2_update_header(BlockDriverState *bs)
            goto fail;
        }

+        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, bs->backing_file, buflen);

        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
@ -1118,31 +1123,33 @@ static int preallocate(BlockDriverState *bs)
 {
    uint64_t nb_sectors;
    uint64_t offset;
+    uint64_t host_offset = 0;
    int num;
    int ret;
-    QCowL2Meta meta;
+    QCowL2Meta *meta;

    nb_sectors = bdrv_getlength(bs) >> 9;
    offset = 0;
-    qemu_co_queue_init(&meta.dependent_requests);
-    meta.cluster_offset = 0;

    while (nb_sectors) {
        num = MIN(nb_sectors, INT_MAX >> 9);
-        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+                                         &host_offset, &meta);
        if (ret < 0) {
            return ret;
        }

-        ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+        ret = qcow2_alloc_cluster_link_l2(bs, meta);
        if (ret < 0) {
-            qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters);
            return ret;
        }

        /* There are no dependent requests, but we need to remove our request
         * from the list of in-flight requests */
-        run_dependent_requests(bs->opaque, &meta);
+        if (meta != NULL) {
+            QLIST_REMOVE(meta, next_in_flight);
+        }

        /* TODO Preallocate data if requested */

@ -1155,10 +1162,10 @@ static int preallocate(BlockDriverState *bs)
     * all of the allocated clusters (otherwise we get failing reads after
     * EOF). Extend the image to the last allocated sector.
     */
-    if (meta.cluster_offset != 0) {
+    if (host_offset != 0) {
        uint8_t buf[512];
        memset(buf, 0, 512);
-        ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
        if (ret < 0) {
            return ret;
        }
@ -1679,6 +1686,7 @@ static BlockDriver bdrv_qcow2 = {
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
+    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
    .bdrv_create        = qcow2_create,
    .bdrv_co_is_allocated = qcow2_co_is_allocated,
    .bdrv_set_key       = qcow2_set_key,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@ -25,8 +25,8 @@
 #ifndef BLOCK_QCOW2_H
 #define BLOCK_QCOW2_H

-#include "aes.h"
-#include "qemu-coroutine.h"
+#include "block/aes.h"
+#include "block/coroutine.h"

 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@ -196,17 +196,56 @@ typedef struct QCowCreateState {

 struct QCowAIOCB;

-/* XXX This could be private for qcow2-cluster.c */
+typedef struct Qcow2COWRegion {
+    /**
+     * Offset of the COW region in bytes from the start of the first cluster
+     * touched by the request.
+     */
+    uint64_t    offset;
+
+    /** Number of sectors to copy */
+    int         nb_sectors;
+} Qcow2COWRegion;
+
+/**
+ * Describes an in-flight (part of a) write request that writes to clusters
+ * that are not referenced in their L2 table yet.
+ */
 typedef struct QCowL2Meta
 {
+    /** Guest offset of the first newly allocated cluster */
    uint64_t offset;
-    uint64_t cluster_offset;
+
+    /** Host offset of the first newly allocated cluster */
    uint64_t alloc_offset;
-    int n_start;
+
+    /**
+     * Number of sectors from the start of the first allocated cluster to
+     * the end of the (possibly shortened) request
+     */
    int nb_available;
+
+    /** Number of newly allocated clusters */
    int nb_clusters;
+
+    /**
+     * Requests that overlap with this allocation and wait to be restarted
+     * when the allocating request has completed.
+     */
    CoQueue dependent_requests;

+    /**
+     * The COW Region between the start of the first allocated cluster and the
+     * area the guest actually writes to.
+     */
+    Qcow2COWRegion cow_start;
+
+    /**
+     * The COW Region between the area the guest actually writes to and the
+     * end of the last allocated cluster.
+     */
+    Qcow2COWRegion cow_end;
+
    QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;

@ -264,6 +303,8 @@ static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                  int64_t sector_num, int nb_sectors);
+
+int qcow2_mark_dirty(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);

 /* qcow2-refcount.c functions */
@ -297,7 +338,7 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *cluster_offset);
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-    int n_start, int n_end, int *num, QCowL2Meta *m);
+    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m);
 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                         uint64_t offset,
                                         int compressed_size);
--- a/block/qed-table.c
+++ b/block/qed-table.c
@ -13,7 +13,7 @@
 */

 #include "trace.h"
-#include "qemu_socket.h" /* for EINPROGRESS on Windows */
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
 #include "qed.h"

 typedef struct {
@ -103,7 +103,6 @@ static void qed_write_table_cb(void *opaque, int ret)
 out:
    qemu_vfree(write_table_cb->table);
    gencb_complete(&write_table_cb->gencb, ret);
-    return;
 }

 /**
--- a/block/qed.c
+++ b/block/qed.c
@ -12,11 +12,11 @@
 *
 */

-#include "qemu-timer.h"
+#include "qemu/timer.h"
 #include "trace.h"
 #include "qed.h"
-#include "qerror.h"
-#include "migration.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"

 static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
 {
@ -30,7 +30,7 @@ static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
    }
 }

-static AIOPool qed_aio_pool = {
+static const AIOCBInfo qed_aiocb_info = {
    .aiocb_size         = sizeof(QEDAIOCB),
    .cancel             = qed_aio_cancel,
 };
@ -505,6 +505,14 @@ out:
    return ret;
 }

+/*
+ * Reopen-prepare hook for QED.  There is nothing to do for a QED reopen,
+ * so this stub ignores all of its arguments and always returns 0
+ * (success).
+ */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+                                   BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static void bdrv_qed_close(BlockDriverState *bs)
 {
    BDRVQEDState *s = bs->opaque;
@ -1303,7 +1311,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
                                       BlockDriverCompletionFunc *cb,
                                       void *opaque, int flags)
 {
-    QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
+    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);

    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
                        opaque, flags);
@ -1363,10 +1371,21 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
                                                 int nb_sectors)
 {
    BlockDriverAIOCB *blockacb;
+    BDRVQEDState *s = bs->opaque;
    QEDWriteZeroesCB cb = { .done = false };
    QEMUIOVector qiov;
    struct iovec iov;

+    /* Refuse if there are untouched backing file sectors */
+    if (bs->backing_hd) {
+        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
+        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+            return -ENOTSUP;
+        }
+    }
+
    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
     * then it will be allocated during request processing.
     */
@ -1553,6 +1572,7 @@ static BlockDriver bdrv_qed = {
    .bdrv_rebind              = bdrv_qed_rebind,
    .bdrv_open                = bdrv_qed_open,
    .bdrv_close               = bdrv_qed_close,
+    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
    .bdrv_create              = bdrv_qed_create,
    .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
    .bdrv_make_empty          = bdrv_qed_make_empty,
--- a/block/qed.h
+++ b/block/qed.h
@ -15,7 +15,7 @@
 #ifndef BLOCK_QED_H
 #define BLOCK_QED_H

-#include "block_int.h"
+#include "block/block_int.h"

 /* The layout of a QED file is as follows:
 *
--- a/block/raw-posix-aio.h
+++ b/block/raw-posix-aio.h
@ -1,5 +1,5 @@
 /*
- * QEMU Posix block I/O backend AIO support
+ * Declarations for AIO in the raw protocol
 *
 * Copyright IBM, Corp. 2008
 *
@ -12,8 +12,8 @@
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
-#ifndef QEMU_RAW_POSIX_AIO_H
-#define QEMU_RAW_POSIX_AIO_H
+#ifndef QEMU_RAW_AIO_H
+#define QEMU_RAW_AIO_H

 /* AIO request types */
 #define QEMU_AIO_READ         0x0001
@ -27,19 +27,22 @@
 #define QEMU_AIO_MISALIGNED   0x1000


-/* posix-aio-compat.c - thread pool based implementation */
-int paio_init(void);
-BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int type);
-BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
-        unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque);
-
 /* linux-aio.c - Linux native implementation */
+#ifdef CONFIG_LINUX_AIO
 void *laio_init(void);
 BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif

-#endif /* QEMU_RAW_POSIX_AIO_H */
+#ifdef _WIN32
+typedef struct QEMUWin32AIOState QEMUWin32AIOState;
+QEMUWin32AIOState *win32_aio_init(void);
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
+
+#endif /* QEMU_RAW_AIO_H */
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@ -22,12 +22,14 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "qemu-char.h"
-#include "qemu-log.h"
-#include "block_int.h"
-#include "module.h"
-#include "block/raw-posix-aio.h"
+#include "qemu/timer.h"
+#include "qemu/log.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
+#include "raw-aio.h"

 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
@ -133,16 +135,36 @@ typedef struct BDRVRawState {
    int use_aio;
    void *aio_ctx;
 #endif
-    uint8_t *aligned_buf;
-    unsigned aligned_buf_size;
 #ifdef CONFIG_XFS
    bool is_xfs : 1;
 #endif
 } BDRVRawState;

+/*
+ * State staged by raw_reopen_prepare() and stored in
+ * BDRVReopenState.opaque.  raw_reopen_commit() copies it into the
+ * BDRVRawState; raw_reopen_abort() closes the descriptor and discards it.
+ */
+typedef struct BDRVRawReopenState {
+    int fd;             /* descriptor opened for the new flags, -1 if none */
+    int open_flags;     /* open(2)-style flags computed for the reopen */
+#ifdef CONFIG_LINUX_AIO
+    int use_aio;        /* use_aio value to install on commit */
+#endif
+} BDRVRawReopenState;
+
 static int fd_open(BlockDriverState *bs);
 static int64_t raw_getlength(BlockDriverState *bs);

+/*
+ * Description of one request handed to aio_worker().  The object is
+ * allocated with g_slice_new() by the submitter and released by
+ * aio_worker() once the request has been carried out.
+ */
+typedef struct RawPosixAIOData {
+    BlockDriverState *bs;       /* device the request belongs to */
+    int aio_fildes;             /* file descriptor to operate on */
+    union {
+        struct iovec *aio_iov;  /* scatter/gather list for read/write */
+        void *aio_ioctl_buf;    /* argument buffer for QEMU_AIO_IOCTL */
+    };
+    int aio_niov;               /* number of entries in aio_iov */
+    size_t aio_nbytes;          /* total length of the transfer in bytes */
+/* ioctl requests reuse the aio_nbytes storage for the command number */
+#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
+    off_t aio_offset;           /* byte offset of the transfer in the file */
+    int aio_type;               /* QEMU_AIO_* request type and modifiers */
+} RawPosixAIOData;
+
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static int cdrom_reopen(BlockDriverState *bs);
 #endif
@ -185,6 +207,57 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif

+/*
+ * Translate block-layer flags into open(2)-style flags.
+ *
+ * @bdrv_flags: BDRV_O_* flags to translate
+ * @open_flags: in/out; must not be NULL.  Bits already set by the caller
+ *              are kept, except for the access mode, which is always
+ *              replaced.
+ */
+static void raw_parse_flags(int bdrv_flags, int *open_flags)
+{
+    assert(open_flags != NULL);
+
+    *open_flags |= O_BINARY;
+    /* Drop whatever access mode was there and derive it from BDRV_O_RDWR */
+    *open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        *open_flags |= O_RDWR;
+    } else {
+        *open_flags |= O_RDONLY;
+    }
+
+    /* O_DIRECT is requested for BDRV_O_NOCACHE.  No other cache-related
+     * flag is set here; in particular this function never sets O_DSYNC. */
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        *open_flags |= O_DIRECT;
+    }
+}
+
+#ifdef CONFIG_LINUX_AIO
+/*
+ * Decide whether Linux native AIO is used for the given block-layer flags.
+ *
+ * @aio_ctx:    in/out AIO context; created with laio_init() on first use
+ *              and left untouched when it is already non-NULL
+ * @use_aio:    out; set to 1 when native AIO is to be used, 0 otherwise
+ * @bdrv_flags: BDRV_O_* flags of the image
+ *
+ * Returns 0 on success, -1 if the AIO context could not be created (in
+ * which case *use_aio is not modified).
+ */
+static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+{
+    const int native_aio_flags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
+
+    assert(aio_ctx != NULL);
+    assert(use_aio != NULL);
+
+    /*
+     * Currently Linux does AIO only for files opened with O_DIRECT,
+     * so the NOCACHE flag has to be set as well.
+     */
+    if ((bdrv_flags & native_aio_flags) != native_aio_flags) {
+        *use_aio = 0;
+        return 0;
+    }
+
+    /* if non-NULL, laio_init() has already been run */
+    if (*aio_ctx == NULL) {
+        *aio_ctx = laio_init();
+        if (*aio_ctx == NULL) {
+            return -1;
+        }
+    }
+
+    *use_aio = 1;
+    return 0;
+}
+#endif
+
 static int raw_open_common(BlockDriverState *bs, const char *filename,
                           int bdrv_flags, int open_flags)
 {
@ -196,20 +269,8 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
        return ret;
    }

-    s->open_flags = open_flags | O_BINARY;
-    s->open_flags &= ~O_ACCMODE;
-    if (bdrv_flags & BDRV_O_RDWR) {
-        s->open_flags |= O_RDWR;
-    } else {
-        s->open_flags |= O_RDONLY;
-    }
-
-    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
-     * and O_DIRECT for no caching. */
-    if ((bdrv_flags & BDRV_O_NOCACHE))
-        s->open_flags |= O_DIRECT;
-    if (!(bdrv_flags & BDRV_O_CACHE_WB))
-        s->open_flags |= O_DSYNC;
+    s->open_flags = open_flags;
+    raw_parse_flags(bdrv_flags, &s->open_flags);

    s->fd = -1;
    fd = qemu_open(filename, s->open_flags, 0644);
@ -220,45 +281,13 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
        return ret;
    }
    s->fd = fd;
-    s->aligned_buf = NULL;
-
-    if ((bdrv_flags & BDRV_O_NOCACHE)) {
-        /*
-         * Allocate a buffer for read/modify/write cycles.  Chose the size
-         * pessimistically as we don't know the block size yet.
-         */
-        s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
-        s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
-        if (s->aligned_buf == NULL) {
-            goto out_close;
-        }
-    }
-
-    /* We're falling back to POSIX AIO in some cases so init always */
-    if (paio_init() < 0) {
-        goto out_free_buf;
-    }

 #ifdef CONFIG_LINUX_AIO
-    /*
-     * Currently Linux do AIO only for files opened with O_DIRECT
-     * specified so check NOCACHE flag too
-     */
-    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
-
-        s->aio_ctx = laio_init();
-        if (!s->aio_ctx) {
-            goto out_free_buf;
-        }
-        s->use_aio = 1;
-    } else
-#endif
-    {
-#ifdef CONFIG_LINUX_AIO
-        s->use_aio = 0;
-#endif
+    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
+        qemu_close(fd);
+        return -errno;
    }
+#endif

 #ifdef CONFIG_XFS
    if (platform_test_xfs_fd(s->fd)) {
@ -267,12 +296,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
 #endif

    return 0;
-
-out_free_buf:
-    qemu_vfree(s->aligned_buf);
-out_close:
-    qemu_close(fd);
-    return -errno;
 }

 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
@ -283,6 +306,113 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
    return raw_open_common(bs, filename, flags, 0);
 }

+/*
+ * Prepare a reopen of a raw-posix image with the flags in state->flags.
+ *
+ * A BDRVRawReopenState is allocated and stored in state->opaque.  It
+ * receives the new open flags and a new file descriptor, which is
+ * obtained by duplicating the current descriptor and adjusting its flags
+ * with fcntl when only fcntl-changeable flags differ, or by opening
+ * state->bs->filename again otherwise.  The BDRVRawState itself is only
+ * touched through raw_set_aio(), which may create s->aio_ctx.
+ *
+ * @queue and @errp are not used.
+ *
+ * Returns 0 on success and a non-zero value on failure.  On failure the
+ * staged descriptor is -1; state->opaque stays allocated.
+ */
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    BDRVRawState *s;
+    BDRVRawReopenState *raw_s;
+    /* Flags that can be changed on an open descriptor with fcntl */
+    int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+    int ret = 0;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    s = state->bs->opaque;
+
+    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+    raw_s = state->opaque;
+
+    /* Mark the descriptor invalid right away: the zero left by g_malloc0()
+     * would otherwise pass the "fd >= 0" test in raw_reopen_abort() if we
+     * bail out before a descriptor has been obtained. */
+    raw_s->fd = -1;
+
+#ifdef CONFIG_LINUX_AIO
+    raw_s->use_aio = s->use_aio;
+
+    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
+     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
+     * won't override aio_ctx if aio_ctx is non-NULL */
+    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
+        return -1;
+    }
+#endif
+
+    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
+        raw_s->open_flags |= O_NONBLOCK;
+    }
+
+    raw_parse_flags(state->flags, &raw_s->open_flags);
+
+#ifdef O_NOATIME
+    fcntl_flags |= O_NOATIME;
+#endif
+
+    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+        /* dup the original fd */
+        /* TODO: use qemu fcntl wrapper */
+#ifdef F_DUPFD_CLOEXEC
+        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
+#else
+        raw_s->fd = dup(s->fd);
+        if (raw_s->fd != -1) {
+            qemu_set_cloexec(raw_s->fd);
+        }
+#endif
+        if (raw_s->fd >= 0) {
+            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
+            if (ret) {
+                qemu_close(raw_s->fd);
+                raw_s->fd = -1;
+            }
+        }
+    }
+
+    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+    if (raw_s->fd == -1) {
+        assert(!(raw_s->open_flags & O_CREAT));
+        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
+        /* The fallback alone decides the outcome: an error left in ret by
+         * a failed fcntl_setfl() above must not be reported once the file
+         * has been reopened successfully. */
+        ret = (raw_s->fd == -1) ? -1 : 0;
+    }
+    return ret;
+}
+
+
+/*
+ * Commit a prepared reopen: the old descriptor is closed and replaced by
+ * the staged one, the staged open flags (and use_aio, when Linux AIO is
+ * configured) are installed, and the staging state is released.
+ */
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+    BDRVRawState *s = state->bs->opaque;
+    BDRVRawReopenState *staged = state->opaque;
+
+    qemu_close(s->fd);
+    s->fd = staged->fd;
+    s->open_flags = staged->open_flags;
+#ifdef CONFIG_LINUX_AIO
+    s->use_aio = staged->use_aio;
+#endif
+
+    g_free(staged);
+    state->opaque = NULL;
+}
+
+
+/*
+ * Abort a reopen: close the staged descriptor, if there is one, and
+ * release the staging state.  The BDRVRawState is left untouched.
+ */
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+    BDRVRawReopenState *staged = state->opaque;
+
+    /* nothing to do if NULL, we didn't get far enough */
+    if (!staged) {
+        return;
+    }
+
+    if (staged->fd >= 0) {
+        qemu_close(staged->fd);
+        staged->fd = -1;
+    }
+
+    g_free(staged);
+    state->opaque = NULL;
+}
+
+
 /* XXX: use host sector size if necessary with:
 #ifdef DIOCGSECTORSIZE
        {
@ -316,6 +446,267 @@ static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
    return 1;
 }

+/*
+ * Carry out the ioctl described by @aiocb: command aio_ioctl_cmd with
+ * argument aio_ioctl_buf on descriptor aio_fildes.
+ *
+ * Returns 0 on success or -errno if ioctl() fails.
+ */
+static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
+{
+    int ret;
+
+    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
+    if (ret == -1) {
+        return -errno;
+    }
+
+    /*
+     * aio_worker() passes this value on unchanged as the result of the
+     * request, so success has to be reported as 0.  Returning aio_nbytes
+     * would hand back the ioctl command number (aio_ioctl_cmd aliases
+     * aio_nbytes), i.e. a non-zero result for a successful ioctl.
+     */
+    return 0;
+}
+
+/*
+ * Flush the data of the request's file descriptor with qemu_fdatasync().
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
+{
+    if (qemu_fdatasync(aiocb->aio_fildes) == -1) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+/*
+ * Vectored positional I/O wrappers.  With CONFIG_PREADV they forward to
+ * preadv()/pwritev(); without it they are stubs that return -ENOSYS.
+ * preadv_present tells handle_aiocb_rw() whether the vectored path is
+ * worth trying; that function also clears it at run time when the
+ * vectored path has to be given up.
+ */
+#ifdef CONFIG_PREADV
+
+static bool preadv_present = true;
+
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return preadv(fd, iov, nr_iov, offset);
+}
+
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return pwritev(fd, iov, nr_iov, offset);
+}
+
+#else
+
+static bool preadv_present = false;
+
+/* Stub: vectored reads are not available in this build */
+static ssize_t
+qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return -ENOSYS;
+}
+
+/* Stub: vectored writes are not available in this build */
+static ssize_t
+qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
+{
+    return -ENOSYS;
+}
+
+#endif
+
+/*
+ * Perform the request with a single vectored call (qemu_pwritev() for
+ * QEMU_AIO_WRITE requests, qemu_preadv() otherwise), retrying while the
+ * call is interrupted (EINTR).
+ *
+ * Returns the value of the vectored call, or -errno if it returned -1.
+ */
+static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
+{
+    const bool is_write = aiocb->aio_type & QEMU_AIO_WRITE;
+    ssize_t len;
+
+    do {
+        if (is_write) {
+            len = qemu_pwritev(aiocb->aio_fildes, aiocb->aio_iov,
+                               aiocb->aio_niov, aiocb->aio_offset);
+        } else {
+            len = qemu_preadv(aiocb->aio_fildes, aiocb->aio_iov,
+                              aiocb->aio_niov, aiocb->aio_offset);
+        }
+    } while (len == -1 && errno == EINTR);
+
+    return len == -1 ? -errno : len;
+}
+
+/*
+ * Reads/writes the data to/from a given linear buffer.
+ *
+ * @aiocb: request; aio_type selects pwrite() (QEMU_AIO_WRITE) or pread()
+ * @buf:   linear buffer of at least aio_nbytes bytes
+ *
+ * Returns the number of bytes handled or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
+static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
+{
+    ssize_t offset = 0;
+    ssize_t len;
+
+    /* Keep going until everything is transferred; each pass continues
+     * where the previous partial transfer stopped. */
+    while (offset < aiocb->aio_nbytes) {
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            len = pwrite(aiocb->aio_fildes,
+                         (const char *)buf + offset,
+                         aiocb->aio_nbytes - offset,
+                         aiocb->aio_offset + offset);
+        } else {
+            len = pread(aiocb->aio_fildes,
+                        buf + offset,
+                        aiocb->aio_nbytes - offset,
+                        aiocb->aio_offset + offset);
+        }
+        if (len == -1 && errno == EINTR) {
+            /* interrupted: retry the same range */
+            continue;
+        } else if (len == -1) {
+            /* hard error: report -errno instead of a byte count */
+            offset = -errno;
+            break;
+        } else if (len == 0) {
+            /* nothing transferred: stop and return the count so far */
+            break;
+        }
+        offset += len;
+    }
+
+    return offset;
+}
+
+/*
+ * Carry out a read or write request.
+ *
+ * Requests not flagged QEMU_AIO_MISALIGNED are done in place: a single
+ * iovec goes through handle_aiocb_rw_linear(), several iovecs through
+ * handle_aiocb_rw_vector() while preadv_present is set.  In all other
+ * cases the data is bounced through one buffer obtained from
+ * qemu_blockalign().
+ *
+ * Returns the number of bytes handled or a negative errno value.
+ */
+static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
+{
+    ssize_t nbytes;
+    char *buf;
+
+    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
+        /*
+         * If there is just a single buffer, and it is properly aligned
+         * we can just use plain pread/pwrite without any problems.
+         */
+        if (aiocb->aio_niov == 1) {
+             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
+        }
+        /*
+         * We have more than one iovec, and all are properly aligned.
+         *
+         * Try preadv/pwritev first and fall back to linearizing the
+         * buffer if it's not supported.
+         */
+        if (preadv_present) {
+            nbytes = handle_aiocb_rw_vector(aiocb);
+            /* Done on a complete transfer or on any error but -ENOSYS */
+            if (nbytes == aiocb->aio_nbytes ||
+                (nbytes < 0 && nbytes != -ENOSYS)) {
+                return nbytes;
+            }
+            /* Reached on -ENOSYS and on short transfers alike: do not
+             * try the vectored path again. */
+            preadv_present = false;
+        }
+
+        /*
+         * XXX(hch): short read/write.  no easy way to handle the remainder
+         * using these interfaces.  For now retry using plain
+         * pread/pwrite?
+         */
+    }
+
+    /*
+     * Ok, we have to do it the hard way, copy all segments into
+     * a single aligned buffer.
+     */
+    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
+    if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        char *p = buf;
+        int i;
+
+        /* Gather the segments to be written into the bounce buffer */
+        for (i = 0; i < aiocb->aio_niov; ++i) {
+            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
+            p += aiocb->aio_iov[i].iov_len;
+        }
+    }
+
+    nbytes = handle_aiocb_rw_linear(aiocb, buf);
+    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+        char *p = buf;
+        size_t count = aiocb->aio_nbytes, copy;
+        int i;
+
+        /* Scatter the bounce buffer back into the segments.  Up to
+         * aio_nbytes bytes are copied regardless of nbytes. */
+        for (i = 0; i < aiocb->aio_niov && count; ++i) {
+            copy = count;
+            if (copy > aiocb->aio_iov[i].iov_len) {
+                copy = aiocb->aio_iov[i].iov_len;
+            }
+            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
+            p     += copy;
+            count -= copy;
+        }
+    }
+    qemu_vfree(buf);
+
+    return nbytes;
+}
+
+/*
+ * Worker function submitted through thread_pool_submit_aio(): carries out
+ * the request described by @arg (a RawPosixAIOData) and frees it.
+ *
+ * For reads and writes a complete transfer is reported as 0 and a short
+ * one as -EINVAL; negative results of the handlers are passed on.  The
+ * results of the flush and ioctl handlers are returned unchanged.  An
+ * unknown request type yields -EINVAL.
+ */
+static int aio_worker(void *arg)
+{
+    RawPosixAIOData *aiocb = arg;
+    ssize_t ret = 0;
+
+    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+    case QEMU_AIO_READ:
+        ret = handle_aiocb_rw(aiocb);
+        /* A short read on a growable device is completed by zero-filling
+         * the part of the iovec that was not read. */
+        if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) {
+            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
+                      0, aiocb->aio_nbytes - ret);
+
+            ret = aiocb->aio_nbytes;
+        }
+        if (ret == aiocb->aio_nbytes) {
+            ret = 0;
+        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_WRITE:
+        ret = handle_aiocb_rw(aiocb);
+        if (ret == aiocb->aio_nbytes) {
+            ret = 0;
+        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_FLUSH:
+        ret = handle_aiocb_flush(aiocb);
+        break;
+    case QEMU_AIO_IOCTL:
+        ret = handle_aiocb_ioctl(aiocb);
+        break;
+    default:
+        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+        ret = -EINVAL;
+        break;
+    }
+
+    /* The request descriptor is owned by the worker from here on */
+    g_slice_free(RawPosixAIOData, aiocb);
+    return ret;
+}
+
+/*
+ * Queue a request of the given QEMU_AIO_* @type on @fd for execution by
+ * aio_worker() in the thread pool.
+ *
+ * @qiov may be NULL; the request then carries no iovec.  @sector_num and
+ * @nb_sectors are counted in units of 512 bytes.  @cb and @opaque are
+ * handed to thread_pool_submit_aio() together with the request.
+ *
+ * Returns the value of thread_pool_submit_aio().
+ */
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
+
+    acb->bs = bs;
+    acb->aio_type = type;
+    acb->aio_fildes = fd;
+
+    if (qiov) {
+        acb->aio_iov = qiov->iov;
+        acb->aio_niov = qiov->niov;
+    } else {
+        /* g_slice_new() does not clear the object; do not leave the
+         * iovec fields indeterminate for requests without data. */
+        acb->aio_iov = NULL;
+        acb->aio_niov = 0;
+    }
+    /* Widen before multiplying so the byte count is computed in size_t */
+    acb->aio_nbytes = (size_t)nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;
+
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
 static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
@ -330,7 +721,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
     * boundary.  Check if this is the case or tell the low-level
     * driver that it needs to copy the buffer.
     */
-    if (s->aligned_buf) {
+    if ((bs->open_flags & BDRV_O_NOCACHE)) {
        if (!qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
@ -378,8 +769,6 @@ static void raw_close(BlockDriverState *bs)
    if (s->fd >= 0) {
        qemu_close(s->fd);
        s->fd = -1;
-        if (s->aligned_buf != NULL)
-            qemu_vfree(s->aligned_buf);
    }
 }

@ -735,6 +1124,9 @@ static BlockDriver bdrv_file = {
    .instance_size = sizeof(BDRVRawState),
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_file_open = raw_open,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit = raw_reopen_commit,
+    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_co_discard = raw_co_discard,
@ -937,10 +1329,19 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData *acb;

    if (fd_open(bs) < 0)
        return NULL;
-    return paio_ioctl(bs, s->fd, req, buf, cb, opaque);
+
+    acb = g_slice_new(RawPosixAIOData);
+    acb->bs = bs;
+    acb->aio_type = QEMU_AIO_IOCTL;
+    acb->aio_fildes = s->fd;
+    acb->aio_offset = 0;
+    acb->aio_ioctl_buf = buf;
+    acb->aio_ioctl_cmd = req;
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
 }

 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@ -1004,6 +1405,9 @@ static BlockDriver bdrv_host_device = {
    .bdrv_probe_device  = hdev_probe_device,
    .bdrv_file_open     = hdev_open,
    .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create        = hdev_create,
    .create_options     = raw_create_options,
    .bdrv_has_zero_init = hdev_has_zero_init,
@ -1125,6 +1529,9 @@ static BlockDriver bdrv_host_floppy = {
    .bdrv_probe_device	= floppy_probe_device,
    .bdrv_file_open     = floppy_open,
    .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create        = hdev_create,
    .create_options     = raw_create_options,
    .bdrv_has_zero_init = hdev_has_zero_init,
@ -1224,6 +1631,9 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_probe_device	= cdrom_probe_device,
    .bdrv_file_open     = cdrom_open,
    .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create        = hdev_create,
    .create_options     = raw_create_options,
    .bdrv_has_zero_init = hdev_has_zero_init,
@ -1343,6 +1753,9 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_probe_device	= cdrom_probe_device,
    .bdrv_file_open     = cdrom_open,
    .bdrv_close         = raw_close,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit  = raw_reopen_commit,
+    .bdrv_reopen_abort   = raw_reopen_abort,
    .bdrv_create        = hdev_create,
    .create_options     = raw_create_options,
    .bdrv_has_zero_init = hdev_has_zero_init,
@ -1363,6 +1776,40 @@ static BlockDriver bdrv_host_cdrom = {
 };
 #endif /* __FreeBSD__ */

+#ifdef CONFIG_LINUX_AIO
+/**
+ * Hand out the host file descriptor behind an image that uses Linux AIO.
+ *
+ * This is a layering violation and should be removed once the block layer
+ * can be called outside the global mutex.  Until then it lets the caller
+ * hijack the descriptor and perform I/O without going through the block
+ * layer.
+ *
+ * Returns the descriptor on success, -ENOMEDIUM when @bs has no driver, and
+ * -ENOTSUP when the node is not served by raw_aio_readv or does not have
+ * use_aio set.
+ */
+int raw_get_aio_fd(BlockDriverState *bs)
+{
+    BlockDriverState *node = bs;
+    BDRVRawState *s;
+
+    if (node->drv == NULL) {
+        return -ENOMEDIUM;
+    }
+
+    /* Look underneath the "raw" format driver at the node it wraps */
+    if (node->drv == bdrv_find_format("raw")) {
+        node = node->file;
+    }
+
+    /* raw-posix has several protocols so just check for raw_aio_readv */
+    if (node->drv->bdrv_aio_readv != raw_aio_readv) {
+        return -ENOTSUP;
+    }
+
+    s = node->opaque;
+    return s->use_aio ? s->fd : -ENOTSUP;
+}
+#endif /* CONFIG_LINUX_AIO */
+
 static void bdrv_file_init(void)
 {
    /*
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@ -22,9 +22,13 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "qemu-timer.h"
-#include "block_int.h"
-#include "module.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "raw-aio.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/iov.h"
 #include <windows.h>
 #include <winioctl.h>

@ -32,12 +36,130 @@
 #define FTYPE_CD     1
 #define FTYPE_HARDDISK 2

+static QEMUWin32AIOState *aio;
+
+/* One request handed to the thread pool; filled in by paio_submit() and
+ * consumed (then freed) by aio_worker(). */
+typedef struct RawWin32AIOData {
+    BlockDriverState *bs;       /* device the request was issued on */
+    HANDLE hfile;               /* handle passed to ReadFile/WriteFile/FlushFileBuffers */
+    struct iovec *aio_iov;      /* scatter/gather elements, taken from the QEMUIOVector */
+    int aio_niov;               /* number of elements in aio_iov */
+    size_t aio_nbytes;          /* request length in bytes (nb_sectors * 512) */
+    off64_t aio_offset;         /* starting byte offset (sector_num * 512) */
+    int aio_type;               /* QEMU_AIO_* request type */
+} RawWin32AIOData;
+
 typedef struct BDRVRawState {
    HANDLE hfile;
    int type;
    char drive_path[16]; /* format: "d:\" */
+    QEMUWin32AIOState *aio;
 } BDRVRawState;

+/*
+ * Transfers the request's iovec elements one by one, issuing a single
+ * ReadFile() or WriteFile() per element at consecutive file offsets that
+ * start at aio_offset.  QEMU_AIO_WRITE in aio_type selects the direction.
+ *
+ * Returns the number of bytes handled.  The walk stops at the first element
+ * that is not transferred in full, and such an element adds nothing to the
+ * total.  No negative error code is ever produced: the return type is
+ * unsigned, so callers detect failure by comparing against aio_nbytes.
+ */
+static size_t handle_aiocb_rw(RawWin32AIOData *aiocb)
+{
+    size_t done = 0;
+    int idx = 0;
+
+    while (idx < aiocb->aio_niov) {
+        struct iovec *vec = &aiocb->aio_iov[idx];
+        DWORD ok, moved, want;
+        OVERLAPPED ov;
+
+        /* The absolute file position travels in the OVERLAPPED structure */
+        memset(&ov, 0, sizeof(ov));
+        ov.Offset = (aiocb->aio_offset + done);
+        ov.OffsetHigh = (aiocb->aio_offset + done) >> 32;
+
+        want = vec->iov_len;
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            ok = WriteFile(aiocb->hfile, vec->iov_base, want, &moved, &ov);
+        } else {
+            ok = ReadFile(aiocb->hfile, vec->iov_base, want, &moved, &ov);
+        }
+
+        /* A failed call counts as a zero-byte transfer */
+        if (!ok) {
+            moved = 0;
+        }
+        if (moved != want) {
+            break;
+        }
+
+        done += want;
+        idx++;
+    }
+
+    return done;
+}
+
+/*
+ * Thread-pool worker: carries out one queued request and releases it.
+ *
+ * @arg is the RawWin32AIOData describing the request.  It is freed here on
+ * every path, so the caller must not touch it afterwards.
+ *
+ * Returns 0 on success, -EINVAL when a read or write did not transfer
+ * aio_nbytes bytes or the request type is unknown, and -EIO when
+ * FlushFileBuffers() fails.
+ */
+static int aio_worker(void *arg)
+{
+    RawWin32AIOData *aiocb = arg;
+    ssize_t ret = 0;
+    size_t count;
+
+    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+    case QEMU_AIO_READ:
+        count = handle_aiocb_rw(aiocb);
+        if (count < aiocb->aio_nbytes && aiocb->bs->growable) {
+            /* A short read means that we have reached EOF. Pad the buffer
+             * with zeros for bytes after EOF. */
+            iov_memset(aiocb->aio_iov, aiocb->aio_niov, count,
+                      0, aiocb->aio_nbytes - count);
+
+            count = aiocb->aio_nbytes;
+        }
+        if (count == aiocb->aio_nbytes) {
+            ret = 0;
+        } else {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_WRITE:
+        count = handle_aiocb_rw(aiocb);
+        /* The verdict has to be stored in ret; count is never returned, so
+         * writing it there would report every short write as success. */
+        if (count == aiocb->aio_nbytes) {
+            ret = 0;
+        } else {
+            ret = -EINVAL;
+        }
+        break;
+    case QEMU_AIO_FLUSH:
+        if (!FlushFileBuffers(aiocb->hfile)) {
+            /* Leave through the common exit so the request is still freed */
+            ret = -EIO;
+        }
+        break;
+    default:
+        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
+        ret = -EINVAL;
+        break;
+    }
+
+    g_slice_free(RawWin32AIOData, aiocb);
+    return ret;
+}
+
+/*
+ * Packages a request into a RawWin32AIOData and queues it on the thread
+ * pool, where aio_worker() executes and frees it.
+ *
+ * @hfile is the file handle the worker operates on.
+ * @sector_num and @nb_sectors give the range in 512-byte sectors.
+ * @qiov supplies the data buffers; it may be NULL (raw_aio_flush() passes
+ * NULL), in which case the request carries an empty vector.
+ * @type is stored as the request's aio_type (a QEMU_AIO_* value).
+ * @cb and @opaque are forwarded to thread_pool_submit_aio().
+ *
+ * Returns whatever thread_pool_submit_aio() returns.
+ */
+static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    RawWin32AIOData *acb = g_slice_new(RawWin32AIOData);
+
+    acb->bs = bs;
+    acb->hfile = hfile;
+    acb->aio_type = type;
+
+    /* g_slice_new() does not clear the memory, so give the vector fields a
+     * defined value even when there is no QEMUIOVector. */
+    acb->aio_iov = qiov ? qiov->iov : NULL;
+    acb->aio_niov = qiov ? qiov->niov : 0;
+    acb->aio_nbytes = nb_sectors * 512;
+    acb->aio_offset = sector_num * 512;
+
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
+}
+
 int qemu_ftruncate64(int fd, int64_t length)
 {
    LARGE_INTEGER li;
@ -77,6 +199,26 @@ static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }

+/*
+ * Translates BDRV_O_* open flags into the values later given to CreateFile().
+ *
+ * *access_flags receives GENERIC_READ, together with GENERIC_WRITE when
+ * BDRV_O_RDWR is set.  *overlapped receives FILE_ATTRIBUTE_NORMAL, combined
+ * with FILE_FLAG_OVERLAPPED for BDRV_O_NATIVE_AIO and with
+ * FILE_FLAG_NO_BUFFERING for BDRV_O_NOCACHE.  Both pointers must be non-NULL.
+ */
+static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+{
+    DWORD attrs = FILE_ATTRIBUTE_NORMAL;
+
+    assert(access_flags != NULL);
+    assert(overlapped != NULL);
+
+    *access_flags = (flags & BDRV_O_RDWR) ? (GENERIC_READ | GENERIC_WRITE)
+                                          : GENERIC_READ;
+
+    if (flags & BDRV_O_NATIVE_AIO) {
+        attrs |= FILE_FLAG_OVERLAPPED;
+    }
+    if (flags & BDRV_O_NOCACHE) {
+        attrs |= FILE_FLAG_NO_BUFFERING;
+    }
+    *overlapped = attrs;
+}
+
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
    BDRVRawState *s = bs->opaque;
@ -85,17 +227,15 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)

    s->type = FTYPE_FILE;

-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
+    raw_parse_flags(flags, &access_flags, &overlapped);
+    
+    if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
+        aio = win32_aio_init();
+        if (aio == NULL) {
+            return -EINVAL;
+        }
    }

-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
    s->hfile = CreateFile(filename, access_flags,
                          FILE_SHARE_READ, NULL,
                          OPEN_EXISTING, overlapped, NULL);
@ -104,64 +244,53 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)

        if (err == ERROR_ACCESS_DENIED)
            return -EACCES;
-        return -1;
+        return -EINVAL;
+    }
+
+    if (flags & BDRV_O_NATIVE_AIO) {
+        int ret = win32_aio_attach(aio, s->hfile);
+        if (ret < 0) {
+            CloseHandle(s->hfile);
+            return ret;
+        }
+        s->aio = aio;
    }
    return 0;
 }

-static int raw_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+                         BlockDriverCompletionFunc *cb, void *opaque)
 {
    BDRVRawState *s = bs->opaque;
-    OVERLAPPED ov;
-    DWORD ret_count;
-    int ret;
-    int64_t offset = sector_num * 512;
-    int count = nb_sectors * 512;
-
-    memset(&ov, 0, sizeof(ov));
-    ov.Offset = offset;
-    ov.OffsetHigh = offset >> 32;
-    ret = ReadFile(s->hfile, buf, count, &ret_count, &ov);
-    if (!ret)
-        return ret_count;
-    if (ret_count == count)
-        ret_count = 0;
-    return ret_count;
-}
-
-static int raw_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
-{
-    BDRVRawState *s = bs->opaque;
-    OVERLAPPED ov;
-    DWORD ret_count;
-    int ret;
-    int64_t offset = sector_num * 512;
-    int count = nb_sectors * 512;
-
-    memset(&ov, 0, sizeof(ov));
-    ov.Offset = offset;
-    ov.OffsetHigh = offset >> 32;
-    ret = WriteFile(s->hfile, buf, count, &ret_count, &ov);
-    if (!ret)
-        return ret_count;
-    if (ret_count == count)
-        ret_count = 0;
-    return ret_count;
-}
-
-static int raw_flush(BlockDriverState *bs)
-{
-    BDRVRawState *s = bs->opaque;
-    int ret;
-
-    ret = FlushFileBuffers(s->hfile);
-    if (ret == 0) {
-        return -EIO;
+    if (s->aio) {
+        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                                nb_sectors, cb, opaque, QEMU_AIO_READ); 
+    } else {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_READ);
    }
+}

-    return 0;
+/*
+ * Queues an asynchronous vectored write of @nb_sectors sectors starting at
+ * @sector_num.  The request goes through win32_aio_submit() when the image
+ * has an AIO state attached (s->aio), and through the thread-pool path in
+ * paio_submit() otherwise.
+ */
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+                          BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (!s->aio) {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_WRITE);
+    }
+    return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                            nb_sectors, cb, opaque, QEMU_AIO_WRITE);
+}
+
+/*
+ * Queues an asynchronous flush of the image's file handle.  The request
+ * always goes through paio_submit(), whether or not s->aio is set; it
+ * carries no data, so sector, vector and length are passed as 0/NULL/0.
+ */
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+                         BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+    return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
 }

 static void raw_close(BlockDriverState *bs)
@ -174,13 +303,24 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
    BDRVRawState *s = bs->opaque;
    LONG low, high;
+    DWORD dwPtrLow;

    low = offset;
    high = offset >> 32;
-    if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN))
-	return -EIO;
-    if (!SetEndOfFile(s->hfile))
+
+    /*
+     * An error has occurred if the return value is INVALID_SET_FILE_POINTER
+     * and GetLastError doesn't return NO_ERROR.
+     */
+    dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
+    if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
+        fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError());
        return -EIO;
+    }
+    if (SetEndOfFile(s->hfile) == 0) {
+        fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError());
+        return -EIO;
+    }
    return 0;
 }

@ -282,9 +422,9 @@ static BlockDriver bdrv_file = {
    .bdrv_close		= raw_close,
    .bdrv_create	= raw_create,

-    .bdrv_read              = raw_read,
-    .bdrv_write             = raw_write,
-    .bdrv_co_flush_to_disk  = raw_flush,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_flush     = raw_aio_flush,

    .bdrv_truncate	= raw_truncate,
    .bdrv_getlength	= raw_getlength,
@ -374,18 +514,10 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
    }
    s->type = find_device_type(bs, filename);

-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
-    }
+    raw_parse_flags(flags, &access_flags, &overlapped);
+
    create_flags = OPEN_EXISTING;

-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
    s->hfile = CreateFile(filename, access_flags,
                          FILE_SHARE_READ, NULL,
                          create_flags, overlapped, NULL);
@ -413,9 +545,9 @@ static BlockDriver bdrv_host_device = {
    .bdrv_close		= raw_close,
    .bdrv_has_zero_init = hdev_has_zero_init,

-    .bdrv_read              = raw_read,
-    .bdrv_write             = raw_write,
-    .bdrv_co_flush_to_disk  = raw_flush,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_flush     = raw_aio_flush,

    .bdrv_getlength	= raw_getlength,
    .bdrv_get_allocated_file_size
--- a/block/raw.c
+++ b/block/raw.c
@ -1,7 +1,7 @@

 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
+#include "block/block_int.h"
+#include "qemu/module.h"

 static int raw_open(BlockDriverState *bs, int flags)
 {
@ -9,6 +9,14 @@ static int raw_open(BlockDriverState *bs, int flags)
    return 0;
 }

+/* The raw format has nothing to do for a reopen, so this stub ignores all
+ * of its arguments and always returns 0 (success). */
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue,  Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
                                     int nb_sectors, QEMUIOVector *qiov)
 {
@ -115,6 +123,8 @@ static BlockDriver bdrv_raw = {
    .bdrv_open          = raw_open,
    .bdrv_close         = raw_close,

+    .bdrv_reopen_prepare  = raw_reopen_prepare,
+
    .bdrv_co_readv          = raw_co_readv,
    .bdrv_co_writev         = raw_co_writev,
    .bdrv_co_is_allocated   = raw_co_is_allocated,
--- a/block/rbd.c
+++ b/block/rbd.c
@ -14,8 +14,8 @@
 #include <inttypes.h>

 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "block_int.h"
+#include "qemu/error-report.h"
+#include "block/block_int.h"

 #include <rbd/librbd.h>

@ -69,7 +69,7 @@ typedef enum {
 typedef struct RBDAIOCB {
    BlockDriverAIOCB common;
    QEMUBH *bh;
-    int ret;
+    int64_t ret;
    QEMUIOVector *qiov;
    char *bounce;
    RBDAIOCmd cmd;
@ -77,6 +77,7 @@ typedef struct RBDAIOCB {
    int error;
    struct BDRVRBDState *s;
    int cancelled;
+    int status;
 } RBDAIOCB;

 typedef struct RADOSCB {
@ -86,7 +87,7 @@ typedef struct RADOSCB {
    int done;
    int64_t size;
    char *buf;
-    int ret;
+    int64_t ret;
 } RADOSCB;

 #define RBD_FD_READ 0
@ -376,12 +377,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
    RBDAIOCB *acb = rcb->acb;
    int64_t r;

-    if (acb->cancelled) {
-        qemu_vfree(acb->bounce);
-        qemu_aio_release(acb);
-        goto done;
-    }
-
    r = rcb->ret;

    if (acb->cmd == RBD_AIO_WRITE ||
@ -409,7 +404,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
    /* Note that acb->bh can be NULL in case where the aio was cancelled */
    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
    qemu_bh_schedule(acb->bh);
-done:
    g_free(rcb);
 }

@ -487,12 +481,6 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
        rados_conf_set(s->cluster, "rbd_cache", "false");
    } else {
        rados_conf_set(s->cluster, "rbd_cache", "true");
-        if (!(flags & BDRV_O_CACHE_WB)) {
-            r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
-            if (r < 0) {
-                rados_conf_set(s->cluster, "rbd_cache", "false");
-            }
-        }
    }

    if (strstr(conf, "conf=") == NULL) {
@ -574,9 +562,15 @@ static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
    RBDAIOCB *acb = (RBDAIOCB *) blockacb;
    acb->cancelled = 1;
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
+
+    qemu_aio_release(acb);
 }

-static AIOPool rbd_aio_pool = {
+static const AIOCBInfo rbd_aiocb_info = {
    .aiocb_size = sizeof(RBDAIOCB),
    .cancel = qemu_rbd_aio_cancel,
 };
@ -645,8 +639,11 @@ static void rbd_aio_bh_cb(void *opaque)
    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
+    acb->status = 0;

-    qemu_aio_release(acb);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }

 static int rbd_aio_discard_wrapper(rbd_image_t image,
@ -678,7 +675,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,

    BDRVRBDState *s = bs->opaque;

-    acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
    acb->cmd = cmd;
    acb->qiov = qiov;
    if (cmd == RBD_AIO_DISCARD) {
@ -691,6 +688,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
    acb->s = s;
    acb->cancelled = 0;
    acb->bh = NULL;
+    acb->status = -EINPROGRESS;

    if (cmd == RBD_AIO_WRITE) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@ -13,10 +13,10 @@
 */

 #include "qemu-common.h"
-#include "qemu-error.h"
-#include "qemu_socket.h"
-#include "block_int.h"
-#include "bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"

 #define SD_PROTO_VER 0x01

@ -201,12 +201,12 @@ static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
    return hval;
 }

-static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
+static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
 {
    return inode->vdi_id == inode->data_vdi_id[idx];
 }

-static inline int is_data_obj(uint64_t oid)
+static inline bool is_data_obj(uint64_t oid)
 {
    return !(VDI_BIT & oid);
 }
@ -231,7 +231,7 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
 }

-static inline int is_snapshot(struct SheepdogInode *inode)
+static inline bool is_snapshot(struct SheepdogInode *inode)
 {
    return !!inode->snap_ctime;
 }
@ -281,7 +281,7 @@ struct SheepdogAIOCB {
    Coroutine *coroutine;
    void (*aio_done_func)(SheepdogAIOCB *);

-    int canceled;
+    bool canceled;
    int nr_pending;
 };

@ -292,8 +292,8 @@ typedef struct BDRVSheepdogState {
    uint32_t max_dirty_data_idx;

    char name[SD_MAX_VDI_LEN];
-    int is_snapshot;
-    uint8_t cache_enabled;
+    bool is_snapshot;
+    bool cache_enabled;

    char *addr;
    char *port;
@ -417,10 +417,10 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
     */
    acb->ret = -EIO;
    qemu_coroutine_enter(acb->coroutine, NULL);
-    acb->canceled = 1;
+    acb->canceled = true;
 }

-static AIOPool sd_aio_pool = {
+static const AIOCBInfo sd_aiocb_info = {
    .aiocb_size = sizeof(SheepdogAIOCB),
    .cancel = sd_aio_cancel,
 };
@ -431,7 +431,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
 {
    SheepdogAIOCB *acb;

-    acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque);

    acb->qiov = qiov;

@ -439,7 +439,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
    acb->nb_sectors = nb_sectors;

    acb->aio_done_func = NULL;
-    acb->canceled = 0;
+    acb->canceled = false;
    acb->coroutine = qemu_coroutine_self();
    acb->ret = 0;
    acb->nr_pending = 0;
@ -485,6 +485,7 @@ static int connect_to_sdog(const char *addr, const char *port)
            if (errno == EINTR) {
                goto reconnect;
            }
+            close(fd);
            break;
        }

@ -612,7 +613,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 }

 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-                           struct iovec *iov, int niov, int create,
+                           struct iovec *iov, int niov, bool create,
                           enum AIOCBState aiocb_type);


@ -645,7 +646,7 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
        QLIST_REMOVE(aio_req, aio_siblings);
        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
        ret = add_aio_request(s, aio_req, acb->qiov->iov,
-                              acb->qiov->niov, 0, acb->aiocb_type);
+                              acb->qiov->niov, false, acb->aiocb_type);
        if (ret < 0) {
            error_report("add_aio_request is failed");
            free_aio_req(s, aio_req);
@ -713,16 +714,17 @@ static void coroutine_fn aio_read_response(void *opaque)
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             */
-            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
-
+            if (rsp.result == SD_RES_SUCCESS) {
+                s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+            }
            /*
             * Some requests may be blocked because simultaneous
             * create requests are not allowed, so we search the
             * pending requests here.
             */
-            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+            send_pending_req(s, aio_req->oid);
        }
        break;
    case AIOCB_READ_UDATA:
@ -865,14 +867,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
        s->port = 0;
    }

-    strncpy(vdi, p, SD_MAX_VDI_LEN);
+    pstrcpy(vdi, SD_MAX_VDI_LEN, p);

    p = strchr(vdi, ':');
    if (p) {
        *p++ = '\0';
        *snapid = strtoul(p, NULL, 10);
        if (*snapid == 0) {
-            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
        }
    } else {
        *snapid = CURRENT_VDI_ID; /* search current vdi */
@ -899,7 +901,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
        return fd;
    }

-    memset(buf, 0, sizeof(buf));
+    /* This pair of strncpy calls ensures that the buffer is zero-filled,
+     * which is desirable since we'll soon be sending those bytes, and
+     * don't want the send_req to read uninitialized data.
+     */
    strncpy(buf, filename, SD_MAX_VDI_LEN);
    strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);

@ -939,7 +944,7 @@ out:
 }

 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-                           struct iovec *iov, int niov, int create,
+                           struct iovec *iov, int niov, bool create,
                           enum AIOCBState aiocb_type)
 {
    int nr_copies = s->inode.nr_copies;
@ -1018,7 +1023,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,

 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
                             unsigned int datalen, uint64_t offset,
-                             int write, int create, uint8_t cache)
+                             bool write, bool create, bool cache)
 {
    SheepdogObjReq hdr;
    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@ -1067,18 +1072,18 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
 }

 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-                       unsigned int datalen, uint64_t offset, uint8_t cache)
+                       unsigned int datalen, uint64_t offset, bool cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
-                             cache);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, false,
+                             false, cache);
 }

 static int write_object(int fd, char *buf, uint64_t oid, int copies,
-                        unsigned int datalen, uint64_t offset, int create,
-                        uint8_t cache)
+                        unsigned int datalen, uint64_t offset, bool create,
+                        bool cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
-                             cache);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, true,
+                             create, cache);
 }

 static int sd_open(BlockDriverState *bs, const char *filename, int flags)
@ -1113,19 +1118,17 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
        goto out;
    }

-    if (flags & BDRV_O_CACHE_WB) {
-        s->cache_enabled = 1;
-        s->flush_fd = connect_to_sdog(s->addr, s->port);
-        if (s->flush_fd < 0) {
-            error_report("failed to connect");
-            ret = s->flush_fd;
-            goto out;
-        }
+    s->cache_enabled = true;
+    s->flush_fd = connect_to_sdog(s->addr, s->port);
+    if (s->flush_fd < 0) {
+        error_report("failed to connect");
+        ret = s->flush_fd;
+        goto out;
    }

    if (snapid || tag[0] != '\0') {
        dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
-        s->is_snapshot = 1;
+        s->is_snapshot = true;
    }

    fd = connect_to_sdog(s->addr, s->port);
@ -1150,7 +1153,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
    s->max_dirty_data_idx = 0;

    bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
-    strncpy(s->name, vdi, sizeof(s->name));
+    pstrcpy(s->name, sizeof(s->name), vdi);
    qemu_co_mutex_init(&s->lock);
    g_free(buf);
    return 0;
@ -1178,8 +1181,11 @@ static int do_sd_create(char *filename, int64_t vdi_size,
        return fd;
    }

+    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
+     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
+     */
    memset(buf, 0, sizeof(buf));
-    strncpy(buf, filename, SD_MAX_VDI_LEN);
+    pstrcpy(buf, sizeof(buf), filename);

    memset(&hdr, 0, sizeof(hdr));
    hdr.opcode = SD_OP_NEW_VDI;
@ -1265,7 +1271,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
    BDRVSheepdogState *s;
    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
    uint32_t snapid;
-    int prealloc = 0;
+    bool prealloc = false;
    const char *vdiname;

    s = g_malloc0(sizeof(BDRVSheepdogState));
@ -1287,9 +1293,9 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
            backing_file = options->value.s;
        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
            if (!options->value.s || !strcmp(options->value.s, "off")) {
-                prealloc = 0;
+                prealloc = false;
            } else if (!strcmp(options->value.s, "full")) {
-                prealloc = 1;
+                prealloc = true;
            } else {
                error_report("Invalid preallocation mode: '%s'",
                             options->value.s);
@ -1417,7 +1423,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
    s->inode.vdi_size = offset;
    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
    close(fd);

    if (ret < 0) {
@ -1456,7 +1462,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
                                data_len, offset, 0, 0, offset);
        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-        ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
+        ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
        if (ret) {
            free_aio_req(s, aio_req);
            acb->ret = -EIO;
@ -1510,7 +1516,7 @@ static int sd_create_branch(BDRVSheepdogState *s)

    memcpy(&s->inode, buf, sizeof(s->inode));

-    s->is_snapshot = 0;
+    s->is_snapshot = false;
    ret = 0;
    dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);

@ -1565,7 +1571,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
    while (done != total) {
        uint8_t flags = 0;
        uint64_t old_oid = 0;
-        int create = 0;
+        bool create = false;

        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);

@ -1580,10 +1586,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
            break;
        case AIOCB_WRITE_UDATA:
            if (!inode->data_vdi_id[idx]) {
-                create = 1;
+                create = true;
            } else if (!is_data_obj_writable(inode, idx)) {
                /* Copy-On-Write */
-                create = 1;
+                create = true;
                old_oid = oid;
                flags = SD_FLAG_CMD_COW;
            }
@ -1717,7 +1723,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
    if (rsp->result == SD_RES_INVALID_PARMS) {
        dprintf("disable write cache since the server doesn't support it\n");

-        s->cache_enabled = 0;
+        s->cache_enabled = false;
        closesocket(s->flush_fd);
        return 0;
    }
@ -1753,6 +1759,9 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)

    s->inode.vm_state_size = sn_info->vm_state_size;
    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+    /* It appears that inode.tag does not require a NUL terminator,
+     * which means this use of strncpy is ok.
+     */
    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
    /* we don't need to update entire object */
    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
@ -1765,7 +1774,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    }

    ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
+                       s->inode.nr_copies, datalen, 0, false, s->cache_enabled);
    if (ret < 0) {
        error_report("failed to write snapshot's inode.");
        goto cleanup;
@ -1812,13 +1821,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)

    memcpy(old_s, s, sizeof(BDRVSheepdogState));

-    memset(vdi, 0, sizeof(vdi));
-    strncpy(vdi, s->name, sizeof(vdi));
+    pstrcpy(vdi, sizeof(vdi), s->name);

-    memset(tag, 0, sizeof(tag));
    snapid = strtoul(snapshot_id, NULL, 10);
-    if (!snapid) {
-        strncpy(tag, s->name, sizeof(tag));
+    if (snapid) {
+        tag[0] = 0;
+    } else {
+        pstrcpy(tag, sizeof(tag), s->name);
    }

    ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
@ -1852,7 +1861,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
        goto out;
    }

-    s->is_snapshot = 1;
+    s->is_snapshot = true;

    g_free(buf);
    g_free(old_s);
@ -1947,8 +1956,9 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)

            snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
                     inode.snap_id);
-            strncpy(sn_tab[found].name, inode.tag,
-                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+            pstrcpy(sn_tab[found].name,
+                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
+                    inode.tag);
            found++;
        }
    }
@ -1969,8 +1979,8 @@ out:
 static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
                                int64_t pos, int size, int load)
 {
-    int fd, create;
-    int ret = 0, remaining = size;
+    bool create;
+    int fd, ret = 0, remaining = size;
    unsigned int data_len;
    uint64_t vmstate_oid;
    uint32_t vdi_index;
@ -1985,7 +1995,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
        vdi_index = pos / SD_DATA_OBJ_SIZE;
        offset = pos % SD_DATA_OBJ_SIZE;

-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);

        vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);

@ -2006,6 +2016,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
        }

        pos += data_len;
+        data += data_len;
        remaining -= data_len;
    }
    ret = size;
--- a/block/stream.c
+++ b/block/stream.c
@ -12,7 +12,8 @@
 */

 #include "trace.h"
-#include "block_int.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
 #include "qemu/ratelimit.h"

 enum {
@ -30,6 +31,7 @@ typedef struct StreamBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *base;
+    BlockdevOnError on_error;
    char backing_file_id[1024];
 } StreamBlockJob;

@ -77,13 +79,14 @@ static void coroutine_fn stream_run(void *opaque)
    BlockDriverState *bs = s->common.bs;
    BlockDriverState *base = s->base;
    int64_t sector_num, end;
+    int error = 0;
    int ret = 0;
    int n = 0;
    void *buf;

    s->common.len = bdrv_getlength(bs);
    if (s->common.len < 0) {
-        block_job_complete(&s->common, s->common.len);
+        block_job_completed(&s->common, s->common.len);
        return;
    }

@ -105,7 +108,7 @@ static void coroutine_fn stream_run(void *opaque)

 wait:
        /* Note that even when no rate limit is applied we need to yield
-         * with no pending I/O here so that qemu_aio_flush() returns.
+         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
        if (block_job_is_cancelled(&s->common)) {
@ -122,6 +125,12 @@ wait:
             * known-unallocated area [sector_num, sector_num+n).  */
            ret = bdrv_co_is_allocated_above(bs->backing_hd, base,
                                             sector_num, n, &n);
+
+            /* Finish early if end of backing file has been reached */
+            if (ret == 0 && n == 0) {
+                n = end - sector_num;
+            }
+
            copy = (ret == 1);
        }
        trace_stream_one_iteration(s, sector_num, n, ret);
@ -135,7 +144,19 @@ wait:
            ret = stream_populate(bs, sector_num, n, buf);
        }
        if (ret < 0) {
-            break;
+            BlockErrorAction action =
+                block_job_error_action(&s->common, s->common.bs, s->on_error,
+                                       true, -ret);
+            if (action == BDRV_ACTION_STOP) {
+                n = 0;
+                continue;
+            }
+            if (error == 0) {
+                error = ret;
+            }
+            if (action == BDRV_ACTION_REPORT) {
+                break;
+            }
        }
        ret = 0;

@ -147,6 +168,9 @@ wait:
        bdrv_disable_copy_on_read(bs);
    }

+    /* Do not remove the backing file if an error was there but ignored.  */
+    ret = error;
+
    if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
        const char *base_id = NULL, *base_fmt = NULL;
        if (base) {
@ -160,7 +184,7 @@ wait:
    }

    qemu_vfree(buf);
-    block_job_complete(&s->common, ret);
+    block_job_completed(&s->common, ret);
 }

 static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
@ -182,11 +206,19 @@ static BlockJobType stream_job_type = {

 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *base_id, int64_t speed,
+                  BlockdevOnError on_error,
                  BlockDriverCompletionFunc *cb,
                  void *opaque, Error **errp)
 {
    StreamBlockJob *s;

+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+        return;
+    }
+
    s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
@ -197,6 +229,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
        pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
    }

+    s->on_error = on_error;
    s->common.co = qemu_coroutine_create(stream_run);
    trace_stream_start(bs, base, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co, s);
--- a/block/vdi.c
+++ b/block/vdi.c
@ -50,19 +50,16 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"

 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
 #else
 /* TODO: move uuid emulation to some central place in QEMU. */
-#include "sysemu.h"     /* UUID_FMT */
+#include "sysemu/sysemu.h"     /* UUID_FMT */
 typedef unsigned char uuid_t[16];
-void uuid_generate(uuid_t out);
-int uuid_is_null(const uuid_t uu);
-void uuid_unparse(const uuid_t uu, char *out);
 #endif

 /* Code configuration options. */
@ -124,18 +121,18 @@ void uuid_unparse(const uuid_t uu, char *out);
 #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

 #if !defined(CONFIG_UUID)
-void uuid_generate(uuid_t out)
+static inline void uuid_generate(uuid_t out)
 {
    memset(out, 0, sizeof(uuid_t));
 }

-int uuid_is_null(const uuid_t uu)
+static inline int uuid_is_null(const uuid_t uu)
 {
    uuid_t null_uuid = { 0 };
    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
 }

-void uuid_unparse(const uuid_t uu, char *out)
+static inline void uuid_unparse(const uuid_t uu, char *out)
 {
    snprintf(out, 37, UUID_FMT,
            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
@ -454,6 +451,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
    return -1;
 }

+/*
+ * Reopen-prepare hook for VDI images.  It performs no checks and changes no
+ * state; all three arguments are unused and the result is always 0.
+ */
+static int vdi_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
 {
@ -628,7 +631,6 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
    VdiHeader header;
    size_t i;
    size_t bmap_size;
-    uint32_t *bmap;

    logout("\n");

@ -693,21 +695,21 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
        result = -errno;
    }

-    bmap = NULL;
    if (bmap_size > 0) {
-        bmap = (uint32_t *)g_malloc0(bmap_size);
-    }
-    for (i = 0; i < blocks; i++) {
-        if (image_type == VDI_TYPE_STATIC) {
-            bmap[i] = i;
-        } else {
-            bmap[i] = VDI_UNALLOCATED;
+        uint32_t *bmap = g_malloc0(bmap_size);
+        for (i = 0; i < blocks; i++) {
+            if (image_type == VDI_TYPE_STATIC) {
+                bmap[i] = i;
+            } else {
+                bmap[i] = VDI_UNALLOCATED;
+            }
        }
+        if (write(fd, bmap, bmap_size) < 0) {
+            result = -errno;
+        }
+        g_free(bmap);
    }
-    if (write(fd, bmap, bmap_size) < 0) {
-        result = -errno;
-    }
-    g_free(bmap);
+
    if (image_type == VDI_TYPE_STATIC) {
        if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
            result = -errno;
@ -762,6 +764,7 @@ static BlockDriver bdrv_vdi = {
    .bdrv_probe = vdi_probe,
    .bdrv_open = vdi_open,
    .bdrv_close = vdi_close,
+    .bdrv_reopen_prepare = vdi_reopen_prepare,
    .bdrv_create = vdi_create,
    .bdrv_co_is_allocated = vdi_co_is_allocated,
    .bdrv_make_empty = vdi_make_empty,
--- a/block/vmdk.c
+++ b/block/vmdk.c
@ -24,9 +24,9 @@
 */

 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
 #include <zlib.h>

 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
@ -35,6 +35,7 @@
 #define VMDK4_FLAG_RGD (1 << 1)
 #define VMDK4_FLAG_COMPRESS (1 << 16)
 #define VMDK4_FLAG_MARKER (1 << 17)
+#define VMDK4_GD_AT_END 0xffffffffffffffffULL

 typedef struct {
    uint32_t version;
@ -57,8 +58,8 @@ typedef struct {
    int64_t desc_offset;
    int64_t desc_size;
    int32_t num_gtes_per_gte;
-    int64_t gd_offset;
    int64_t rgd_offset;
+    int64_t gd_offset;
    int64_t grain_offset;
    char filler[1];
    char check_bytes[4];
@ -115,6 +116,13 @@ typedef struct VmdkGrainMarker {
    uint8_t  data[0];
 } VmdkGrainMarker;

+enum {
+    MARKER_END_OF_STREAM    = 0,
+    MARKER_GRAIN_TABLE      = 1,
+    MARKER_GRAIN_DIRECTORY  = 2,
+    MARKER_FOOTER           = 3,
+};
+
 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
    uint32_t magic;
@ -292,6 +300,40 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
    return 1;
 }

+/*
+ * Reopen-prepare hook for VMDK images.
+ *
+ * Every extent whose file differs from the image's own bs->file is added to
+ * @queue with the flags requested in @state.  Returns 0 on success.  When
+ * @queue is NULL a generic error is stored in @errp and -1 is returned.
+ */
+static int vmdk_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    BDRVVmdkState *vmdk;
+    int idx;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    if (queue == NULL) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                 "No reopen queue for VMDK extents");
+        return -1;
+    }
+
+    vmdk = state->bs->opaque;
+    assert(vmdk != NULL);
+
+    for (idx = 0; idx < vmdk->num_extents; idx++) {
+        VmdkExtent *extent = &vmdk->extents[idx];
+
+        /* The image's own file is not queued here. */
+        if (extent->file == state->bs->file) {
+            continue;
+        }
+        bdrv_reopen_queue(queue, extent->file, state->flags);
+    }
+
+    return 0;
+}
+
 static int vmdk_parent_open(BlockDriverState *bs)
 {
    char *p_name;
@ -451,6 +493,54 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
    if (header.capacity == 0 && header.desc_offset) {
        return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
    }
+
+    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
+        /*
+         * The footer takes precedence over the header, so read it in. The
+         * footer starts at offset -1024 from the end: One sector for the
+         * footer, and another one for the end-of-stream marker.
+         */
+        struct {
+            struct {
+                uint64_t val;
+                uint32_t size;
+                uint32_t type;
+                uint8_t pad[512 - 16];
+            } QEMU_PACKED footer_marker;
+
+            uint32_t magic;
+            VMDK4Header header;
+            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
+
+            struct {
+                uint64_t val;
+                uint32_t size;
+                uint32_t type;
+                uint8_t pad[512 - 16];
+            } QEMU_PACKED eos_marker;
+        } QEMU_PACKED footer;
+
+        ret = bdrv_pread(file,
+            bs->file->total_sectors * 512 - 1536,
+            &footer, sizeof(footer));
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Some sanity checks for the footer */
+        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
+            le32_to_cpu(footer.footer_marker.size) != 0  ||
+            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
+            le64_to_cpu(footer.eos_marker.val) != 0  ||
+            le32_to_cpu(footer.eos_marker.size) != 0  ||
+            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
+        {
+            return -EINVAL;
+        }
+
+        header = footer.header;
+    }
+
    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
                        * le64_to_cpu(header.granularity);
    if (l1_entry_sectors == 0) {
@ -1002,6 +1092,7 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
    BDRVVmdkState *s = bs->opaque;
    int ret;
    uint64_t n, index_in_cluster;
+    uint64_t extent_begin_sector, extent_relative_sector_num;
    VmdkExtent *extent = NULL;
    uint64_t cluster_offset;

@ -1013,7 +1104,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
        ret = get_cluster_offset(
                            bs, extent, NULL,
                            sector_num << 9, 0, &cluster_offset);
-        index_in_cluster = sector_num % extent->cluster_sectors;
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
@ -1064,6 +1157,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
    VmdkExtent *extent = NULL;
    int n, ret;
    int64_t index_in_cluster;
+    uint64_t extent_begin_sector, extent_relative_sector_num;
    uint64_t cluster_offset;
    VmdkMetaData m_data;

@ -1106,7 +1200,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
        if (ret) {
            return -EINVAL;
        }
-        index_in_cluster = sector_num % extent->cluster_sectors;
+        extent_begin_sector = extent->end_sector - extent->sectors;
+        extent_relative_sector_num = sector_num - extent_begin_sector;
+        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
        n = extent->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
@ -1318,8 +1414,7 @@ static int relative_path(char *dest, int dest_size,
        return -1;
    }
    if (path_is_absolute(target)) {
-        dest[dest_size - 1] = '\0';
-        strncpy(dest, target, dest_size - 1);
+        pstrcpy(dest, dest_size, target);
        return 0;
    }
    while (base[i] == target[i]) {
@ -1590,6 +1685,7 @@ static BlockDriver bdrv_vmdk = {
    .instance_size  = sizeof(BDRVVmdkState),
    .bdrv_probe     = vmdk_probe,
    .bdrv_open      = vmdk_open,
+    .bdrv_reopen_prepare = vmdk_reopen_prepare,
    .bdrv_read      = vmdk_co_read,
    .bdrv_write     = vmdk_co_write,
    .bdrv_close     = vmdk_close,
--- a/block/vpc.c
+++ b/block/vpc.c
@ -23,9 +23,12 @@
 * THE SOFTWARE.
 */
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"
+#if defined(CONFIG_UUID)
+#include <uuid/uuid.h>
+#endif

 /**************************************************************/

@ -198,7 +201,8 @@ static int vpc_open(BlockDriverState *bs, int flags)
    bs->total_sectors = (int64_t)
        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;

-    if (bs->total_sectors >= 65535 * 16 * 255) {
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (bs->total_sectors >= 65535LL * 255 * 255) {
        err = -EFBIG;
        goto fail;
    }
@ -265,6 +269,12 @@ static int vpc_open(BlockDriverState *bs, int flags)
    return err;
 }

+/*
+ * Reopen-prepare hook for VPC images.  It performs no checks and changes no
+ * state; all three arguments are unused and the result is always 0.
+ */
+static int vpc_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 /*
 * Returns the absolute byte offset of the given sector in the image file.
 * If the sector is not allocated, -1 is returned instead.
@ -518,19 +528,27 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
 * Note that the geometry doesn't always exactly match total_sectors but
 * may round it down.
 *
- * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override
+ * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
+ * and instead allow up to 255 heads.
 */
 static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
    uint8_t* heads, uint8_t* secs_per_cyl)
 {
    uint32_t cyls_times_heads;

-    if (total_sectors > 65535 * 16 * 255)
+    /* Allow a maximum disk size of approximately 2 TB */
+    if (total_sectors > 65535LL * 255 * 255) {
        return -EFBIG;
+    }

    if (total_sectors > 65535 * 16 * 63) {
        *secs_per_cyl = 255;
-        *heads = 16;
+        if (total_sectors > 65535 * 16 * 255) {
+            *heads = 255;
+        } else {
+            *heads = 16;
+        }
        cyls_times_heads = total_sectors / *secs_per_cyl;
    } else {
        *secs_per_cyl = 17;
@ -733,7 +751,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)

    footer->type = be32_to_cpu(disk_type);

-    /* TODO uuid is missing */
+#if defined(CONFIG_UUID)
+    uuid_generate(footer->uuid);
+#endif

    footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));

@ -783,6 +803,7 @@ static BlockDriver bdrv_vpc = {
    .bdrv_probe     = vpc_probe,
    .bdrv_open      = vpc_open,
    .bdrv_close     = vpc_close,
+    .bdrv_reopen_prepare = vpc_reopen_prepare,
    .bdrv_create    = vpc_create,

    .bdrv_read              = vpc_co_read,
--- a/block/vvfat.c
+++ b/block/vvfat.c
@ -25,9 +25,9 @@
 #include <sys/stat.h>
 #include <dirent.h>
 #include "qemu-common.h"
-#include "block_int.h"
-#include "module.h"
-#include "migration.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "migration/migration.h"

 #ifndef S_IWGRP
 #define S_IWGRP 0
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@ -0,0 +1,226 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu-common.h"
+#include "block/aio.h"
+#include "raw-aio.h"
+#include "qemu/event_notifier.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD     1
+#define FTYPE_HARDDISK 2
+
+/* Shared state of the win32 AIO backend, created by win32_aio_init(). */
+struct QEMUWin32AIOState {
+    /* I/O completion port that file handles are attached to. */
+    HANDLE hIOCP;
+    /* Notifier whose handle is stored as hEvent of every submitted request. */
+    EventNotifier e;
+    /* Number of requests submitted but not yet completed. */
+    int count;
+};
+
+/* Per-request control block for one overlapped read or write. */
+typedef struct QEMUWin32AIOCB {
+    BlockDriverAIOCB common;
+    /* NOTE(review): never assigned in this file -- confirm whether needed. */
+    struct QEMUWin32AIOState *ctx;
+    /* Requested transfer length in bytes (nb_sectors * 512). */
+    int nbytes;
+    /* OVERLAPPED passed to ReadFile/WriteFile; used to find this struct. */
+    OVERLAPPED ov;
+    /* The caller's I/O vector. */
+    QEMUIOVector *qiov;
+    /* Buffer handed to ReadFile/WriteFile: either the single iovec's base
+     * or a bounce buffer from qemu_blockalign() (see is_linear). */
+    void *buf;
+    /* True when the request type is exactly QEMU_AIO_READ. */
+    bool is_read;
+    /* True when buf aliases the caller's only iovec (no bounce buffer). */
+    bool is_linear;
+} QEMUWin32AIOCB;
+
+/*
+ * Completes an AIO request (calls the callback and frees the ACB).
+ *
+ * @count is the number of bytes actually transferred.  The result handed to
+ * the callback is -EIO when the OVERLAPPED status is non-zero, -EINVAL for a
+ * short write and 0 otherwise.  A short read is treated as EOF: the rest of
+ * the caller's I/O vector is filled with zeros.
+ *
+ * For vectored requests the bounce buffer is copied back to the caller's
+ * buffers after a successful read and is always released here.
+ */
+static void win32_aio_process_completion(QEMUWin32AIOState *s,
+    QEMUWin32AIOCB *waiocb, DWORD count)
+{
+    int ret;
+    s->count--;
+
+    if (waiocb->ov.Internal != 0) {
+        ret = -EIO;
+    } else if (count < waiocb->nbytes && !waiocb->is_read) {
+        ret = -EINVAL;
+    } else {
+        ret = 0;
+    }
+
+    if (!waiocb->is_linear) {
+        if (ret == 0 && waiocb->is_read) {
+            /* Scatter the bounce buffer into the caller's buffers. */
+            QEMUIOVector *qiov = waiocb->qiov;
+            char *p = waiocb->buf;
+            int i;
+
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(qiov->iov[i].iov_base, p, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+        }
+        /* The bounce buffer comes from qemu_blockalign(), so it must be
+         * released with qemu_vfree(), and on every path (writes and
+         * failed requests too), not only after a successful read. */
+        qemu_vfree(waiocb->buf);
+    }
+
+    if (ret == 0 && waiocb->is_read && count < waiocb->nbytes) {
+        /* Short reads mean EOF, pad with zeros.  This is done after the
+         * copy-back so that stale bounce-buffer bytes cannot overwrite
+         * the padding. */
+        qemu_iovec_memset(waiocb->qiov, count, 0,
+            waiocb->qiov->size - count);
+    }
+
+    waiocb->common.cb(waiocb->common.opaque, ret);
+    qemu_aio_release(waiocb);
+}
+
+/*
+ * Event notifier handler: drains the completion port and finishes every
+ * request that has completed, whether it succeeded or failed.
+ */
+static void win32_aio_completion_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+    DWORD count;
+    ULONG_PTR key;
+    OVERLAPPED *ov;
+
+    event_notifier_test_and_clear(&s->e);
+    for (;;) {
+        QEMUWin32AIOCB *waiocb;
+
+        /* Reset so that "nothing dequeued" can be told apart from the
+         * completion packet of a failed I/O. */
+        ov = NULL;
+        if (!GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0) &&
+            ov == NULL) {
+            /* No packet was dequeued: the port is empty. */
+            break;
+        }
+
+        /* A FALSE return with a non-NULL OVERLAPPED is a failed I/O.  It
+         * must still be completed, otherwise its callback never runs and
+         * the in-flight count never drops. */
+        waiocb = container_of(ov, QEMUWin32AIOCB, ov);
+        win32_aio_process_completion(s, waiocb, count);
+    }
+}
+
+/* Flush hook: returns 1 while requests are still in flight, 0 otherwise. */
+static int win32_aio_flush_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *aio = container_of(e, QEMUWin32AIOState, e);
+
+    return aio->count > 0;
+}
+
+/*
+ * Cancel hook.  CancelIoEx is only supported in Vista and newer, so the
+ * request is not really cancelled: for now, just wait for completion.
+ */
+static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QEMUWin32AIOCB *req = (QEMUWin32AIOCB *)blockacb;
+    OVERLAPPED *pending = &req->ov;
+
+    while (!HasOverlappedIoCompleted(pending)) {
+        qemu_aio_wait();
+    }
+}
+
+/* AIOCB description passed to qemu_aio_get() in win32_aio_submit(): the
+ * size of the per-request control block and its cancel hook. */
+static const AIOCBInfo win32_aiocb_info = {
+    .aiocb_size         = sizeof(QEMUWin32AIOCB),
+    .cancel             = win32_aio_cancel,
+};
+
+/*
+ * Starts an overlapped read or write of @nb_sectors 512-byte sectors at
+ * @sector_num on @hfile.
+ *
+ * A request with more than one iovec goes through a bounce buffer from
+ * qemu_blockalign(); for writes the iovecs are gathered into it first.  A
+ * request with a single iovec uses that buffer directly.
+ *
+ * Returns the new AIOCB, or NULL when ReadFile/WriteFile fails with an
+ * error other than ERROR_IO_PENDING.
+ */
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct QEMUWin32AIOCB *waiocb;
+    uint64_t offset = sector_num * 512;
+    DWORD rc;
+
+    waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque);
+    waiocb->nbytes = nb_sectors * 512;
+    waiocb->qiov = qiov;
+    waiocb->is_read = (type == QEMU_AIO_READ);
+
+    if (qiov->niov > 1) {
+        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        if (type & QEMU_AIO_WRITE) {
+            /* Gather the caller's buffers into the bounce buffer. */
+            char *p = waiocb->buf;
+            int i;
+
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+        }
+        waiocb->is_linear = false;
+    } else {
+        waiocb->buf = qiov->iov[0].iov_base;
+        waiocb->is_linear = true;
+    }
+
+    memset(&waiocb->ov, 0, sizeof(waiocb->ov));
+    waiocb->ov.Offset = (DWORD)offset;
+    waiocb->ov.OffsetHigh = (DWORD)(offset >> 32);
+    waiocb->ov.hEvent = event_notifier_get_handle(&aio->e);
+
+    aio->count++;
+
+    if (type & QEMU_AIO_READ) {
+        rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    } else {
+        rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    }
+    if (rc == 0 && GetLastError() != ERROR_IO_PENDING) {
+        goto out_dec_count;
+    }
+    return &waiocb->common;
+
+out_dec_count:
+    aio->count--;
+    if (!waiocb->is_linear) {
+        /* The request never started, so no completion will free the
+         * bounce buffer; release it here to avoid leaking it. */
+        qemu_vfree(waiocb->buf);
+    }
+    qemu_aio_release(waiocb);
+    return NULL;
+}
+
+/*
+ * Associates @hfile with the completion port of @aio.
+ * Returns 0 on success, -EINVAL if CreateIoCompletionPort() fails.
+ */
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
+{
+    HANDLE port = CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0);
+
+    return port == NULL ? -EINVAL : 0;
+}
+
+/*
+ * Allocates the win32 AIO state: initialises the event notifier, creates
+ * the I/O completion port and registers the completion and flush handlers.
+ * Returns the new state, or NULL if either resource cannot be set up.
+ */
+QEMUWin32AIOState *win32_aio_init(void)
+{
+    QEMUWin32AIOState *aio = g_malloc0(sizeof(*aio));
+
+    if (event_notifier_init(&aio->e, false) < 0) {
+        g_free(aio);
+        return NULL;
+    }
+
+    aio->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+    if (aio->hIOCP == NULL) {
+        event_notifier_cleanup(&aio->e);
+        g_free(aio);
+        return NULL;
+    }
+
+    qemu_aio_set_event_notifier(&aio->e, win32_aio_completion_cb,
+                                win32_aio_flush_cb);
+
+    return aio;
+}
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@ -0,0 +1,133 @@
+/*
+ * Serving QEMU block devices via NBD
+ *
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "sysemu/blockdev.h"
+#include "hw/block-common.h"
+#include "monitor/monitor.h"
+#include "qapi/qmp/qerror.h"
+#include "sysemu/sysemu.h"
+#include "qmp-commands.h"
+#include "trace.h"
+#include "block/nbd.h"
+#include "qemu/sockets.h"
+
+static int server_fd = -1;
+
+/*
+ * Read handler of the listening socket: accepts one pending connection
+ * and, if that succeeds, hands the new socket to nbd_client_new().
+ */
+static void nbd_accept(void *opaque)
+{
+    struct sockaddr_in peer;
+    socklen_t peer_len = sizeof(peer);
+    int client_fd;
+
+    client_fd = accept(server_fd, (struct sockaddr *)&peer, &peer_len);
+    if (client_fd < 0) {
+        return;
+    }
+
+    nbd_client_new(NULL, client_fd, nbd_client_put);
+}
+
+/*
+ * QMP handler: starts listening on @addr and installs nbd_accept() as the
+ * read handler.  Sets an error in @errp if a server is already running;
+ * socket_listen() receives @errp for its own failures.
+ */
+void qmp_nbd_server_start(SocketAddress *addr, Error **errp)
+{
+    if (server_fd != -1) {
+        error_setg(errp, "NBD server already running");
+        return;
+    }
+
+    server_fd = socket_listen(addr, errp);
+    if (server_fd == -1) {
+        return;
+    }
+
+    qemu_set_fd_handler2(server_fd, NULL, nbd_accept, NULL, NULL);
+}
+
+/* Hook into the BlockDriverState notifiers to close the export when
+ * the file is closed.
+ */
+typedef struct NBDCloseNotifier {
+    /* Notifier registered with bdrv_add_close_notifier(). */
+    Notifier n;
+    /* Export that is closed and released when the notifier fires. */
+    NBDExport *exp;
+    /* Linkage in the close_notifiers list. */
+    QTAILQ_ENTRY(NBDCloseNotifier) next;
+} NBDCloseNotifier;
+
+/* All close notifiers created by qmp_nbd_server_add(); entries are removed
+ * in nbd_close_notifier(). */
+static QTAILQ_HEAD(, NBDCloseNotifier) close_notifiers =
+    QTAILQ_HEAD_INITIALIZER(close_notifiers);
+
+/*
+ * Close-notifier callback: unregisters the notifier, drops it from
+ * close_notifiers, closes the export, releases one reference to it and
+ * frees the notifier.  @data is unused.
+ */
+static void nbd_close_notifier(Notifier *n, void *data)
+{
+    NBDCloseNotifier *entry = DO_UPCAST(NBDCloseNotifier, n, n);
+    NBDExport *exp = entry->exp;
+
+    notifier_remove(&entry->n);
+    QTAILQ_REMOVE(&close_notifiers, entry, next);
+
+    nbd_export_close(exp);
+    nbd_export_put(exp);
+    g_free(entry);
+}
+
+/* Callback given to nbd_export_new(): drops the drive reference that
+ * qmp_nbd_server_add() took for the export's block device. */
+static void nbd_server_put_ref(NBDExport *exp)
+{
+    DriveInfo *dinfo = drive_get_by_blockdev(nbd_export_get_blockdev(exp));
+
+    drive_put_ref(dinfo);
+}
+
+/*
+ * QMP handler: exports block device @device through the running NBD server.
+ *
+ * The export is read-only unless @writable was given as true and the device
+ * itself is not read-only.  A drive reference is taken for the export, and a
+ * close notifier is registered so the export is closed with the device.
+ *
+ * Sets an error in @errp if the server is not running, the device is
+ * already exported, or the device does not exist.
+ */
+void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
+                        Error **errp)
+{
+    BlockDriverState *bs;
+    NBDExport *exp;
+    NBDCloseNotifier *notifier;
+
+    if (server_fd == -1) {
+        error_setg(errp, "NBD server not running");
+        return;
+    }
+
+    if (nbd_export_find(device)) {
+        error_setg(errp, "NBD server already exporting device '%s'", device);
+        return;
+    }
+
+    bs = bdrv_find(device);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+        return;
+    }
+
+    /* Default to read-only, and never export a read-only device writable. */
+    if (!has_writable) {
+        writable = false;
+    }
+    if (bdrv_is_read_only(bs)) {
+        writable = false;
+    }
+
+    exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY,
+                         nbd_server_put_ref);
+    nbd_export_set_name(exp, device);
+
+    /* Released again by nbd_server_put_ref(). */
+    drive_get_ref(drive_get_by_blockdev(bs));
+
+    notifier = g_malloc0(sizeof(*notifier));
+    notifier->n.notify = nbd_close_notifier;
+    notifier->exp = exp;
+    bdrv_add_close_notifier(bs, &notifier->n);
+    QTAILQ_INSERT_TAIL(&close_notifiers, notifier, next);
+}
+
+/*
+ * QMP handler: closes every export by running its close notifier, then
+ * removes the fd handler and closes the listening socket if one is open.
+ */
+void qmp_nbd_server_stop(Error **errp)
+{
+    NBDCloseNotifier *cn;
+
+    /* nbd_close_notifier() unlinks and frees the entry it is given. */
+    while ((cn = QTAILQ_FIRST(&close_notifiers)) != NULL) {
+        nbd_close_notifier(&cn->n, nbd_export_get_blockdev(cn->exp));
+    }
+
+    if (server_fd == -1) {
+        return;
+    }
+
+    qemu_set_fd_handler2(server_fd, NULL, NULL, NULL, NULL);
+    close(server_fd);
+    server_fd = -1;
+}
--- a/blockdev.c
+++ b/blockdev.c
@ -7,18 +7,19 @@
 * later.  See the COPYING file in the top-level directory.
 */

-#include "blockdev.h"
+#include "sysemu/blockdev.h"
 #include "hw/block-common.h"
-#include "monitor.h"
-#include "qerror.h"
-#include "qemu-option.h"
-#include "qemu-config.h"
-#include "qemu-objects.h"
-#include "sysemu.h"
-#include "block_int.h"
+#include "block/blockjob.h"
+#include "monitor/monitor.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
+#include "qapi/qmp/types.h"
+#include "sysemu/sysemu.h"
+#include "block/block_int.h"
 #include "qmp-commands.h"
 #include "trace.h"
-#include "arch_init.h"
+#include "sysemu/arch_init.h"

 static QTAILQ_HEAD(drivelist, DriveInfo) drives = QTAILQ_HEAD_INITIALIZER(drives);

@ -237,16 +238,16 @@ static void drive_put_ref_bh_schedule(DriveInfo *dinfo)
    qemu_bh_schedule(s->bh);
 }

-static int parse_block_error_action(const char *buf, int is_read)
+static int parse_block_error_action(const char *buf, bool is_read)
 {
    if (!strcmp(buf, "ignore")) {
-        return BLOCK_ERR_IGNORE;
+        return BLOCKDEV_ON_ERROR_IGNORE;
    } else if (!is_read && !strcmp(buf, "enospc")) {
-        return BLOCK_ERR_STOP_ENOSPC;
+        return BLOCKDEV_ON_ERROR_ENOSPC;
    } else if (!strcmp(buf, "stop")) {
-        return BLOCK_ERR_STOP_ANY;
+        return BLOCKDEV_ON_ERROR_STOP;
    } else if (!strcmp(buf, "report")) {
-        return BLOCK_ERR_REPORT;
+        return BLOCKDEV_ON_ERROR_REPORT;
    } else {
        error_report("'%s' invalid %s error action",
                     buf, is_read ? "read" : "write");
@ -274,7 +275,7 @@ static bool do_check_io_limits(BlockIOLimit *io_limits)
    return true;
 }

-DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
+DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type)
 {
    const char *buf;
    const char *file = NULL;
@ -324,7 +325,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
            return NULL;
 	}
    } else {
-        type = default_to_scsi ? IF_SCSI : IF_IDE;
+        type = block_default_type;
    }

    max_devs = if_max_devs[type];
@ -432,7 +433,13 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
        return NULL;
    }

-    on_write_error = BLOCK_ERR_STOP_ENOSPC;
+    if (qemu_opt_get(opts, "boot") != NULL) {
+        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
+                "ignored. Future versions will reject this parameter. Please "
+                "update your scripts.\n");
+    }
+
+    on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
    if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
        if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
            error_report("werror is not supported by this bus type");
@ -445,7 +452,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
        }
    }

-    on_read_error = BLOCK_ERR_REPORT;
+    on_read_error = BLOCKDEV_ON_ERROR_REPORT;
    if ((buf = qemu_opt_get(opts, "rerror")) != NULL) {
        if (type != IF_IDE && type != IF_VIRTIO && type != IF_SCSI && type != IF_NONE) {
            error_report("rerror is not supported by this bus type");
@ -527,6 +534,8 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
                     if_name[type], mediastr, unit_id);
    }
    dinfo->bdrv = bdrv_new(dinfo->id);
+    dinfo->bdrv->open_flags = snapshot ? BDRV_O_SNAPSHOT : 0;
+    dinfo->bdrv->read_only = ro;
    dinfo->devaddr = devaddr;
    dinfo->type = type;
    dinfo->bus = bus_id;
@ -559,7 +568,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
        break;
    case IF_VIRTIO:
        /* add virtio block device */
-        opts = qemu_opts_create(qemu_find_opts("device"), NULL, 0, NULL);
+        opts = qemu_opts_create_nofail(qemu_find_opts("device"));
        if (arch_type == QEMU_ARCH_S390X) {
            qemu_opt_set(opts, "driver", "virtio-blk-s390");
        } else {
@ -698,6 +707,7 @@ void qmp_transaction(BlockdevActionList *dev_list, Error **errp)
    int ret = 0;
    BlockdevActionList *dev_entry = dev_list;
    BlkTransactionStates *states, *next;
+    Error *local_err = NULL;

    QSIMPLEQ_HEAD(snap_bdrv_states, BlkTransactionStates) snap_bdrv_states;
    QSIMPLEQ_INIT(&snap_bdrv_states);
@ -777,12 +787,12 @@ void qmp_transaction(BlockdevActionList *dev_list, Error **errp)

        /* create new image w/backing file */
        if (mode != NEW_IMAGE_MODE_EXISTING) {
-            ret = bdrv_img_create(new_image_file, format,
-                                  states->old_bs->filename,
-                                  states->old_bs->drv->format_name,
-                                  NULL, -1, flags);
-            if (ret) {
-                error_set(errp, QERR_OPEN_FILE_FAILED, new_image_file);
+            bdrv_img_create(new_image_file, format,
+                            states->old_bs->filename,
+                            states->old_bs->drv->format_name,
+                            NULL, -1, flags, &local_err);
+            if (error_is_set(&local_err)) {
+                error_propagate(errp, local_err);
                goto delete_and_fail;
            }
        }
@ -803,6 +813,11 @@ void qmp_transaction(BlockdevActionList *dev_list, Error **errp)
    QSIMPLEQ_FOREACH(states, &snap_bdrv_states, entry) {
        /* This removes our old bs from the bdrv_states, and adds the new bs */
        bdrv_append(states->new_bs, states->old_bs);
+        /* We don't need (or want) to use the transactional
+         * bdrv_reopen_multiple() across all the entries at once, because we
+         * don't want to abort all of them if one of them fails the reopen */
+        bdrv_reopen(states->new_bs, states->new_bs->open_flags & ~BDRV_O_RDWR,
+                    NULL);
    }

    /* success */
@ -822,7 +837,6 @@ exit:
    QSIMPLEQ_FOREACH_SAFE(states, &snap_bdrv_states, entry, next) {
        g_free(states);
    }
-    return;
 }


@ -1049,26 +1063,12 @@ void qmp_block_resize(const char *device, int64_t size, Error **errp)
    }
 }

-static QObject *qobject_from_block_job(BlockJob *job)
-{
-    return qobject_from_jsonf("{ 'type': %s,"
-                              "'device': %s,"
-                              "'len': %" PRId64 ","
-                              "'offset': %" PRId64 ","
-                              "'speed': %" PRId64 " }",
-                              job->job_type->job_type,
-                              bdrv_get_device_name(job->bs),
-                              job->len,
-                              job->offset,
-                              job->speed);
-}
-
-static void block_stream_cb(void *opaque, int ret)
+static void block_job_cb(void *opaque, int ret)
 {
    BlockDriverState *bs = opaque;
    QObject *obj;

-    trace_block_stream_cb(bs, bs->job, ret);
+    trace_block_job_cb(bs, bs->job, ret);

    assert(bs->job);
    obj = qobject_from_block_job(bs->job);
@ -1088,13 +1088,18 @@ static void block_stream_cb(void *opaque, int ret)
 }

 void qmp_block_stream(const char *device, bool has_base,
-                      const char *base, bool has_speed,
-                      int64_t speed, Error **errp)
+                      const char *base, bool has_speed, int64_t speed,
+                      bool has_on_error, BlockdevOnError on_error,
+                      Error **errp)
 {
    BlockDriverState *bs;
    BlockDriverState *base_bs = NULL;
    Error *local_err = NULL;

+    if (!has_on_error) {
+        on_error = BLOCKDEV_ON_ERROR_REPORT;
+    }
+
    bs = bdrv_find(device);
    if (!bs) {
        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
@ -1110,7 +1115,7 @@ void qmp_block_stream(const char *device, bool has_base,
    }

    stream_start(bs, base_bs, base, has_speed ? speed : 0,
-                 block_stream_cb, bs, &local_err);
+                 on_error, block_job_cb, bs, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
@ -1124,6 +1129,199 @@ void qmp_block_stream(const char *device, bool has_base,
    trace_qmp_block_stream(bs, bs->job);
 }

+/*
+ * QMP handler for block-commit: start a commit job on @device.
+ *
+ * @device:    name used to look up the BlockDriverState with bdrv_find()
+ * @has_base:  true if @base was supplied
+ * @base:      file name of the base image; when absent the base is found
+ *             with bdrv_find_base() starting from the top image
+ * @top:       file name of the top image; when NULL or equal to the active
+ *             layer's file name, the active layer itself is used
+ * @has_speed: true if @speed was supplied
+ * @speed:     value handed to commit_start(); treated as 0 when absent
+ * @errp:      set if the device, top or base image cannot be found, or if
+ *             commit_start() fails
+ */
+void qmp_block_commit(const char *device,
+                      bool has_base, const char *base, const char *top,
+                      bool has_speed, int64_t speed,
+                      Error **errp)
+{
+    BlockDriverState *bs;
+    BlockDriverState *base_bs, *top_bs;
+    Error *local_err = NULL;
+    /* This will be part of the QMP command, if/when the
+     * BlockdevOnError change for blkmirror makes it in
+     */
+    BlockdevOnError on_error = BLOCKDEV_ON_ERROR_REPORT;
+
+    /* 'speed' is optional: only trust its value when the caller set it,
+     * matching what qmp_block_stream() and qmp_drive_mirror() do.
+     */
+    if (!has_speed) {
+        speed = 0;
+    }
+
+    /* drain all i/o before commits */
+    bdrv_drain_all();
+
+    bs = bdrv_find(device);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+        return;
+    }
+
+    /* default top_bs is the active layer */
+    top_bs = bs;
+
+    if (top) {
+        if (strcmp(bs->filename, top) != 0) {
+            top_bs = bdrv_find_backing_image(bs, top);
+        }
+    }
+
+    if (top_bs == NULL) {
+        error_setg(errp, "Top image file %s not found", top ? top : "NULL");
+        return;
+    }
+
+    if (has_base && base) {
+        base_bs = bdrv_find_backing_image(top_bs, base);
+    } else {
+        base_bs = bdrv_find_base(top_bs);
+    }
+
+    if (base_bs == NULL) {
+        error_set(errp, QERR_BASE_NOT_FOUND, base ? base : "NULL");
+        return;
+    }
+
+    commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs,
+                &local_err);
+    if (local_err != NULL) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    /* Grab a reference so hotplug does not delete the BlockDriverState from
+     * underneath us.
+     */
+    drive_get_ref(drive_get_by_blockdev(bs));
+}
+
+/*
+ * QMP handler that starts a mirror job from @device to the image @target.
+ *
+ * Optional arguments default as follows when their has_* flag is false:
+ * @speed to 0, @on_source_error and @on_target_error to
+ * BLOCKDEV_ON_ERROR_REPORT, @mode to NEW_IMAGE_MODE_ABSOLUTE_PATHS, and
+ * @format to the source's format name (or NULL when @mode is
+ * NEW_IMAGE_MODE_EXISTING).
+ *
+ * Unless @mode is NEW_IMAGE_MODE_EXISTING the target image is created
+ * first; it is then opened and handed to mirror_start().  On any failure
+ * @errp is set and, if the target was already allocated, it is deleted.
+ */
+void qmp_drive_mirror(const char *device, const char *target,
+                      bool has_format, const char *format,
+                      enum MirrorSyncMode sync,
+                      bool has_mode, enum NewImageMode mode,
+                      bool has_speed, int64_t speed,
+                      bool has_on_source_error, BlockdevOnError on_source_error,
+                      bool has_on_target_error, BlockdevOnError on_target_error,
+                      Error **errp)
+{
+    BlockDriverInfo bdi;
+    BlockDriverState *bs;
+    BlockDriverState *source, *target_bs;
+    BlockDriver *proto_drv;
+    BlockDriver *drv = NULL;
+    Error *local_err = NULL;
+    int flags;
+    uint64_t size;
+    int ret;
+
+    /* Fill in defaults for the optional QMP arguments */
+    if (!has_speed) {
+        speed = 0;
+    }
+    if (!has_on_source_error) {
+        on_source_error = BLOCKDEV_ON_ERROR_REPORT;
+    }
+    if (!has_on_target_error) {
+        on_target_error = BLOCKDEV_ON_ERROR_REPORT;
+    }
+    if (!has_mode) {
+        mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
+    }
+
+    bs = bdrv_find(device);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+        return;
+    }
+
+    if (!bdrv_is_inserted(bs)) {
+        error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
+        return;
+    }
+
+    /* Without an explicit format, reuse the source's format unless the
+     * target already exists; in that case format and drv both stay NULL.
+     */
+    if (!has_format) {
+        format = mode == NEW_IMAGE_MODE_EXISTING ? NULL : bs->drv->format_name;
+    }
+    if (format) {
+        drv = bdrv_find_format(format);
+        if (!drv) {
+            error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
+            return;
+        }
+    }
+
+    if (bdrv_in_use(bs)) {
+        error_set(errp, QERR_DEVICE_IN_USE, device);
+        return;
+    }
+
+    flags = bs->open_flags | BDRV_O_RDWR;
+    source = bs->backing_hd;
+    /* With no backing file a "top" sync degenerates into a full sync */
+    if (!source && sync == MIRROR_SYNC_MODE_TOP) {
+        sync = MIRROR_SYNC_MODE_FULL;
+    }
+
+    proto_drv = bdrv_find_protocol(target);
+    if (!proto_drv) {
+        /* NOTE(review): format can be NULL here (existing target without an
+         * explicit format), and the failing lookup was on target, not on
+         * format -- confirm this is the intended error argument.
+         */
+        error_set(errp, QERR_INVALID_BLOCK_FORMAT, format);
+        return;
+    }
+
+    if (sync == MIRROR_SYNC_MODE_FULL && mode != NEW_IMAGE_MODE_EXISTING) {
+        /* create new image w/o backing file */
+        assert(format && drv);
+        bdrv_get_geometry(bs, &size);
+        /* NOTE(review): presumably converts a sector count to bytes --
+         * confirm against bdrv_get_geometry() */
+        size *= 512;
+        bdrv_img_create(target, format,
+                        NULL, NULL, NULL, size, flags, &local_err);
+    } else {
+        switch (mode) {
+        case NEW_IMAGE_MODE_EXISTING:
+            /* nothing to create; ret is reassigned by bdrv_open() below */
+            ret = 0;
+            break;
+        case NEW_IMAGE_MODE_ABSOLUTE_PATHS:
+            /* create new image with backing file */
+            /* NOTE(review): source is bs->backing_hd and appears to be NULL
+             * here when there is no backing file and sync is neither full
+             * nor top -- confirm callers cannot reach this.
+             */
+            bdrv_img_create(target, format,
+                            source->filename,
+                            source->drv->format_name,
+                            NULL, -1, flags, &local_err);
+            break;
+        default:
+            abort();
+        }
+    }
+
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Open the target without its backing file for now */
+    target_bs = bdrv_new("");
+    ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING, drv);
+
+    if (ret < 0) {
+        bdrv_delete(target_bs);
+        error_set(errp, QERR_OPEN_FILE_FAILED, target);
+        return;
+    }
+
+    /* We need a backing file if we will copy parts of a cluster.  */
+    if (bdrv_get_info(target_bs, &bdi) >= 0 && bdi.cluster_size != 0 &&
+        bdi.cluster_size >= BDRV_SECTORS_PER_DIRTY_CHUNK * 512) {
+        ret = bdrv_open_backing_file(target_bs);
+        if (ret < 0) {
+            bdrv_delete(target_bs);
+            error_set(errp, QERR_OPEN_FILE_FAILED, target);
+            return;
+        }
+    }
+
+    mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
+                 block_job_cb, bs, &local_err);
+    if (local_err != NULL) {
+        /* The job never started, so the target is still ours to delete */
+        bdrv_delete(target_bs);
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    /* Grab a reference so hotplug does not delete the BlockDriverState from
+     * underneath us.
+     */
+    drive_get_ref(drive_get_by_blockdev(bs));
+}
+
 static BlockJob *find_block_job(const char *device)
 {
    BlockDriverState *bs;
@ -1140,19 +1338,28 @@ void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
    BlockJob *job = find_block_job(device);

    if (!job) {
-        error_set(errp, QERR_DEVICE_NOT_ACTIVE, device);
+        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
        return;
    }

    block_job_set_speed(job, speed, errp);
 }

-void qmp_block_job_cancel(const char *device, Error **errp)
+void qmp_block_job_cancel(const char *device,
+                          bool has_force, bool force, Error **errp)
 {
    BlockJob *job = find_block_job(device);

+    if (!has_force) {
+        force = false;
+    }
+
    if (!job) {
-        error_set(errp, QERR_DEVICE_NOT_ACTIVE, device);
+        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
+        return;
+    }
+    if (job->paused && !force) {
+        error_set(errp, QERR_BLOCK_JOB_PAUSED, device);
        return;
    }

@ -1160,25 +1367,53 @@ void qmp_block_job_cancel(const char *device, Error **errp)
    block_job_cancel(job);
 }

+/*
+ * QMP handler: pause the block job found for @device.
+ * Sets QERR_BLOCK_JOB_NOT_ACTIVE in @errp when no job is found.
+ */
+void qmp_block_job_pause(const char *device, Error **errp)
+{
+    BlockJob *active = find_block_job(device);
+
+    if (active != NULL) {
+        trace_qmp_block_job_pause(active);
+        block_job_pause(active);
+    } else {
+        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
+    }
+}
+
+/*
+ * QMP handler: resume the block job found for @device.
+ * Sets QERR_BLOCK_JOB_NOT_ACTIVE in @errp when no job is found.
+ */
+void qmp_block_job_resume(const char *device, Error **errp)
+{
+    BlockJob *active = find_block_job(device);
+
+    if (active != NULL) {
+        trace_qmp_block_job_resume(active);
+        block_job_resume(active);
+    } else {
+        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
+    }
+}
+
+/*
+ * QMP handler: ask the block job found for @device to complete.
+ * Sets QERR_BLOCK_JOB_NOT_ACTIVE in @errp when no job is found; otherwise
+ * @errp is passed on to block_job_complete().
+ */
+void qmp_block_job_complete(const char *device, Error **errp)
+{
+    BlockJob *active = find_block_job(device);
+
+    if (active != NULL) {
+        trace_qmp_block_job_complete(active);
+        block_job_complete(active, errp);
+    } else {
+        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
+    }
+}
+
 static void do_qmp_query_block_jobs_one(void *opaque, BlockDriverState *bs)
 {
    BlockJobInfoList **prev = opaque;
    BlockJob *job = bs->job;

    if (job) {
-        BlockJobInfoList *elem;
-        BlockJobInfo *info = g_new(BlockJobInfo, 1);
-        *info = (BlockJobInfo){
-            .type   = g_strdup(job->job_type->job_type),
-            .device = g_strdup(bdrv_get_device_name(bs)),
-            .len    = job->len,
-            .offset = job->offset,
-            .speed  = job->speed,
-        };
-
-        elem = g_new0(BlockJobInfoList, 1);
-        elem->value = info;
-
+        BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
+        elem->value = block_job_query(bs->job);
        (*prev)->next = elem;
        *prev = elem;
    }
--- a/blockjob.c
+++ b/blockjob.c
@ -0,0 +1,283 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "config-host.h"
+#include "qemu-common.h"
+#include "trace.h"
+#include "monitor/monitor.h"
+#include "block/block.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "qapi/qmp/qjson.h"
+#include "block/coroutine.h"
+#include "qmp-commands.h"
+#include "qemu/timer.h"
+
+/*
+ * Allocate a job of @job_type->instance_size bytes and attach it to @bs.
+ *
+ * Fails with QERR_DEVICE_IN_USE if @bs already has a job or is in use.
+ * On success @bs is marked in use, the job starts out busy, and a non-zero
+ * @speed is applied through block_job_set_speed().  If setting the speed
+ * fails, the job is detached and freed again and the error is propagated.
+ *
+ * Returns the new job, or NULL with @errp set.
+ */
+void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
+                       int64_t speed, BlockDriverCompletionFunc *cb,
+                       void *opaque, Error **errp)
+{
+    Error *speed_err = NULL;
+    BlockJob *new_job;
+
+    if (bs->job || bdrv_in_use(bs)) {
+        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
+        return NULL;
+    }
+
+    bdrv_set_in_use(bs, 1);
+
+    new_job = g_malloc0(job_type->instance_size);
+    new_job->job_type = job_type;
+    new_job->bs = bs;
+    new_job->cb = cb;
+    new_job->opaque = opaque;
+    new_job->busy = true;
+    bs->job = new_job;
+
+    /* A zero speed is never pushed to the job type, so that job types
+     * without set_speed do not fail with a NotSupported error.
+     */
+    if (speed == 0) {
+        return new_job;
+    }
+
+    block_job_set_speed(new_job, speed, &speed_err);
+    if (!error_is_set(&speed_err)) {
+        return new_job;
+    }
+
+    /* Undo everything done above */
+    bs->job = NULL;
+    g_free(new_job);
+    bdrv_set_in_use(bs, 0);
+    error_propagate(errp, speed_err);
+    return NULL;
+}
+
+/*
+ * Finish @job: invoke its completion callback with @ret, then detach it
+ * from its BlockDriverState, free it and clear the in-use mark.
+ */
+void block_job_completed(BlockJob *job, int ret)
+{
+    BlockDriverState *owner = job->bs;
+
+    assert(owner->job == job);
+
+    /* The callback runs while the job is still attached to its device */
+    job->cb(job->opaque, ret);
+
+    owner->job = NULL;
+    g_free(job);
+    bdrv_set_in_use(owner, 0);
+}
+
+/*
+ * Apply @speed to @job through its job type's set_speed hook.
+ *
+ * Sets QERR_NOT_SUPPORTED if the job type has no such hook.  job->speed is
+ * only updated when the hook reports no error; otherwise the hook's error
+ * is propagated to @errp.
+ */
+void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    Error *hook_err = NULL;
+
+    if (job->job_type->set_speed == NULL) {
+        error_set(errp, QERR_NOT_SUPPORTED);
+        return;
+    }
+
+    job->job_type->set_speed(job, speed, &hook_err);
+    if (!error_is_set(&hook_err)) {
+        job->speed = speed;
+        return;
+    }
+
+    error_propagate(errp, hook_err);
+}
+
+/*
+ * Forward a completion request to @job's job type.
+ *
+ * Sets QERR_BLOCK_JOB_NOT_READY if the job is paused or cancelled, or if
+ * its job type has no complete hook.
+ */
+void block_job_complete(BlockJob *job, Error **errp)
+{
+    bool can_complete = !job->paused && !job->cancelled &&
+                        job->job_type->complete;
+
+    if (can_complete) {
+        job->job_type->complete(job, errp);
+    } else {
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+    }
+}
+
+/*
+ * Mark @job as paused.  This only sets the flag; block_job_sleep_ns()
+ * checks it and yields instead of sleeping while it is set.
+ */
+void block_job_pause(BlockJob *job)
+{
+    job->paused = true;
+}
+
+/* Return whether @job currently has its paused flag set. */
+bool block_job_is_paused(BlockJob *job)
+{
+    return job->paused;
+}
+
+/*
+ * Clear @job's paused flag, reset its I/O status and, if the job has a
+ * coroutine that is not currently busy, re-enter that coroutine.
+ */
+void block_job_resume(BlockJob *job)
+{
+    job->paused = false;
+    block_job_iostatus_reset(job);
+
+    /* Nothing to wake up: no coroutine yet, or it is already running */
+    if (!job->co || job->busy) {
+        return;
+    }
+
+    qemu_coroutine_enter(job->co, NULL);
+}
+
+/*
+ * Flag @job as cancelled and resume it, which also clears its paused flag
+ * and re-enters the job's coroutine when it is not busy.
+ */
+void block_job_cancel(BlockJob *job)
+{
+    job->cancelled = true;
+    block_job_resume(job);
+}
+
+/* Return whether @job has been flagged as cancelled. */
+bool block_job_is_cancelled(BlockJob *job)
+{
+    return job->cancelled;
+}
+
+/*
+ * Set @job's I/O status back to BLOCK_DEVICE_IO_STATUS_OK and then call
+ * the job type's iostatus_reset hook, if it has one.
+ */
+void block_job_iostatus_reset(BlockJob *job)
+{
+    job->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+
+    if (job->job_type->iostatus_reset == NULL) {
+        return;
+    }
+    job->job_type->iostatus_reset(job);
+}
+
+/*
+ * State shared between block_job_cancel_sync() and block_job_cancel_cb()
+ * while a job is being cancelled synchronously.
+ */
+struct BlockCancelData {
+    BlockJob *job;                  /* the job being cancelled */
+    BlockDriverCompletionFunc *cb;  /* the job's original completion callback */
+    void *opaque;                   /* argument for the original callback */
+    bool cancelled;                 /* job's cancelled flag, sampled in the cb */
+    int ret;                        /* -EINPROGRESS until the cb stores a result */
+};
+
+/*
+ * Completion callback installed by block_job_cancel_sync(): record whether
+ * the job was cancelled together with its result, then chain to the
+ * original callback saved in the BlockCancelData passed as @opaque.
+ */
+static void block_job_cancel_cb(void *opaque, int ret)
+{
+    struct BlockCancelData *state = opaque;
+
+    state->cancelled = block_job_is_cancelled(state->job);
+    state->ret = ret;
+
+    state->cb(state->opaque, ret);
+}
+
+/*
+ * Cancel @job and wait until its completion callback has run.
+ *
+ * Returns -ECANCELED if the job was flagged as cancelled and completed
+ * with a result of 0; otherwise returns the result the job completed with.
+ */
+int block_job_cancel_sync(BlockJob *job)
+{
+    struct BlockCancelData data;
+    BlockDriverState *bs = job->bs;
+
+    assert(bs->job == job);
+
+    /* Set up our own callback to store the result and chain to
+     * the original callback.
+     */
+    data.job = job;
+    data.cb = job->cb;
+    data.opaque = job->opaque;
+    data.ret = -EINPROGRESS;
+    job->cb = block_job_cancel_cb;
+    job->opaque = &data;
+    block_job_cancel(job);
+    /* data.ret stays -EINPROGRESS until block_job_cancel_cb() overwrites it */
+    while (data.ret == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
+    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
+}
+
+/*
+ * Put @job to sleep for @ns nanoseconds on @clock, unless it has been
+ * cancelled (then return immediately) or paused (then yield instead of
+ * sleeping on a timer).
+ *
+ * job->busy is cleared for the duration of the sleep and set again before
+ * returning; block_job_resume() only re-enters the coroutine while busy is
+ * false.
+ */
+void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
+{
+    assert(job->busy);
+
+    /* Check cancellation *before* setting busy = false, too!  */
+    if (block_job_is_cancelled(job)) {
+        return;
+    }
+
+    job->busy = false;
+    if (block_job_is_paused(job)) {
+        /* Paused: no timer is armed here, just give up control */
+        qemu_coroutine_yield();
+    } else {
+        co_sleep_ns(clock, ns);
+    }
+    job->busy = true;
+}
+
+/*
+ * Build a newly allocated BlockJobInfo describing @job.  The type and
+ * device strings are duplicated with g_strdup(); the remaining fields are
+ * copied from the job.
+ */
+BlockJobInfo *block_job_query(BlockJob *job)
+{
+    BlockJobInfo *report = g_new0(BlockJobInfo, 1);
+
+    /* Identification */
+    report->type = g_strdup(job->job_type->job_type);
+    report->device = g_strdup(bdrv_get_device_name(job->bs));
+
+    /* Progress and rate */
+    report->len = job->len;
+    report->offset = job->offset;
+    report->speed = job->speed;
+
+    /* Run state */
+    report->busy = job->busy;
+    report->paused = job->paused;
+    report->io_status = job->iostatus;
+
+    return report;
+}
+
+/*
+ * Record an I/O error on @job, but only if its status is still OK: ENOSPC
+ * maps to BLOCK_DEVICE_IO_STATUS_NOSPACE, anything else to
+ * BLOCK_DEVICE_IO_STATUS_FAILED.
+ */
+static void block_job_iostatus_set_err(BlockJob *job, int error)
+{
+    /* An earlier error is kept as is */
+    if (job->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
+        return;
+    }
+
+    if (error == ENOSPC) {
+        job->iostatus = BLOCK_DEVICE_IO_STATUS_NOSPACE;
+    } else {
+        job->iostatus = BLOCK_DEVICE_IO_STATUS_FAILED;
+    }
+}
+
+
+/*
+ * Build a QObject dictionary with the 'type', 'device', 'len', 'offset'
+ * and 'speed' of @job.
+ */
+QObject *qobject_from_block_job(BlockJob *job)
+{
+    const char *type_name = job->job_type->job_type;
+    const char *dev_name = bdrv_get_device_name(job->bs);
+
+    return qobject_from_jsonf("{ 'type': %s,"
+                              "'device': %s,"
+                              "'len': %" PRId64 ","
+                              "'offset': %" PRId64 ","
+                              "'speed': %" PRId64 " }",
+                              type_name, dev_name,
+                              job->len, job->offset, job->speed);
+}
+
+/*
+ * Emit a QEVENT_BLOCK_JOB_READY monitor event carrying the description of
+ * @job built by qobject_from_block_job().
+ */
+void block_job_ready(BlockJob *job)
+{
+    QObject *payload;
+
+    payload = qobject_from_block_job(job);
+    monitor_protocol_event(QEVENT_BLOCK_JOB_READY, payload);
+
+    /* Drop the reference taken when the payload was created */
+    qobject_decref(payload);
+}
+
+/*
+ * Decide how @job reacts to an I/O error on @bs and carry that out.
+ *
+ * @on_err selects the policy: ENOSPC stops only for ENOSPC errors and
+ * reports everything else, STOP/REPORT/IGNORE map directly to the matching
+ * action, and any other value aborts.  A QEVENT_BLOCK_JOB_ERROR event is
+ * always emitted for the job's device.  When the action is STOP, the job
+ * is paused and the error is recorded on the job and, if @bs is not the
+ * job's own device, on @bs as well.
+ *
+ * Returns the chosen action.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
+                                        BlockdevOnError on_err,
+                                        int is_read, int error)
+{
+    BlockErrorAction verdict;
+
+    switch (on_err) {
+    case BLOCKDEV_ON_ERROR_IGNORE:
+        verdict = BDRV_ACTION_IGNORE;
+        break;
+    case BLOCKDEV_ON_ERROR_REPORT:
+        verdict = BDRV_ACTION_REPORT;
+        break;
+    case BLOCKDEV_ON_ERROR_STOP:
+        verdict = BDRV_ACTION_STOP;
+        break;
+    case BLOCKDEV_ON_ERROR_ENOSPC:
+        if (error == ENOSPC) {
+            verdict = BDRV_ACTION_STOP;
+        } else {
+            verdict = BDRV_ACTION_REPORT;
+        }
+        break;
+    default:
+        abort();
+    }
+
+    bdrv_emit_qmp_error_event(job->bs, QEVENT_BLOCK_JOB_ERROR, verdict, is_read);
+
+    if (verdict != BDRV_ACTION_STOP) {
+        return verdict;
+    }
+
+    /* Stopping: pause the job and remember why */
+    block_job_pause(job);
+    block_job_iostatus_set_err(job, error);
+    if (bs != job->bs) {
+        bdrv_iostatus_set_err(bs, error);
+    }
+    return verdict;
+}
--- a/bsd-user/elfload.c
+++ b/bsd-user/elfload.c
@ -10,7 +10,7 @@
 #include <string.h>

 #include "qemu.h"
-#include "disas.h"
+#include "disas/disas.h"

 #ifdef _ARCH_PPC64
 #undef ARCH_DLINFO
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@ -31,8 +31,8 @@
 /* For tb_lock */
 #include "cpu.h"
 #include "tcg.h"
-#include "qemu-timer.h"
-#include "envlist.h"
+#include "qemu/timer.h"
+#include "qemu/envlist.h"

 #define DEBUG_LOGFILE "/tmp/qemu.log"

--- a/bsd-user/qemu-types.h
+++ b/bsd-user/qemu-types.h
@ -1,24 +0,0 @@
-#ifndef QEMU_TYPES_H
-#define QEMU_TYPES_H
-#include "cpu.h"
-
-#ifdef TARGET_ABI32
-typedef uint32_t abi_ulong;
-typedef int32_t abi_long;
-#define TARGET_ABI_FMT_lx "%08x"
-#define TARGET_ABI_FMT_ld "%d"
-#define TARGET_ABI_FMT_lu "%u"
-#define TARGET_ABI_BITS 32
-#else
-typedef target_ulong abi_ulong;
-typedef target_long abi_long;
-#define TARGET_ABI_FMT_lx TARGET_FMT_lx
-#define TARGET_ABI_FMT_ld TARGET_FMT_ld
-#define TARGET_ABI_FMT_lu TARGET_FMT_lu
-#define TARGET_ABI_BITS TARGET_LONG_BITS
-/* for consistency, define ABI32 too */
-#if TARGET_ABI_BITS == 32
-#define TARGET_ABI32 1
-#endif
-#endif
-#endif
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@ -11,7 +11,7 @@
 #include <stdlib.h>
 #endif /* DEBUG_REMAP */

-#include "qemu-types.h"
+#include "exec/user/abitypes.h"

 enum BSDType {
    target_freebsd,
@ -23,7 +23,7 @@ extern enum BSDType bsd_type;
 #include "syscall_defs.h"
 #include "syscall.h"
 #include "target_signal.h"
-#include "gdbstub.h"
+#include "exec/gdbstub.h"

 #if defined(CONFIG_USE_NPTL)
 #define THREAD __thread
@ -146,7 +146,7 @@ int get_osversion(void);
 void fork_start(void);
 void fork_end(int child);

-#include "qemu-log.h"
+#include "qemu/log.h"

 /* strace.c */
 void
--- a/bt-host.c
+++ b/bt-host.c
@ -18,9 +18,8 @@
 */

 #include "qemu-common.h"
-#include "qemu-char.h"
-#include "net.h"
-#include "bt-host.h"
+#include "bt/bt.h"
+#include "qemu/main-loop.h"

 #ifndef _WIN32
 # include <errno.h>
--- a/bt-host.h
+++ b/bt-host.h
@ -1,9 +0,0 @@
-#ifndef BT_HOST_H
-#define BT_HOST_H
-
-struct HCIInfo;
-
-/* bt-host.c */
-struct HCIInfo *bt_host_hci(const char *id);
-
-#endif
--- a/bt-vhci.c
+++ b/bt-vhci.c
@ -18,9 +18,9 @@
 */

 #include "qemu-common.h"
-#include "qemu-char.h"
-#include "net.h"
+#include "bt/bt.h"
 #include "hw/bt.h"
+#include "qemu/main-loop.h"

 #define VHCI_DEV	"/dev/vhci"
 #define VHCI_UDEV	"/dev/hci_vhci"
--- a/buffered_file.c
+++ b/buffered_file.c
@ -1,293 +0,0 @@
-/*
- * QEMU buffered QEMUFile
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-
-#include "qemu-common.h"
-#include "hw/hw.h"
-#include "qemu-timer.h"
-#include "qemu-char.h"
-#include "buffered_file.h"
-
-//#define DEBUG_BUFFERED_FILE
-
-typedef struct QEMUFileBuffered
-{
-    BufferedPutFunc *put_buffer;
-    BufferedPutReadyFunc *put_ready;
-    BufferedWaitForUnfreezeFunc *wait_for_unfreeze;
-    BufferedCloseFunc *close;
-    void *opaque;
-    QEMUFile *file;
-    int freeze_output;
-    size_t bytes_xfer;
-    size_t xfer_limit;
-    uint8_t *buffer;
-    size_t buffer_size;
-    size_t buffer_capacity;
-    QEMUTimer *timer;
-} QEMUFileBuffered;
-
-#ifdef DEBUG_BUFFERED_FILE
-#define DPRINTF(fmt, ...) \
-    do { printf("buffered-file: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-static void buffered_append(QEMUFileBuffered *s,
-                            const uint8_t *buf, size_t size)
-{
-    if (size > (s->buffer_capacity - s->buffer_size)) {
-        void *tmp;
-
-        DPRINTF("increasing buffer capacity from %zu by %zu\n",
-                s->buffer_capacity, size + 1024);
-
-        s->buffer_capacity += size + 1024;
-
-        tmp = g_realloc(s->buffer, s->buffer_capacity);
-        if (tmp == NULL) {
-            fprintf(stderr, "qemu file buffer expansion failed\n");
-            exit(1);
-        }
-
-        s->buffer = tmp;
-    }
-
-    memcpy(s->buffer + s->buffer_size, buf, size);
-    s->buffer_size += size;
-}
-
-static void buffered_flush(QEMUFileBuffered *s)
-{
-    size_t offset = 0;
-    int error;
-
-    error = qemu_file_get_error(s->file);
-    if (error != 0) {
-        DPRINTF("flush when error, bailing: %s\n", strerror(-error));
-        return;
-    }
-
-    DPRINTF("flushing %zu byte(s) of data\n", s->buffer_size);
-
-    while (offset < s->buffer_size) {
-        ssize_t ret;
-
-        ret = s->put_buffer(s->opaque, s->buffer + offset,
-                            s->buffer_size - offset);
-        if (ret == -EAGAIN) {
-            DPRINTF("backend not ready, freezing\n");
-            s->freeze_output = 1;
-            break;
-        }
-
-        if (ret <= 0) {
-            DPRINTF("error flushing data, %zd\n", ret);
-            qemu_file_set_error(s->file, ret);
-            break;
-        } else {
-            DPRINTF("flushed %zd byte(s)\n", ret);
-            offset += ret;
-        }
-    }
-
-    DPRINTF("flushed %zu of %zu byte(s)\n", offset, s->buffer_size);
-    memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
-    s->buffer_size -= offset;
-}
-
-static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
-{
-    QEMUFileBuffered *s = opaque;
-    int offset = 0, error;
-    ssize_t ret;
-
-    DPRINTF("putting %d bytes at %" PRId64 "\n", size, pos);
-
-    error = qemu_file_get_error(s->file);
-    if (error) {
-        DPRINTF("flush when error, bailing: %s\n", strerror(-error));
-        return error;
-    }
-
-    DPRINTF("unfreezing output\n");
-    s->freeze_output = 0;
-
-    buffered_flush(s);
-
-    while (!s->freeze_output && offset < size) {
-        if (s->bytes_xfer > s->xfer_limit) {
-            DPRINTF("transfer limit exceeded when putting\n");
-            break;
-        }
-
-        ret = s->put_buffer(s->opaque, buf + offset, size - offset);
-        if (ret == -EAGAIN) {
-            DPRINTF("backend not ready, freezing\n");
-            s->freeze_output = 1;
-            break;
-        }
-
-        if (ret <= 0) {
-            DPRINTF("error putting\n");
-            qemu_file_set_error(s->file, ret);
-            offset = -EINVAL;
-            break;
-        }
-
-        DPRINTF("put %zd byte(s)\n", ret);
-        offset += ret;
-        s->bytes_xfer += ret;
-    }
-
-    if (offset >= 0) {
-        DPRINTF("buffering %d bytes\n", size - offset);
-        buffered_append(s, buf + offset, size - offset);
-        offset = size;
-    }
-
-    if (pos == 0 && size == 0) {
-        DPRINTF("file is ready\n");
-        if (s->bytes_xfer <= s->xfer_limit) {
-            DPRINTF("notifying client\n");
-            s->put_ready(s->opaque);
-        }
-    }
-
-    return offset;
-}
-
-static int buffered_close(void *opaque)
-{
-    QEMUFileBuffered *s = opaque;
-    int ret;
-
-    DPRINTF("closing\n");
-
-    while (!qemu_file_get_error(s->file) && s->buffer_size) {
-        buffered_flush(s);
-        if (s->freeze_output)
-            s->wait_for_unfreeze(s->opaque);
-    }
-
-    ret = s->close(s->opaque);
-
-    qemu_del_timer(s->timer);
-    qemu_free_timer(s->timer);
-    g_free(s->buffer);
-    g_free(s);
-
-    return ret;
-}
-
-/*
- * The meaning of the return values is:
- *   0: We can continue sending
- *   1: Time to stop
- *   negative: There has been an error
- */
-static int buffered_rate_limit(void *opaque)
-{
-    QEMUFileBuffered *s = opaque;
-    int ret;
-
-    ret = qemu_file_get_error(s->file);
-    if (ret) {
-        return ret;
-    }
-    if (s->freeze_output)
-        return 1;
-
-    if (s->bytes_xfer > s->xfer_limit)
-        return 1;
-
-    return 0;
-}
-
-static int64_t buffered_set_rate_limit(void *opaque, int64_t new_rate)
-{
-    QEMUFileBuffered *s = opaque;
-    if (qemu_file_get_error(s->file)) {
-        goto out;
-    }
-    if (new_rate > SIZE_MAX) {
-        new_rate = SIZE_MAX;
-    }
-
-    s->xfer_limit = new_rate / 10;
-    
-out:
-    return s->xfer_limit;
-}
-
-static int64_t buffered_get_rate_limit(void *opaque)
-{
-    QEMUFileBuffered *s = opaque;
-  
-    return s->xfer_limit;
-}
-
-static void buffered_rate_tick(void *opaque)
-{
-    QEMUFileBuffered *s = opaque;
-
-    if (qemu_file_get_error(s->file)) {
-        buffered_close(s);
-        return;
-    }
-
-    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
-
-    if (s->freeze_output)
-        return;
-
-    s->bytes_xfer = 0;
-
-    buffered_flush(s);
-
-    /* Add some checks around this */
-    s->put_ready(s->opaque);
-}
-
-QEMUFile *qemu_fopen_ops_buffered(void *opaque,
-                                  size_t bytes_per_sec,
-                                  BufferedPutFunc *put_buffer,
-                                  BufferedPutReadyFunc *put_ready,
-                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
-                                  BufferedCloseFunc *close)
-{
-    QEMUFileBuffered *s;
-
-    s = g_malloc0(sizeof(*s));
-
-    s->opaque = opaque;
-    s->xfer_limit = bytes_per_sec / 10;
-    s->put_buffer = put_buffer;
-    s->put_ready = put_ready;
-    s->wait_for_unfreeze = wait_for_unfreeze;
-    s->close = close;
-
-    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
-                             buffered_close, buffered_rate_limit,
-                             buffered_set_rate_limit,
-			     buffered_get_rate_limit);
-
-    s->timer = qemu_new_timer_ms(rt_clock, buffered_rate_tick, s);
-
-    qemu_mod_timer(s->timer, qemu_get_clock_ms(rt_clock) + 100);
-
-    return s->file;
-}
--- a/buffered_file.h
+++ b/buffered_file.h
@ -1,30 +0,0 @@
-/*
- * QEMU buffered QEMUFile
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_BUFFERED_FILE_H
-#define QEMU_BUFFERED_FILE_H
-
-#include "hw/hw.h"
-
-typedef ssize_t (BufferedPutFunc)(void *opaque, const void *data, size_t size);
-typedef void (BufferedPutReadyFunc)(void *opaque);
-typedef void (BufferedWaitForUnfreezeFunc)(void *opaque);
-typedef int (BufferedCloseFunc)(void *opaque);
-
-QEMUFile *qemu_fopen_ops_buffered(void *opaque, size_t xfer_limit,
-                                  BufferedPutFunc *put_buffer,
-                                  BufferedPutReadyFunc *put_ready,
-                                  BufferedWaitForUnfreezeFunc *wait_for_unfreeze,
-                                  BufferedCloseFunc *close);
-
-#endif
--- a/cache-utils.c
+++ b/cache-utils.c
@ -1,4 +1,4 @@
-#include "cache-utils.h"
+#include "qemu/cache-utils.h"

 #if defined(_ARCH_PPC)
 struct qemu_cache_conf qemu_cache_conf = {
--- a/cmd.c
+++ b/cmd.c
@ -24,8 +24,8 @@
 #include <getopt.h>

 #include "cmd.h"
-#include "qemu-aio.h"
-#include "main-loop.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"

 #define _(x)	x	/* not gettext support yet */

--- a/compatfd.c
+++ b/compatfd.c
@ -14,7 +14,7 @@
 */

 #include "qemu-common.h"
-#include "compatfd.h"
+#include "qemu/compatfd.h"

 #include <sys/syscall.h>
 #include <pthread.h>
--- a/636
+++ b/636
--- a/coroutine-gthread.c
+++ b/coroutine-gthread.c
@ -20,7 +20,7 @@

 #include <glib.h>
 #include "qemu-common.h"
-#include "qemu-coroutine-int.h"
+#include "block/coroutine_int.h"

 typedef struct {
    Coroutine base;
--- a/coroutine-sigaltstack.c
+++ b/coroutine-sigaltstack.c
@ -31,7 +31,7 @@
 #include <pthread.h>
 #include <signal.h>
 #include "qemu-common.h"
-#include "qemu-coroutine-int.h"
+#include "block/coroutine_int.h"

 enum {
    /* Maximum free pool size prevents holding too many freed coroutines */
@ -171,8 +171,8 @@ static Coroutine *coroutine_new(void)
    CoroutineThreadState *coTS;
    struct sigaction sa;
    struct sigaction osa;
-    struct sigaltstack ss;
-    struct sigaltstack oss;
+    stack_t ss;
+    stack_t oss;
    sigset_t sigs;
    sigset_t osigs;
    jmp_buf old_env;
--- a/coroutine-ucontext.c
+++ b/coroutine-ucontext.c
@ -28,7 +28,7 @@
 #include <pthread.h>
 #include <ucontext.h>
 #include "qemu-common.h"
-#include "qemu-coroutine-int.h"
+#include "block/coroutine_int.h"

 #ifdef CONFIG_VALGRIND_H
 #include <valgrind/valgrind.h>
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .1.50
 .3.50