From 777d05ba477dde63a097ad12f1bb286f6ab7c4cc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Oct 2017 16:35:53 +0200 Subject: [PATCH 01/29] checkpatch: refine mode selection stgit produces patch files that lack the ".patch" extension. Others might be using ".diff" too. But since we are already limiting source files to only a handful of extensions, we can reuse that in the mode selection code. While at it, do not match "../foo" as a branch name. Reviewed-by: Daniel P. Berrange Signed-off-by: Paolo Bonzini --- scripts/checkpatch.pl | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0c41f1212f..ce9f08eb9f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -11,6 +11,8 @@ use warnings; my $P = $0; $P =~ s@.*/@@g; +our $SrcFile = qr{\.(?:h|c|cpp|s|S|pl|py|sh)$}; + my $V = '0.31'; use Getopt::Long qw(:config no_auto_abbrev); @@ -101,30 +103,29 @@ if ($#ARGV < 0) { } if (!defined $chk_branch && !defined $chk_patch && !defined $file) { - $chk_branch = $ARGV[0] =~ /\.\./ ? 1 : 0; - $chk_patch = $chk_branch ? 0 : - $ARGV[0] =~ /\.patch$/ || $ARGV[0] eq "-" ? 1 : 0; - $file = $chk_branch || $chk_patch ? 0 : 1; + $chk_branch = $ARGV[0] =~ /.\.\./ ? 1 : 0; + $file = $ARGV[0] =~ /$SrcFile/ ? 1 : 0; + $chk_patch = $chk_branch || $file ? 0 : 1; } elsif (!defined $chk_branch && !defined $chk_patch) { if ($file) { $chk_branch = $chk_patch = 0; } else { - $chk_branch = $ARGV[0] =~ /\.\./ ? 1 : 0; + $chk_branch = $ARGV[0] =~ /.\.\./ ? 1 : 0; $chk_patch = $chk_branch ? 0 : 1; } } elsif (!defined $chk_branch && !defined $file) { if ($chk_patch) { $chk_branch = $file = 0; } else { - $chk_branch = $ARGV[0] =~ /\.\./ ? 1 : 0; + $chk_branch = $ARGV[0] =~ /.\.\./ ? 1 : 0; $file = $chk_branch ? 0 : 1; } } elsif (!defined $chk_patch && !defined $file) { if ($chk_branch) { $chk_patch = $file = 0; } else { - $chk_patch = $ARGV[0] =~ /\.patch$/ || $ARGV[0] eq "-" ? 1 : 0; - $file = $chk_patch ? 0 : 1; + $file = $ARGV[0] =~ /$SrcFile/ ? 1 : 0; + $chk_patch = $file ? 0 : 1; } } elsif (!defined $chk_branch) { $chk_branch = $chk_patch || $file ? 0 : 1; @@ -1443,7 +1444,7 @@ sub process { } # check we are in a valid source file if not then ignore this hunk - next if ($realfile !~ /\.(h|c|cpp|s|S|pl|py|sh)$/); + next if ($realfile !~ /$SrcFile/); #90 column limit if ($line =~ /^\+/ && From 070f80095ad5b1143b50d2faffd2b1a84292e00d Mon Sep 17 00:00:00 2001 From: "Daniel P. Berrange" Date: Wed, 4 Oct 2017 12:40:07 +0100 Subject: [PATCH 02/29] scsi-disk: support reporting of rotation rate The Linux kernel will query the SCSI "Block device characteristics" VPD to determine the rotations per minute of the disk. If this has the value 1, it is taken to be an SSD and so Linux sets the 'rotational' flag to 0 for the I/O queue and will stop using that disk as a source of random entropy. Other operating systems may also take into account rotation rate when setting up default behaviour. Mgmt apps should be able to set the rotation rate for virtualized block devices, based on characteristics of the host storage in use, so that the guest OS gets sensible behaviour out of the box. This patch thus adds a 'rotation-rate' parameter for 'scsi-hd' and 'scsi-block' device types. For the latter, this parameter will be ignored unless the host device has TYPE_DISK.
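For example, a virtual disk could be advertised to the guest as an SSD along these lines (an illustrative command line, not part of this patch; the controller, drive id and image name are placeholders):

  -device virtio-scsi-pci \
  -drive if=none,id=hd0,file=disk.qcow2 \
  -device scsi-hd,drive=hd0,rotation_rate=1

Signed-off-by: Daniel P.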
Berrange Message-Id: <20171004114008.14849-2-berrange@redhat.com> Signed-off-by: Paolo Bonzini --- hw/scsi/scsi-disk.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 6e841fb5ff..a518080e7d 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -104,6 +104,14 @@ typedef struct SCSIDiskState char *product; bool tray_open; bool tray_locked; + /* + * 0x0000 - rotation rate not reported + * 0x0001 - non-rotating medium (SSD) + * 0x0002-0x0400 - reserved + * 0x0401-0xfffe - rotations per minute + * 0xffff - reserved + */ + uint16_t rotation_rate; } SCSIDiskState; static bool scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed); @@ -605,6 +613,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) outbuf[buflen++] = 0x83; // device identification if (s->qdev.type == TYPE_DISK) { outbuf[buflen++] = 0xb0; // block limits + outbuf[buflen++] = 0xb1; /* block device characteristics */ outbuf[buflen++] = 0xb2; // thin provisioning } break; @@ -747,6 +756,15 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf) outbuf[43] = max_io_sectors & 0xff; break; } + case 0xb1: /* block device characteristics */ + { + buflen = 8; + outbuf[4] = (s->rotation_rate >> 8) & 0xff; + outbuf[5] = s->rotation_rate & 0xff; + outbuf[6] = 0; + outbuf[7] = 0; + break; + } case 0xb2: /* thin provisioning */ { buflen = 8; @@ -2911,6 +2929,7 @@ static Property scsi_hd_properties[] = { DEFAULT_MAX_UNMAP_SIZE), DEFINE_PROP_UINT64("max_io_size", SCSIDiskState, max_io_size, DEFAULT_MAX_IO_SIZE), + DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0), DEFINE_BLOCK_CHS_PROPERTIES(SCSIDiskState, qdev.conf), DEFINE_PROP_END_OF_LIST(), }; @@ -2982,6 +3001,7 @@ static const TypeInfo scsi_cd_info = { static Property scsi_block_properties[] = { DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf), \ DEFINE_PROP_DRIVE("drive", SCSIDiskState, qdev.conf.blk), + DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0), DEFINE_PROP_END_OF_LIST(), }; From 3b19f4506901ecce25ff36cf62353a2b4bfe4f2b Mon Sep 17 00:00:00 2001 From: "Daniel P. Berrange" Date: Wed, 4 Oct 2017 12:40:08 +0100 Subject: [PATCH 03/29] ide: support reporting of rotation rate The Linux kernel will query the ATA IDENTIFY DEVICE data, word 217, to determine the rotations per minute of the disk. If this has the value 1, it is taken to be an SSD and so Linux sets the 'rotational' flag to 0 for the I/O queue and will stop using that disk as a source of random entropy. Other operating systems may also take into account rotation rate when setting up default behaviour. Mgmt apps should be able to set the rotation rate for virtualized block devices, based on characteristics of the host storage in use, so that the guest OS gets sensible behaviour out of the box. This patch thus adds a 'rotation-rate' parameter for 'ide-hd' device types.
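With a matching -drive, this could look roughly as follows (illustrative only; drive id and image name are placeholders):

  -drive if=none,id=hd0,file=disk.qcow2 \
  -device ide-hd,drive=hd0,rotation_rate=1

A Linux guest should then report 0 in /sys/block/<disk>/queue/rotational for that disk.

Signed-off-by: Daniel P.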
Berrange Message-Id: <20171004114008.14849-3-berrange@redhat.com> Reviewed-by: John Snow Signed-off-by: Paolo Bonzini --- hw/ide/core.c | 1 + hw/ide/qdev.c | 1 + include/hw/ide/internal.h | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/hw/ide/core.c b/hw/ide/core.c index 5f1cd3b91f..a04766aee7 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -208,6 +208,7 @@ static void ide_identify(IDEState *s) if (dev && dev->conf.discard_granularity) { put_le16(p + 169, 1); /* TRIM support */ } + put_le16(p + 217, dev->rotation_rate); /* Nominal media rotation rate */ ide_identify_size(s); s->identify_set = 1; diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c index d60ac25be0..a5181b4448 100644 --- a/hw/ide/qdev.c +++ b/hw/ide/qdev.c @@ -299,6 +299,7 @@ static Property ide_hd_properties[] = { DEFINE_BLOCK_CHS_PROPERTIES(IDEDrive, dev.conf), DEFINE_PROP_BIOS_CHS_TRANS("bios-chs-trans", IDEDrive, dev.chs_trans, BIOS_ATA_TRANSLATION_AUTO), + DEFINE_PROP_UINT16("rotation_rate", IDEDrive, dev.rotation_rate, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h index e641012b48..31851b44d1 100644 --- a/include/hw/ide/internal.h +++ b/include/hw/ide/internal.h @@ -508,6 +508,14 @@ struct IDEDevice { char *serial; char *model; uint64_t wwn; + /* + * 0x0000 - rotation rate not reported + * 0x0001 - non-rotating medium (SSD) + * 0x0002-0x0400 - reserved + * 0x0401-0xfffe - rotations per minute + * 0xffff - reserved + */ + uint16_t rotation_rate; }; /* These are used for the error_status field of IDEBus */ From 9cca7578b45ac5b10c4cdb3dd7e08bb28c766c6d Mon Sep 17 00:00:00 2001 From: "Daniel P. Berrange" Date: Thu, 5 Oct 2017 16:50:57 +0100 Subject: [PATCH 04/29] char: don't skip client cleanup if 'connected' flag is unset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tcp_chr_free_connection & tcp_chr_disconnect methods both skip all of their cleanup work unless the 's->connected' flag is set. This flag is set when the incoming client connection is ready to use. Crucially this is *after* the TLS handshake has been completed. So if the TLS handshake fails and we try to clean up the failed client, all the cleanup is skipped as 's->connected' is still false. The only important thing that should be skipped in this case is sending of the CHR_EVENT_CLOSED event, because we never got as far as sending the corresponding CHR_EVENT_OPENED. Every other bit of cleanup can be robust against being called even when s->connected is false. Signed-off-by: Daniel P. 
Berrange Message-Id: <20171005155057.7664-1-berrange@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- chardev/char-socket.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index e65148fe97..53eda8ef00 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -332,10 +332,6 @@ static void tcp_chr_free_connection(Chardev *chr) SocketChardev *s = SOCKET_CHARDEV(chr); int i; - if (!s->connected) { - return; - } - if (s->read_msgfds_num) { for (i = 0; i < s->read_msgfds_num; i++) { close(s->read_msgfds[i]); @@ -394,22 +390,25 @@ static void update_disconnected_filename(SocketChardev *s) s->is_listen, s->is_telnet); } +/* NB may be called even if tcp_chr_connect has not been + * reached, due to TLS or telnet initialization failure, + * so can *not* assume s->connected == true + */ static void tcp_chr_disconnect(Chardev *chr) { SocketChardev *s = SOCKET_CHARDEV(chr); - - if (!s->connected) { - return; - } + bool emit_close = s->connected; tcp_chr_free_connection(chr); - if (s->listen_ioc) { + if (s->listen_ioc && s->listen_tag == 0) { s->listen_tag = qio_channel_add_watch( QIO_CHANNEL(s->listen_ioc), G_IO_IN, tcp_chr_accept, chr, NULL); } update_disconnected_filename(s); - qemu_chr_be_event(chr, CHR_EVENT_CLOSED); + if (emit_close) { + qemu_chr_be_event(chr, CHR_EVENT_CLOSED); + } if (s->reconnect_time) { qemu_chr_socket_restart_timer(chr); } From d5e5fafd11be4458443c43f19c1ebdd24d99a751 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Oct 2017 11:42:45 +0200 Subject: [PATCH 05/29] exec: add page_mask for flatview_do_translate The function is originally used for flatview_translate() and what we care about most is the (xlat, plen) range. However, for IOTLB requests we don't really care about "plen"; what matters is the size of the page that "xlat" is located on, and plen cannot really convey this information. A simple example to show why "plen" is not good for IOTLB translations: E.g., for huge pages, it is possible that the guest mapped a 1G huge page on the device side that used this GPA range: 0x100000000 - 0x13fffffff Then let's say we want to translate one IOVA that finally mapped to GPA 0x13ffffe00 (which is located on this 1G huge page). Then here we'll get: (xlat, plen) = (0x13ffffe00, 0x200) So the IOTLB would only cover a very small range, since from "plen" (which is 0x200 bytes) we cannot tell the size of the page. Actually we do know that this is a huge page - we just throw the information away in flatview_do_translate(). This patch introduces an optional "page_mask" parameter to capture that page mask info. Also, I made "plen" an optional parameter as well, with some comments for the whole function. No functional change yet. 
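To make the new contract concrete, a caller that only needs the page size information would look roughly like this (a sketch using the names introduced by this patch; either out-parameter may be NULL when unused):

  hwaddr xlat, page_mask;

  /* plen_out is NULL: we only want the page mask of the (possibly
   * huge) page backing the translated address, not the length. */
  section = flatview_do_translate(fv, addr, &xlat, NULL, &page_mask,
                                  is_write, false, &as);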
Signed-off-by: Peter Xu Signed-off-by: Maxime Coquelin Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com> Signed-off-by: Paolo Bonzini --- exec.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/exec.c b/exec.c index 6378714a2b..ca520ac62d 100644 --- a/exec.c +++ b/exec.c @@ -465,11 +465,29 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x return section; } -/* Called from RCU critical section */ +/** + * flatview_do_translate - translate an address in FlatView + * + * @fv: the flat view that we want to translate on + * @addr: the address to be translated in above address space + * @xlat: the translated address offset within memory region. It + * cannot be @NULL. + * @plen_out: valid read/write length of the translated address. It + * can be @NULL when we don't care about it. + * @page_mask_out: page mask for the translated address. This + * should only be meaningful for IOMMU translated + * addresses, since there may be huge pages that this bit + * would tell. It can be @NULL if we don't care about it. + * @is_write: whether the translation operation is for write + * @is_mmio: whether this can be MMIO, set true if it can + * + * This function is called from RCU critical section + */ static MemoryRegionSection flatview_do_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, - hwaddr *plen, + hwaddr *plen_out, + hwaddr *page_mask_out, bool is_write, bool is_mmio, AddressSpace **target_as) @@ -478,11 +496,17 @@ static MemoryRegionSection flatview_do_translate(FlatView *fv, MemoryRegionSection *section; IOMMUMemoryRegion *iommu_mr; IOMMUMemoryRegionClass *imrc; + hwaddr page_mask = (hwaddr)(-1); + hwaddr plen = (hwaddr)(-1); + + if (plen_out) { + plen = *plen_out; + } for (;;) { section = address_space_translate_internal( flatview_to_dispatch(fv), addr, &addr, - plen, is_mmio); + &plen, is_mmio); iommu_mr = memory_region_get_iommu(section->mr); if (!iommu_mr) { @@ -494,7 +518,8 @@ static MemoryRegionSection flatview_do_translate(FlatView *fv, IOMMU_WO : IOMMU_RO); addr = ((iotlb.translated_addr & ~iotlb.addr_mask) | (addr & iotlb.addr_mask)); - *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1); + page_mask &= iotlb.addr_mask; + plen = MIN(plen, (addr | iotlb.addr_mask) - addr + 1); if (!(iotlb.perm & (1 << is_write))) { goto translate_fail; } @@ -505,6 +530,19 @@ static MemoryRegionSection flatview_do_translate(FlatView *fv, *xlat = addr; + if (page_mask == (hwaddr)(-1)) { + /* Not behind an IOMMU, use default page size. */ + page_mask = ~TARGET_PAGE_MASK; + } + + if (page_mask_out) { + *page_mask_out = page_mask; + } + + if (plen_out) { + *plen_out = plen; + } + return *section; translate_fail: @@ -523,7 +561,7 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, /* This can never be MMIO. */ section = flatview_do_translate(address_space_to_flatview(as), addr, - &xlat, &plen, is_write, false, &as); + &xlat, &plen, NULL, is_write, false, &as); /* Illegal translation */ if (section.mr == &io_mem_unassigned) { @@ -567,7 +605,8 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, AddressSpace *as = NULL; /* This can be MMIO, so setup MMIO bit. 
*/ - section = flatview_do_translate(fv, addr, xlat, plen, is_write, true, &as); + section = flatview_do_translate(fv, addr, xlat, plen, NULL, + is_write, true, &as); mr = section.mr; if (xen_enabled() && memory_access_is_direct(mr, is_write)) { From 076a93d7972c9c1e3839d2f65edc32568a2cce93 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 10 Oct 2017 11:42:46 +0200 Subject: [PATCH 06/29] exec: simplify address_space_get_iotlb_entry This patch lets address_space_get_iotlb_entry() use the newly introduced page_mask parameter in flatview_do_translate(). This way we can be sure the IOTLB is aligned to the page mask, and we now nicely support huge pages, which had not worked since a764040 was introduced. Fixes: a764040 ("exec: abstract address_space_do_translate()") Signed-off-by: Peter Xu Signed-off-by: Maxime Coquelin Acked-by: Michael S. Tsirkin Message-Id: <20171010094247.10173-3-maxime.coquelin@redhat.com> Signed-off-by: Paolo Bonzini --- exec.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/exec.c b/exec.c index ca520ac62d..6579e7ac4f 100644 --- a/exec.c +++ b/exec.c @@ -554,14 +554,14 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, bool is_write) { MemoryRegionSection section; - hwaddr xlat, plen; + hwaddr xlat, page_mask; - /* Try to get maximum page mask during translation. */ - plen = (hwaddr)-1; - - /* This can never be MMIO. */ - section = flatview_do_translate(address_space_to_flatview(as), addr, - &xlat, &plen, NULL, is_write, false, &as); + /* + * This can never be MMIO, and we don't really care about plen, + * but page mask. + */ + section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat, + NULL, &page_mask, is_write, false, &as); /* Illegal translation */ if (section.mr == &io_mem_unassigned) { @@ -572,22 +572,11 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, xlat += section.offset_within_address_space - section.offset_within_region; - if (plen == (hwaddr)-1) { - /* - * We use default page size here. Logically it only happens - * for identity mappings. - */ - plen = TARGET_PAGE_SIZE; - } - - /* Convert to address mask */ - plen -= 1; - return (IOMMUTLBEntry) { .target_as = as, - .iova = addr & ~plen, - .translated_addr = xlat & ~plen, - .addr_mask = plen, + .iova = addr & ~page_mask, + .translated_addr = xlat & ~page_mask, + .addr_mask = page_mask, /* IOTLBs are for DMAs, and DMA only allows on RAMs. */ .perm = IOMMU_RW, }; From b021d1c04452276f4926eed2d104ccbd1037a6e1 Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Tue, 10 Oct 2017 11:42:47 +0200 Subject: [PATCH 07/29] memory: fix off-by-one error in memory_region_notify_one() This patch fixes an off-by-one error that could lead the notifyee to receive notifications for ranges it is not registered to: an entry covers [iova, iova + addr_mask], so a notifier whose range starts exactly at iova + addr_mask + 1 does not overlap it, yet the old test failed to skip it. The bug has been spotted by code review. Fixes: bd2bfa4c52e5 ("memory: introduce memory_region_notify_one()") Cc: qemu-stable@nongnu.org Cc: Peter Xu Signed-off-by: Maxime Coquelin Message-Id: <20171010094247.10173-4-maxime.coquelin@redhat.com> Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memory.c b/memory.c index 5e6351a6c1..b637c12bad 100644 --- a/memory.c +++ b/memory.c @@ -1892,7 +1892,7 @@ void memory_region_notify_one(IOMMUNotifier *notifier, * Skip the notification if the notification does not overlap * with registered range. 
*/ - if (notifier->start > entry->iova + entry->addr_mask + 1 || + if (notifier->start > entry->iova + entry->addr_mask || notifier->end < entry->iova) { return; } From 6970c5ff13a47df7ce41b901a4459c587a03d16b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 10 Oct 2017 14:34:39 +0200 Subject: [PATCH 08/29] pc: make sure that plugged CPUs are of the same type Heterogeneous CPUs are not supported, and hotplugging a different CPU model crashes QEMU: qemu-system-x86_64 -cpu qemu64 -smp 1,maxcpus=2 (qemu) device_add host-x86_64-cpu,socket-id=1,core-id=0,thread-id=0,id=foo (qemu) info cpus error: failed to get MSR 0x38d qemu-system-x86_64: target/i386/kvm.c:2121: kvm_get_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. Aborted (core dumped) Gracefully fail the hotplug process in case of such a user mistake. Reported-by: Greg Kurz Signed-off-by: Igor Mammedov Message-Id: <1507638879-200718-1-git-send-email-imammedo@redhat.com> Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 05985d4927..8e307f7aff 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1876,8 +1876,15 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev, CPUArchId *cpu_slot; X86CPUTopoInfo topo; X86CPU *cpu = X86_CPU(dev); + MachineState *ms = MACHINE(hotplug_dev); PCMachineState *pcms = PC_MACHINE(hotplug_dev); + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + /* if APIC ID is not set, set it based on socket/core/thread properties */ if (cpu->apic_id == UNASSIGNED_APIC_ID) { int max_socket = (max_cpus - 1) / smp_threads / smp_cores; From eb584b401fdc0866d2ff0c03ab8b09d2ba04a49b Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 27 Sep 2017 16:58:33 +0200 Subject: [PATCH 09/29] disas: Always initialize read_memory_inner_func properly I've recently seen this with valgrind while running the HMP tester: ==22373== Conditional jump or move depends on uninitialised value(s) ==22373== at 0x4A41FD: arm_disas_set_info (cpu.c:504) ==22373== by 0x3867A7: monitor_disas (disas.c:390) ==22373== by 0x38E80E: memory_dump (monitor.c:1339) ==22373== by 0x38FA43: handle_hmp_command (monitor.c:3123) ==22373== by 0x38FB9E: qmp_human_monitor_command (monitor.c:613) ==22373== by 0x4E3124: qmp_marshal_human_monitor_command (qmp-marshal.c:1736) ==22373== by 0x769678: do_qmp_dispatch (qmp-dispatch.c:104) ==22373== by 0x769678: qmp_dispatch (qmp-dispatch.c:131) ==22373== by 0x38B734: handle_qmp_command (monitor.c:3853) ==22373== by 0x76ED07: json_message_process_token (json-streamer.c:105) ==22373== by 0x78D40A: json_lexer_feed_char (json-lexer.c:323) ==22373== by 0x78D4CD: json_lexer_feed (json-lexer.c:373) ==22373== by 0x38A08D: monitor_qmp_read (monitor.c:3895) And indeed, in monitor_disas, the read_memory_inner_func variable was not initialized, but arm_disas_set_info() expects this to be NULL or a valid pointer. Let's properly set this to NULL in the INIT_DISASSEMBLE_INFO macro to fix it in all functions that use the disassemble_info struct. 
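The failure mode in miniature (an illustrative sketch, not the actual QEMU code):

  CPUDebug s;                               /* on the stack, not zeroed */
  INIT_DISASSEMBLE_INFO(s.info, out, fprintf);
  /* Before this fix, the macro left s.info.read_memory_inner_func
   * untouched, so the disas_set_info hook (e.g. arm_disas_set_info)
   * went on to test an uninitialized pointer. */
  cc->disas_set_info(cpu, &s.info);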
Fixes: f7478a92dd9ee2276bfaa5b7317140d3f9d6a53b ("Fix Thumb-1 BE32 execution") Signed-off-by: Thomas Huth Message-Id: <1506524313-20037-1-git-send-email-thuth@redhat.com> --- disas.c | 1 - include/disas/bfd.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/disas.c b/disas.c index d6a1eb9c8e..54eea3f9c9 100644 --- a/disas.c +++ b/disas.c @@ -190,7 +190,6 @@ void target_disas(FILE *out, CPUState *cpu, target_ulong code, s.cpu = cpu; s.info.read_memory_func = target_read_memory; - s.info.read_memory_inner_func = NULL; s.info.buffer_vma = code; s.info.buffer_length = size; s.info.print_address_func = generic_print_address; diff --git a/include/disas/bfd.h b/include/disas/bfd.h index b01e002b4c..d99da68267 100644 --- a/include/disas/bfd.h +++ b/include/disas/bfd.h @@ -479,6 +479,7 @@ int generic_symbol_at_address(bfd_vma, struct disassemble_info *); (INFO).buffer_vma = 0, \ (INFO).buffer_length = 0, \ (INFO).read_memory_func = buffer_read_memory, \ + (INFO).read_memory_inner_func = NULL, \ (INFO).memory_error_func = perror_memory, \ (INFO).print_address_func = generic_print_address, \ (INFO).print_insn = NULL, \ From 7271a81949ee9806705d51618379246fb2b72209 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 14 Jul 2017 11:51:41 +0200 Subject: [PATCH 10/29] build: remove CONFIG_LIBDECNUMBER It is used by all PPC targets; we can give the directory its own Makefile.objs file, and include it directly from target/ppc. target/s390 can do the same when it starts using it. Signed-off-by: Paolo Bonzini --- Makefile.target | 6 ------ default-configs/ppc-linux-user.mak | 1 - default-configs/ppc-softmmu.mak | 1 - default-configs/ppc64-linux-user.mak | 1 - default-configs/ppc64-softmmu.mak | 1 - default-configs/ppc64abi32-linux-user.mak | 1 - default-configs/ppc64le-linux-user.mak | 1 - default-configs/ppcemb-softmmu.mak | 1 - libdecnumber/Makefile.objs | 5 +++++ target/ppc/Makefile.objs | 1 + 10 files changed, 6 insertions(+), 13 deletions(-) create mode 100644 libdecnumber/Makefile.objs diff --git a/Makefile.target b/Makefile.target index a4b292d2e2..e4244c188a 100644 --- a/Makefile.target +++ b/Makefile.target @@ -102,12 +102,6 @@ obj-y += target/$(TARGET_BASE_ARCH)/ obj-y += disas.o obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o -obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/decContext.o -obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/decNumber.o -obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal32.o -obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal64.o -obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o - ######################################################### # Linux user emulator target diff --git a/default-configs/ppc-linux-user.mak b/default-configs/ppc-linux-user.mak index 260ba41836..6273df2930 100644 --- a/default-configs/ppc-linux-user.mak +++ b/default-configs/ppc-linux-user.mak @@ -1,2 +1 @@ # Default configuration for ppc-linux-user -CONFIG_LIBDECNUMBER=y diff --git a/default-configs/ppc-softmmu.mak b/default-configs/ppc-softmmu.mak index d7a3755881..bb225c6e46 100644 --- a/default-configs/ppc-softmmu.mak +++ b/default-configs/ppc-softmmu.mak @@ -46,7 +46,6 @@ CONFIG_E500=y CONFIG_OPENPIC_KVM=$(call land,$(CONFIG_E500),$(CONFIG_KVM)) CONFIG_PLATFORM_BUS=y CONFIG_ETSEC=y -CONFIG_LIBDECNUMBER=y CONFIG_SM501=y # For PReP CONFIG_SERIAL_ISA=y diff --git a/default-configs/ppc64-linux-user.mak b/default-configs/ppc64-linux-user.mak index e731ce016e..422d3fbaeb 100644 --- a/default-configs/ppc64-linux-user.mak +++ 
b/default-configs/ppc64-linux-user.mak @@ -1,2 +1 @@ # Default configuration for ppc64-linux-user -CONFIG_LIBDECNUMBER=y diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak index 9086475bf6..d1b3a6dd50 100644 --- a/default-configs/ppc64-softmmu.mak +++ b/default-configs/ppc64-softmmu.mak @@ -51,7 +51,6 @@ CONFIG_E500=y CONFIG_OPENPIC_KVM=$(call land,$(CONFIG_E500),$(CONFIG_KVM)) CONFIG_PLATFORM_BUS=y CONFIG_ETSEC=y -CONFIG_LIBDECNUMBER=y CONFIG_SM501=y # For pSeries CONFIG_XICS=$(CONFIG_PSERIES) diff --git a/default-configs/ppc64abi32-linux-user.mak b/default-configs/ppc64abi32-linux-user.mak index c244d07d56..1c657ec9bb 100644 --- a/default-configs/ppc64abi32-linux-user.mak +++ b/default-configs/ppc64abi32-linux-user.mak @@ -1,2 +1 @@ # Default configuration for ppc64abi32-linux-user -CONFIG_LIBDECNUMBER=y diff --git a/default-configs/ppc64le-linux-user.mak b/default-configs/ppc64le-linux-user.mak index 4ba4eae6ce..63f4269023 100644 --- a/default-configs/ppc64le-linux-user.mak +++ b/default-configs/ppc64le-linux-user.mak @@ -1,2 +1 @@ # Default configuration for ppc64le-linux-user -CONFIG_LIBDECNUMBER=y diff --git a/default-configs/ppcemb-softmmu.mak b/default-configs/ppcemb-softmmu.mak index 635923a166..13917fb7a3 100644 --- a/default-configs/ppcemb-softmmu.mak +++ b/default-configs/ppcemb-softmmu.mak @@ -15,5 +15,4 @@ CONFIG_PTIMER=y CONFIG_I8259=y CONFIG_XILINX=y CONFIG_XILINX_ETHLITE=y -CONFIG_LIBDECNUMBER=y CONFIG_SM501=y diff --git a/libdecnumber/Makefile.objs b/libdecnumber/Makefile.objs new file mode 100644 index 0000000000..d81db0443a --- /dev/null +++ b/libdecnumber/Makefile.objs @@ -0,0 +1,5 @@ +obj-y += decContext.o +obj-y += decNumber.o +obj-y += dpd/decimal32.o +obj-y += dpd/decimal64.o +obj-y += dpd/decimal128.o diff --git a/target/ppc/Makefile.objs b/target/ppc/Makefile.objs index f92ba67ebd..e8fa18ce13 100644 --- a/target/ppc/Makefile.objs +++ b/target/ppc/Makefile.objs @@ -15,5 +15,6 @@ obj-y += int_helper.o obj-y += timebase_helper.o obj-y += misc_helper.o obj-y += mem_helper.o +obj-y += ../../libdecnumber/ obj-$(CONFIG_USER_ONLY) += user_only_helper.o obj-y += gdbstub.o From 17bd9597be45b96ae00716b0ae01a4d11bbee1ab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Oct 2017 17:14:44 +0200 Subject: [PATCH 11/29] nios2: define tcg_env This should be done by all targets; since commit 53f6672bcf ("gen-icount: use tcg_ctx.tcg_env instead of cpu_env", 2017-06-30), not doing so causes the NIOS2 target to hang. This is because the test for "should I exit to the main loop" was being done with the correct offset to the icount decrementer, but using TCG temporary 0 (the frame pointer) rather than the env pointer. 
Cc: qemu-stable@nongnu.org Cc: Marek Vasut Reported-by: Thomas Huth Signed-off-by: Paolo Bonzini --- target/nios2/translate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/nios2/translate.c b/target/nios2/translate.c index 6b0961837d..54fbe898df 100644 --- a/target/nios2/translate.c +++ b/target/nios2/translate.c @@ -948,6 +948,7 @@ void nios2_tcg_init(void) int i; cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env"); + tcg_ctx.tcg_env = cpu_env; for (i = 0; i < NUM_CORE_REGS; i++) { cpu_R[i] = tcg_global_mem_new(cpu_env, From b7ecba0f6f6427683b2dc609f4830535b9a271dd Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 12 Oct 2017 13:59:41 +0100 Subject: [PATCH 12/29] docs/devel/loads-stores.rst: Document our various load and store APIs QEMU has a wide selection of different functions for doing loads and stores; provide some overview documentation of what they do and how to pick which one to use. Signed-off-by: Peter Maydell Reviewed-by: Eric Blake Message-Id: <1507813181-11860-1-git-send-email-peter.maydell@linaro.org> Signed-off-by: Paolo Bonzini --- docs/devel/loads-stores.rst | 396 ++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 docs/devel/loads-stores.rst diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst new file mode 100644 index 0000000000..6a990cc243 --- /dev/null +++ b/docs/devel/loads-stores.rst @@ -0,0 +1,396 @@ +.. + Copyright (c) 2017 Linaro Limited + Written by Peter Maydell + +=================== +Load and Store APIs +=================== + +QEMU internally has multiple families of functions for performing +loads and stores. This document attempts to enumerate them all +and indicate when to use them. It does not provide detailed +documentation of each API -- for that you should look at the +documentation comments in the relevant header files. + + +``ld*_p and st*_p`` +~~~~~~~~~~~~~~~~~~~ + +These functions operate on a host pointer, and should be used +when you already have a pointer into host memory (corresponding +to guest ram or a local buffer). They deal with doing accesses +with the desired endianness and with correctly handling +potentially unaligned pointer values. + +Function names follow the pattern: + +load: ``ld{type}{sign}{size}_{endian}_p(ptr)`` + +store: ``st{type}{size}_{endian}_p(ptr, val)`` + +``type`` + - (empty) : integer access + - ``f`` : float access + +``sign`` + - (empty) : for 32 or 64 bit sizes (including floats and doubles) + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``endian`` + - ``he`` : host endian + - ``be`` : big endian + - ``le`` : little endian + +The ``_{endian}`` infix is omitted for target-endian accesses. + +The target endian accessors are only available to source +files which are built per-target. + +Regexes for git grep + - ``\`` + - ``\`` + +``cpu_{ld,st}_*`` +~~~~~~~~~~~~~~~~~ + +These functions operate on a guest virtual address. Be aware +that these functions may cause a guest CPU exception to be +taken (e.g. for an alignment fault or MMU fault) which will +result in guest CPU state being updated and control longjumping +out of the function call. They should therefore only be used +in code that is implementing emulation of the target CPU. + +These functions may throw an exception (longjmp() back out +to the top level TCG loop). 
This means they must only be used +from helper functions where the translator has saved all +necessary CPU state before generating the helper function call. +It's usually better to use the ``_ra`` variants described below +from helper functions, but these functions are the right choice +for calls made from hooks like the CPU do_interrupt hook or +when you know for certain that the translator had to save all +the CPU state that ``cpu_restore_state()`` would restore anyway. + +Function names follow the pattern: + +load: ``cpu_ld{sign}{size}_{mmusuffix}(env, ptr)`` + +store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``mmusuffix`` is one of the generic suffixes ``data`` or ``code``, or +(for softmmu configs) a target-specific MMU mode suffix as defined +in the target's ``cpu.h``. + +Regexes for git grep + - ``\`` + - ``\`` + +``cpu_{ld,st}_*_ra`` +~~~~~~~~~~~~~~~~~~~~ + +These functions work like the ``cpu_{ld,st}_*`` functions except +that they also take a ``retaddr`` argument. This extra argument +allows for correct unwinding of any exception that is taken, +and should generally be the result of GETPC() called directly +from the top level HELPER(foo) function (i.e. the return address +in the generated code). + +These are generally the preferred way to do accesses by guest +virtual address from helper functions; see the documentation +of the non-``_ra`` variants for when those would be better. + +Calling these functions with a ``retaddr`` argument of 0 is +equivalent to calling the non-``_ra`` version of the function. + +Function names follow the pattern: + +load: ``cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)`` + +store: ``cpu_st{sign}{size}_{mmusuffix}_ra(env, ptr, val, retaddr)`` + +Regexes for git grep + - ``\`` + - ``\`` + +``helper_*_{ld,st}*mmu`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions are intended primarily to be called by the code +generated by the TCG backend. They may also be called by target +CPU helper function code. Like the ``cpu_{ld,st}_*_ra`` functions +they perform accesses by guest virtual address; the difference is +that these functions allow you to specify an ``opindex`` parameter +which encodes (among other things) the mmu index to use for the +access. This is necessary if your helper needs to make an access +via a specific mmu index (for instance, an "always as non-privileged" +access) rather than using the default mmu index for the current state +of the guest CPU. + +The ``opindex`` parameter should be created by calling ``make_memop_idx()``. + +The ``retaddr`` parameter should be the result of GETPC() called directly +from the top level HELPER(foo) function (or 0 if no guest CPU state +unwinding is required). + +**TODO** The names of these functions are a bit odd for historical +reasons because they were originally expected to be called only from +within generated code. We should rename them to bring them +more in line with the other memory access functions. 
+ +load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)`` + +load (code): ``helper_{endian}_ld{sign}{size}_cmmu(env, addr, opindex, retaddr)`` + +store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``endian`` + - ``le`` : little endian + - ``be`` : big endian + - ``ret`` : target endianness + +Regexes for git grep + - ``\`` + - ``\`` + +``address_space_*`` +~~~~~~~~~~~~~~~~~~~ + +These functions are the primary ones to use when emulating CPU +or device memory accesses. They take an AddressSpace, which is the +way QEMU defines the view of memory that a device or CPU has. +(They generally correspond to being the "master" end of a hardware bus +or bus fabric.) + +Each CPU has an AddressSpace. Some kinds of CPU have more than +one AddressSpace (for instance ARM guest CPUs have an AddressSpace +for the Secure world and one for NonSecure if they implement TrustZone). +Devices which can do DMA-type operations should generally have an +AddressSpace. There is also a "system address space" which typically +has all the devices and memory that all CPUs can see. (Some older +device models use the "system address space" rather than properly +modelling that they have an AddressSpace of their own.) + +Functions are provided for doing byte-buffer reads and writes, +and also for doing one-data-item loads and stores. + +In all cases the caller provides a MemTxAttrs to specify bus +transaction attributes, and can check whether the memory transaction +succeeded using a MemTxResult return code. + +``address_space_read(address_space, addr, attrs, buf, len)`` + +``address_space_write(address_space, addr, attrs, buf, len)`` + +``address_space_rw(address_space, addr, attrs, buf, len, is_write)`` + +``address_space_ld{sign}{size}_{endian}(address_space, addr, attrs, txresult)`` + +``address_space_st{size}_{endian}(address_space, addr, val, attrs, txresult)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + +(No signed load operations are provided.) + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``endian`` + - ``le`` : little endian + - ``be`` : big endian + +The ``_{endian}`` suffix is omitted for byte accesses. + +Regexes for git grep + - ``\`` + - ``\`` + - ``\`` + +``{ld,st}*_phys`` +~~~~~~~~~~~~~~~~~ + +These are functions which are identical to +``address_space_{ld,st}*``, except that they always pass +``MEMTXATTRS_UNSPECIFIED`` for the transaction attributes, and ignore +whether the transaction succeeded or failed. + +The fact that they ignore whether the transaction succeeded means +they should not be used in new code, unless you know for certain +that your code will only be used in a context where the CPU or +device doing the access has no way to report such an error. + +``load: ld{sign}{size}_{endian}_phys`` + +``store: st{size}_{endian}_phys`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + +(No signed load operations are provided.) + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``endian`` + - ``le`` : little endian + - ``be`` : big endian + +The ``_{endian}_`` infix is omitted for byte accesses. 
+ +Regexes for git grep + - ``\`` + - ``\`` + +``cpu_physical_memory_*`` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +These are convenience functions which are identical to +``address_space_*`` but operate specifically on the system address space, +always pass a ``MEMTXATTRS_UNSPECIFIED`` set of memory attributes and +ignore whether the memory transaction succeeded or failed. +For new code they are better avoided: + +* there is likely to be behaviour you need to model correctly for a + failed read or write operation +* a device should usually perform operations on its own AddressSpace + rather than using the system address space + +``cpu_physical_memory_read`` + +``cpu_physical_memory_write`` + +``cpu_physical_memory_rw`` + +Regexes for git grep + - ``\`` + +``cpu_physical_memory_write_rom`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function performs a write by physical address like +``address_space_write``, except that if the write is to a ROM then +the ROM contents will be modified, even though a write by the guest +CPU to the ROM would be ignored. + +Note that unlike ``cpu_physical_memory_write()`` this function takes +an AddressSpace argument, but unlike ``address_space_write()`` this +function does not take a ``MemTxAttrs`` or return a ``MemTxResult``. + +**TODO**: we should probably clean up this inconsistency and +turn the function into ``address_space_write_rom`` with an API +matching ``address_space_write``. + +``cpu_physical_memory_write_rom`` + + +``cpu_memory_rw_debug`` +~~~~~~~~~~~~~~~~~~~~~~~ + +Access CPU memory by virtual address for debug purposes. + +This function is intended for use by the GDB stub and similar code. +It takes a virtual address, converts it to a physical address via +an MMU lookup using the current settings of the specified CPU, +and then performs the access (using ``address_space_rw`` for +reads or ``cpu_physical_memory_write_rom`` for writes). +This means that if the access is a write to a ROM then this +function will modify the contents (whereas a normal guest CPU access +would ignore the write attempt). + +``cpu_memory_rw_debug`` + +``dma_memory_*`` +~~~~~~~~~~~~~~~~ + +These behave like ``address_space_*``, except that they perform a DMA +barrier operation first. + +**TODO**: We should provide guidance on when you need the DMA +barrier operation and when it's OK to use ``address_space_*``, and +make sure our existing code is doing things correctly. + +``dma_memory_read`` + +``dma_memory_write`` + +``dma_memory_rw`` + +Regexes for git grep + - ``\`` + +``pci_dma_*`` and ``{ld,st}*_pci_dma`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions are specifically for PCI device models which need to +perform accesses where the PCI device is a bus master. You pass them a +``PCIDevice *`` and they will do ``dma_memory_*`` operations on the +correct address space for that device. + +``pci_dma_read`` + +``pci_dma_write`` + +``pci_dma_rw`` + +``load: ld{sign}{size}_{endian}_pci_dma`` + +``store: st{size}_{endian}_pci_dma`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + +(No signed load operations are provided.) + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +``endian`` + - ``le`` : little endian + - ``be`` : big endian + +The ``_{endian}_`` infix is omitted for byte accesses. 
+ +Regexes for git grep + - ``\`` + - ``\`` + - ``\`` From 6a24f34e5c0bf83c5a31015242c94185c95c2554 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Oct 2017 15:54:08 +0200 Subject: [PATCH 13/29] tco: add trace events Add trace events to the PCH watchdog timer; they can be useful to see how the guest is using it. Signed-off-by: Paolo Bonzini Message-Id: <1507816448-86665-1-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- hw/acpi/tco.c | 11 +++++++++-- hw/acpi/trace-events | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/hw/acpi/tco.c b/hw/acpi/tco.c index 05b9d7ba36..a9143963ab 100644 --- a/hw/acpi/tco.c +++ b/hw/acpi/tco.c @@ -12,6 +12,7 @@ #include "hw/i386/ich9.h" #include "hw/acpi/tco.h" +#include "trace.h" //#define DEBUG @@ -41,8 +42,11 @@ enum { static inline void tco_timer_reload(TCOIORegs *tr) { - tr->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - ((int64_t)(tr->tco.tmr & TCO_TMR_MASK) * TCO_TICK_NSEC); + int ticks = tr->tco.tmr & TCO_TMR_MASK; + int64_t nsec = (int64_t)ticks * TCO_TICK_NSEC; + + trace_tco_timer_reload(ticks, nsec / 1000000); + tr->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + nsec; timer_mod(tr->tco_timer, tr->expire_time); } @@ -59,6 +63,9 @@ static void tco_timer_expired(void *opaque) ICH9LPCState *lpc = container_of(pm, ICH9LPCState, pm); uint32_t gcs = pci_get_long(lpc->chip_config + ICH9_CC_GCS); + trace_tco_timer_expired(tr->timeouts_no, + lpc->pin_strap.spkr_hi, + !!(gcs & ICH9_CC_GCS_NO_REBOOT)); tr->tco.rld = 0; tr->tco.sts1 |= TCO_TIMEOUT; if (++tr->timeouts_no == 2) { diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events index e3b41e9df4..df0024f8b2 100644 --- a/hw/acpi/trace-events +++ b/hw/acpi/trace-events @@ -30,3 +30,7 @@ cpuhp_acpi_ejecting_invalid_cpu(uint32_t idx) "0x%"PRIx32 cpuhp_acpi_ejecting_cpu(uint32_t idx) "0x%"PRIx32 cpuhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "idx[0x%"PRIx32"] OST EVENT: 0x%"PRIx32 cpuhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "idx[0x%"PRIx32"] OST STATUS: 0x%"PRIx32 + +# hw/acpi/tco.c +tco_timer_reload(int ticks, int msec) "ticks=%d (%d ms)" +tco_timer_expired(int timeouts_no, bool strap, bool no_reboot) "timeouts_no=%d no_reboot=%d/%d" From e3af7c788b73a6495eb9d94992ef11f6ad6f3c56 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Apr 2017 13:59:34 +0200 Subject: [PATCH 14/29] target/i386: introduce x86_ld*_code These take care of advancing s->pc, and will provide a unified point at which to check for the 15-byte instruction length limit. 
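For instance, once every instruction byte is fetched through these helpers, the limit check could later be folded into advance_pc() roughly as follows (an illustrative sketch only; the guard shown here is not part of this patch, and the pc_start field is hypothetical):

  static uint64_t advance_pc(CPUX86State *env, DisasContext *s, int num_bytes)
  {
      uint64_t pc = s->pc;

      s->pc += num_bytes;
      /* x86 instructions cannot be longer than 15 bytes; once the
       * translator records the instruction start, an overlong
       * instruction could be rejected right here, e.g.:
       *   if (s->pc - s->pc_start > 15) {
       *       gen_exception(s, EXCP0D_GPF, ...);
       *   }
       */
      return pc;
  }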
Signed-off-by: Paolo Bonzini --- target/i386/translate.c | 228 ++++++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 103 deletions(-) diff --git a/target/i386/translate.c b/target/i386/translate.c index 5d61fa96ad..4a938c21a0 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -1863,6 +1863,41 @@ static void gen_shifti(DisasContext *s1, int op, TCGMemOp ot, int d, int c) } } +static uint64_t advance_pc(CPUX86State *env, DisasContext *s, int num_bytes) +{ + uint64_t pc = s->pc; + + s->pc += num_bytes; + return pc; +} + +static inline uint8_t x86_ldub_code(CPUX86State *env, DisasContext *s) +{ + return cpu_ldub_code(env, advance_pc(env, s, 1)); +} + +static inline int16_t x86_ldsw_code(CPUX86State *env, DisasContext *s) +{ + return cpu_ldsw_code(env, advance_pc(env, s, 2)); +} + +static inline uint16_t x86_lduw_code(CPUX86State *env, DisasContext *s) +{ + return cpu_lduw_code(env, advance_pc(env, s, 2)); +} + +static inline uint32_t x86_ldl_code(CPUX86State *env, DisasContext *s) +{ + return cpu_ldl_code(env, advance_pc(env, s, 4)); +} + +#ifdef TARGET_X86_64 +static inline uint64_t x86_ldq_code(CPUX86State *env, DisasContext *s) +{ + return cpu_ldq_code(env, advance_pc(env, s, 8)); +} +#endif + /* Decompose an address. */ typedef struct AddressParts { @@ -1900,7 +1935,7 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, case MO_32: havesib = 0; if (rm == 4) { - int code = cpu_ldub_code(env, s->pc++); + int code = x86_ldub_code(env, s); scale = (code >> 6) & 3; index = ((code >> 3) & 7) | REX_X(s); if (index == 4) { @@ -1914,8 +1949,7 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, case 0: if ((base & 7) == 5) { base = -1; - disp = (int32_t)cpu_ldl_code(env, s->pc); - s->pc += 4; + disp = (int32_t)x86_ldl_code(env, s); if (CODE64(s) && !havesib) { base = -2; disp += s->pc + s->rip_offset; @@ -1923,12 +1957,11 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, } break; case 1: - disp = (int8_t)cpu_ldub_code(env, s->pc++); + disp = (int8_t)x86_ldub_code(env, s); break; default: case 2: - disp = (int32_t)cpu_ldl_code(env, s->pc); - s->pc += 4; + disp = (int32_t)x86_ldl_code(env, s); break; } @@ -1945,15 +1978,13 @@ static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, if (mod == 0) { if (rm == 6) { base = -1; - disp = cpu_lduw_code(env, s->pc); - s->pc += 2; + disp = x86_lduw_code(env, s); break; } } else if (mod == 1) { - disp = (int8_t)cpu_ldub_code(env, s->pc++); + disp = (int8_t)x86_ldub_code(env, s); } else { - disp = (int16_t)cpu_lduw_code(env, s->pc); - s->pc += 2; + disp = (int16_t)x86_lduw_code(env, s); } switch (rm) { @@ -2103,19 +2134,16 @@ static inline uint32_t insn_get(CPUX86State *env, DisasContext *s, TCGMemOp ot) switch (ot) { case MO_8: - ret = cpu_ldub_code(env, s->pc); - s->pc++; + ret = x86_ldub_code(env, s); break; case MO_16: - ret = cpu_lduw_code(env, s->pc); - s->pc += 2; + ret = x86_lduw_code(env, s); break; case MO_32: #ifdef TARGET_X86_64 case MO_64: #endif - ret = cpu_ldl_code(env, s->pc); - s->pc += 4; + ret = x86_ldl_code(env, s); break; default: tcg_abort(); @@ -3041,7 +3069,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_helper_enter_mmx(cpu_env); } - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7); if (is_xmm) reg |= rex_r; @@ -3250,8 +3278,8 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (b1 == 1 && reg != 0) goto illegal_op; - field_length = 
cpu_ldub_code(env, s->pc++) & 0x3F; - bit_index = cpu_ldub_code(env, s->pc++) & 0x3F; + field_length = x86_ldub_code(env, s) & 0x3F; + bit_index = x86_ldub_code(env, s) & 0x3F; tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[reg])); if (b1 == 1) @@ -3380,7 +3408,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (b1 >= 2) { goto unknown_op; } - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (is_xmm) { tcg_gen_movi_tl(cpu_T0, val); tcg_gen_st32_tl(cpu_T0, cpu_env, offsetof(CPUX86State,xmm_t0.ZMM_L(0))); @@ -3537,7 +3565,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x1c4: s->rip_offset = 1; gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0); - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (b1) { val &= 7; tcg_gen_st16_tl(cpu_T0, cpu_env, @@ -3553,7 +3581,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod != 3) goto illegal_op; ot = mo_64_32(s->dflag); - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (b1) { val &= 7; rm = (modrm & 7) | REX_B(s); @@ -3616,7 +3644,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if ((b & 0xf0) == 0xf0) { goto do_0f_38_fx; } - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); rm = modrm & 7; reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; @@ -3693,7 +3721,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, do_0f_38_fx: /* Various integer extensions at 0f 38 f[0-f]. */ b = modrm | (b1 << 8); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; switch (b) { @@ -4054,7 +4082,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x03a: case 0x13a: b = modrm; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); rm = modrm & 7; reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; @@ -4077,7 +4105,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod != 3) gen_lea_modrm(env, s, modrm); reg = ((modrm >> 3) & 7) | rex_r; - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); switch (b) { case 0x14: /* pextrb */ tcg_gen_ld8u_tl(cpu_T0, cpu_env, offsetof(CPUX86State, @@ -4225,7 +4253,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_ldq_env_A0(s, op2_offset); } } - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ set_cc_op(s, CC_OP_EFLAGS); @@ -4244,7 +4272,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x33a: /* Various integer extensions at 0f 3a f[0-f]. */ b = modrm | (b1 << 8); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; switch (b) { @@ -4256,7 +4284,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } ot = mo_64_32(s->dflag); gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - b = cpu_ldub_code(env, s->pc++); + b = x86_ldub_code(env, s); if (ot == MO_64) { tcg_gen_rotri_tl(cpu_T0, cpu_T0, b & 63); } else { @@ -4351,7 +4379,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } switch(b) { case 0x0f: /* 3DNow! 
data insns */ - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); sse_fn_epp = sse_op_table5[val]; if (!sse_fn_epp) { goto unknown_op; @@ -4365,7 +4393,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x70: /* pshufx insn */ case 0xc6: /* pshufx insn */ - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); /* XXX: introduce a new table? */ @@ -4374,7 +4402,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0xc2: /* compare insns */ - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (val >= 8) goto unknown_op; sse_fn_epp = sse_op_table4[val][b1]; @@ -4443,8 +4471,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (s->pc - pc_start > 14) { goto illegal_op; } - b = cpu_ldub_code(env, s->pc); - s->pc++; + b = x86_ldub_code(env, s); /* Collect prefixes. */ switch (b) { case 0xf3: @@ -4501,7 +4528,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) static const int pp_prefix[4] = { 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ }; - int vex3, vex2 = cpu_ldub_code(env, s->pc); + int vex3, vex2 = x86_ldub_code(env, s); if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) { /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b, @@ -4523,17 +4550,17 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) rex_r = (~vex2 >> 4) & 8; if (b == 0xc5) { vex3 = vex2; - b = cpu_ldub_code(env, s->pc++); + b = x86_ldub_code(env, s); } else { #ifdef TARGET_X86_64 s->rex_x = (~vex2 >> 3) & 8; s->rex_b = (~vex2 >> 2) & 8; #endif - vex3 = cpu_ldub_code(env, s->pc++); + vex3 = x86_ldub_code(env, s); rex_w = (vex3 >> 7) & 1; switch (vex2 & 0x1f) { case 0x01: /* Implied 0f leading opcode bytes. */ - b = cpu_ldub_code(env, s->pc++) | 0x100; + b = x86_ldub_code(env, s) | 0x100; break; case 0x02: /* Implied 0f 38 leading opcode bytes. 
*/ b = 0x138; @@ -4585,7 +4612,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x0f: /**************************/ /* extended op code */ - b = cpu_ldub_code(env, s->pc++) | 0x100; + b = x86_ldub_code(env, s) | 0x100; goto reswitch; /**************************/ @@ -4607,7 +4634,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) switch(f) { case 0: /* OP Ev, Gv */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); @@ -4628,7 +4655,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_op(s, op, ot, opreg); break; case 1: /* OP Gv, Ev */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; reg = ((modrm >> 3) & 7) | rex_r; rm = (modrm & 7) | REX_B(s); @@ -4662,7 +4689,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); op = (modrm >> 3) & 7; @@ -4708,7 +4735,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xf7: ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); op = (modrm >> 3) & 7; @@ -4940,7 +4967,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xff: /* GRP5 */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); op = (modrm >> 3) & 7; @@ -5048,7 +5075,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x85: ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); @@ -5120,7 +5147,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x69: /* imul Gv, Ev, I */ case 0x6b: ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; if (b == 0x69) s->rip_offset = insn_const_size(ot); @@ -5172,7 +5199,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x1c0: case 0x1c1: /* xadd Ev, Gv */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; gen_op_mov_v_reg(ot, cpu_T0, reg); @@ -5204,7 +5231,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) TCGv oldv, newv, cmpv; ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; oldv = tcg_temp_new(); @@ -5256,7 +5283,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) } break; case 0x1c7: /* cmpxchg8b */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; if ((mod == 3) || ((modrm & 0x38) != 0x8)) goto illegal_op; @@ -5318,7 +5345,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_push_v(s, cpu_T0); break; case 0x8f: /* pop Ev */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; ot = gen_pop_T0(s); if (mod == 3) { @@ -5337,9 +5364,8 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xc8: /* enter */ { int level; - val = cpu_lduw_code(env, s->pc); - s->pc += 
2; - level = cpu_ldub_code(env, s->pc++); + val = x86_lduw_code(env, s); + level = x86_ldub_code(env, s); gen_enter(s, val, level); } break; @@ -5396,7 +5422,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x88: case 0x89: /* mov Gv, Ev */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; /* generate a generic store */ @@ -5405,7 +5431,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xc6: case 0xc7: /* mov Ev, Iv */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; if (mod != 3) { s->rip_offset = insn_const_size(ot); @@ -5422,14 +5448,14 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x8a: case 0x8b: /* mov Ev, Gv */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); gen_op_mov_reg_v(ot, reg, cpu_T0); break; case 0x8e: /* mov seg, Gv */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = (modrm >> 3) & 7; if (reg >= 6 || reg == R_CS) goto illegal_op; @@ -5447,7 +5473,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) } break; case 0x8c: /* mov Gv, seg */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = (modrm >> 3) & 7; mod = (modrm >> 6) & 3; if (reg >= 6) @@ -5472,7 +5498,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) /* s_ot is the sign+size of source */ s_ot = b & 8 ? MO_SIGN | ot : ot; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); @@ -5508,7 +5534,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) break; case 0x8d: /* lea */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; if (mod == 3) goto illegal_op; @@ -5532,8 +5558,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) switch (s->aflag) { #ifdef TARGET_X86_64 case MO_64: - offset_addr = cpu_ldq_code(env, s->pc); - s->pc += 8; + offset_addr = x86_ldq_code(env, s); break; #endif default: @@ -5570,8 +5595,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (dflag == MO_64) { uint64_t tmp; /* 64 bit case */ - tmp = cpu_ldq_code(env, s->pc); - s->pc += 8; + tmp = x86_ldq_code(env, s); reg = (b & 7) | REX_B(s); tcg_gen_movi_tl(cpu_T0, tmp); gen_op_mov_reg_v(MO_64, reg, cpu_T0); @@ -5595,7 +5619,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x86: case 0x87: /* xchg Ev, Gv */ ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; if (mod == 3) { @@ -5632,7 +5656,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) op = R_GS; do_lxx: ot = dflag != MO_16 ? 
MO_32 : MO_16; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; if (mod == 3) @@ -5660,7 +5684,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) grp2: { ot = mo_b_d(b, dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; op = (modrm >> 3) & 7; @@ -5679,7 +5703,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_shift(s, op, ot, opreg, OR_ECX); } else { if (shift == 2) { - shift = cpu_ldub_code(env, s->pc++); + shift = x86_ldub_code(env, s); } gen_shifti(s, op, ot, opreg, shift); } @@ -5713,7 +5737,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) shift = 0; do_shiftd: ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); reg = ((modrm >> 3) & 7) | rex_r; @@ -5726,7 +5750,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_op_mov_v_reg(ot, cpu_T1, reg); if (shift) { - TCGv imm = tcg_const_tl(cpu_ldub_code(env, s->pc++)); + TCGv imm = tcg_const_tl(x86_ldub_code(env, s)); gen_shiftd_rm_T1(s, ot, opreg, op, imm); tcg_temp_free(imm); } else { @@ -5743,7 +5767,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); break; } - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = modrm & 7; op = ((b & 7) << 3) | ((modrm >> 3) & 7); @@ -6328,7 +6352,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xe4: case 0xe5: ot = mo_b_d32(b, dflag); - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); tcg_gen_movi_tl(cpu_T0, val); gen_check_io(s, ot, pc_start - s->cs_base, SVM_IOIO_TYPE_MASK | svm_is_rep(prefixes)); @@ -6347,7 +6371,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xe6: case 0xe7: ot = mo_b_d32(b, dflag); - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); tcg_gen_movi_tl(cpu_T0, val); gen_check_io(s, ot, pc_start - s->cs_base, svm_is_rep(prefixes)); @@ -6407,8 +6431,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) /************************/ /* control */ case 0xc2: /* ret im */ - val = cpu_ldsw_code(env, s->pc); - s->pc += 2; + val = x86_ldsw_code(env, s); ot = gen_pop_T0(s); gen_stack_update(s, val + (1 << ot)); /* Note that gen_pop_T0 uses a zero-extending load. */ @@ -6425,8 +6448,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_jr(s, cpu_T0); break; case 0xca: /* lret im */ - val = cpu_ldsw_code(env, s->pc); - s->pc += 2; + val = x86_ldsw_code(env, s); do_lret: if (s->pe && !s->vm86) { gen_update_cc_op(s); @@ -6563,7 +6585,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) break; case 0x190 ... 
0x19f: /* setcc Gv */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); gen_setcc1(s, b, cpu_T0); gen_ldst_modrm(env, s, modrm, MO_8, OR_TMP0, 1); break; @@ -6572,7 +6594,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) goto illegal_op; } ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; gen_cmovcc1(env, s, ot, b, modrm, reg); break; @@ -6689,7 +6711,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) /* bit operations */ case 0x1ba: /* bt/bts/btr/btc Gv, im */ ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); op = (modrm >> 3) & 7; mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); @@ -6703,7 +6725,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_op_mov_v_reg(ot, cpu_T0, rm); } /* load shift */ - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); tcg_gen_movi_tl(cpu_T1, val); if (op < 4) goto unknown_op; @@ -6722,7 +6744,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) op = 3; do_btx: ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); @@ -6827,7 +6849,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0x1bc: /* bsf / tzcnt */ case 0x1bd: /* bsr / lzcnt */ ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); gen_extu(ot, cpu_T0); @@ -6907,7 +6929,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xd4: /* aam */ if (CODE64(s)) goto illegal_op; - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (val == 0) { gen_exception(s, EXCP00_DIVZ, pc_start - s->cs_base); } else { @@ -6918,7 +6940,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) case 0xd5: /* aad */ if (CODE64(s)) goto illegal_op; - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); gen_helper_aad(cpu_env, tcg_const_i32(val)); set_cc_op(s, CC_OP_LOGICB); break; @@ -6952,7 +6974,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_interrupt(s, EXCP03_INT3, pc_start - s->cs_base, s->pc - s->cs_base); break; case 0xcd: /* int N */ - val = cpu_ldub_code(env, s->pc++); + val = x86_ldub_code(env, s); if (s->vm86 && s->iopl != 3) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); } else { @@ -7007,7 +7029,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (CODE64(s)) goto illegal_op; ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = (modrm >> 3) & 7; mod = (modrm >> 6) & 3; if (mod == 3) @@ -7186,7 +7208,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) } break; case 0x100: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; op = (modrm >> 3) & 7; switch(op) { @@ -7251,7 +7273,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) break; case 0x101: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); switch (modrm) { CASE_MODRM_MEM_OP(0): /* sgdt */ gen_svm_check_intercept(s, pc_start, SVM_EXIT_GDTR_READ); @@ -7596,7 +7618,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) /* d_ot is the size of destination */ d_ot = dflag; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) 
& 7) | rex_r; mod = (modrm >> 6) & 3; rm = (modrm & 7) | REX_B(s); @@ -7625,7 +7647,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) t1 = tcg_temp_local_new(); t2 = tcg_temp_local_new(); ot = MO_16; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = (modrm >> 3) & 7; mod = (modrm >> 6) & 3; rm = modrm & 7; @@ -7670,7 +7692,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (!s->pe || s->vm86) goto illegal_op; ot = dflag != MO_16 ? MO_32 : MO_16; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0); t0 = tcg_temp_local_new(); @@ -7690,7 +7712,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) } break; case 0x118: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; op = (modrm >> 3) & 7; switch(op) { @@ -7709,7 +7731,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) } break; case 0x11a: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); if (s->flags & HF_MPX_EN_MASK) { mod = (modrm >> 6) & 3; reg = ((modrm >> 3) & 7) | rex_r; @@ -7799,7 +7821,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_nop_modrm(env, s, modrm); break; case 0x11b: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); if (s->flags & HF_MPX_EN_MASK) { mod = (modrm >> 6) & 3; reg = ((modrm >> 3) & 7) | rex_r; @@ -7901,7 +7923,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_nop_modrm(env, s, modrm); break; case 0x119: case 0x11c ... 0x11f: /* nop (multi byte) */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); gen_nop_modrm(env, s, modrm); break; case 0x120: /* mov reg, crN */ @@ -7909,7 +7931,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); } else { - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); /* Ignore the mod bits (assume (modrm&0xc0)==0xc0). * AMD documentation (24594.pdf) and testing of * intel 386 and 486 processors all show that the mod bits @@ -7966,7 +7988,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (s->cpl != 0) { gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); } else { - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); /* Ignore the mod bits (assume (modrm&0xc0)==0xc0). * AMD documentation (24594.pdf) and testing of * intel 386 and 486 processors all show that the mod bits @@ -8012,7 +8034,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (!(s->cpuid_features & CPUID_SSE2)) goto illegal_op; ot = mo_64_32(dflag); - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; if (mod == 3) goto illegal_op; @@ -8021,7 +8043,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) gen_ldst_modrm(env, s, modrm, ot, reg, 1); break; case 0x1ae: - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); switch (modrm) { CASE_MODRM_MEM_OP(0): /* fxsave */ if (!(s->cpuid_features & CPUID_FXSR) @@ -8219,7 +8241,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) break; case 0x10d: /* 3DNow! 
prefetch(w) */ - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; if (mod == 3) goto illegal_op; @@ -8241,7 +8263,7 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) goto illegal_op; - modrm = cpu_ldub_code(env, s->pc++); + modrm = x86_ldub_code(env, s); reg = ((modrm >> 3) & 7) | rex_r; if (s->prefix & PREFIX_DATA) { From b066c5375737ad0d630196dab2a2b329515a1d00 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Mar 2017 11:57:10 +0100 Subject: [PATCH 15/29] target/i386: trap on instructions longer than 15 bytes Besides being more correct, arbitrarily long instructions allow the generation of a translation block that spans three pages. This confuses the generator and even allows ring 3 code to poison the translation block cache and inject code into other processes that are in guest ring 3. This is an improved (and more invasive) fix for commit 30663fd ("tcg/i386: Check the size of instruction being translated", 2017-03-24). In addition to being more precise (and generating the right exception, which is #GP rather than #UD), it distinguishes better between page faults and too-long instructions, as shown by this test case: #include <stdio.h> #include <string.h> #include <sys/mman.h> int main() { char *x = mmap(NULL, 8192, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0); memset(x, 0x66, 4096); x[4096] = 0x90; x[4097] = 0xc3; char *i = x + 4096 - 15; mprotect(x + 4096, 4096, PROT_READ|PROT_WRITE); ((void(*)(void)) i) (); } ... which produces a #GP without the mprotect, and a #PF with it. Signed-off-by: Paolo Bonzini --- target/i386/translate.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/target/i386/translate.c b/target/i386/translate.c index 4a938c21a0..5f24a2de3c 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -136,6 +136,7 @@ typedef struct DisasContext { int cpuid_ext3_features; int cpuid_7_0_ebx_features; int cpuid_xsave_features; + sigjmp_buf jmpbuf; } DisasContext; static void gen_eob(DisasContext *s); @@ -1863,11 +1864,27 @@ static void gen_shifti(DisasContext *s1, int op, TCGMemOp ot, int d, int c) } } +#define X86_MAX_INSN_LENGTH 15 + static uint64_t advance_pc(CPUX86State *env, DisasContext *s, int num_bytes) { uint64_t pc = s->pc; s->pc += num_bytes; + if (unlikely(s->pc - s->pc_start > X86_MAX_INSN_LENGTH)) { + /* If the instruction's 16th byte is on a different page than the 1st, a + * page fault on the second page wins over the general protection fault + * caused by the instruction being too long. + * This can happen even if the operand is only one byte long! + */ + if (((s->pc - 1) ^ (pc - 1)) & TARGET_PAGE_MASK) { + volatile uint8_t unused = + cpu_ldub_code(env, (s->pc - 1) & TARGET_PAGE_MASK); + (void) unused; + } + siglongjmp(s->jmpbuf, 1); + } + return pc; } @@ -4463,14 +4480,12 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu) s->rip_offset = 0; /* for relative ip address */ s->vex_l = 0; s->vex_v = 0; - next_byte: - /* x86 has an upper limit of 15 bytes for an instruction. Since we - * do not want to decode and generate IR for an illegal - * instruction, the following check limits the instruction size to - * 25 bytes: 14 prefix + 1 opc + 6 (modrm+sib+ofs) + 4 imm */ - if (s->pc - pc_start > 14) { - goto illegal_op; + if (sigsetjmp(s->jmpbuf, 0) != 0) { + gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base); + return s->pc; } + + next_byte: b = x86_ldub_code(env, s); /* Collect prefixes.
*/ switch (b) { From ae990e6cd77a4e6004b7abc6d293598910abca63 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:42:56 +0200 Subject: [PATCH 16/29] memory: call log_start after region_add It might be confusing for some listener implementations that implement both region_add and log_start (e.g. KVM) if we call log_start before the actual region has been added using region_add. This makes current KVM code trigger an assertion ("kvm_section_update_flags: error finding slot"). So let's just reverse the order instead of tolerating log_start on yet-unknown regions. Reported-by: Thomas Huth Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-2-david@redhat.com> Tested-by: Joe Clifford Signed-off-by: Paolo Bonzini --- memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/memory.c b/memory.c index b637c12bad..3e1558a031 100644 --- a/memory.c +++ b/memory.c @@ -2607,12 +2607,12 @@ static void listener_add_address_space(MemoryListener *listener, .offset_within_address_space = int128_get64(fr->addr.start), .readonly = fr->readonly, }; - if (fr->dirty_log_mask && listener->log_start) { - listener->log_start(listener, &section, 0, fr->dirty_log_mask); - } if (listener->region_add) { listener->region_add(listener, &section); } + if (fr->dirty_log_mask && listener->log_start) { + listener->log_start(listener, &section, 0, fr->dirty_log_mask); + } } if (listener->commit) { listener->commit(listener); From bbfd3017eb0ae59fe799e67046914dd1f94a9767 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:42:57 +0200 Subject: [PATCH 17/29] kvm: fix alignment of ram address Fix the wrong calculation of the delta used to align the ram address. The bug only strikes when alignment actually has to be done. Reported-by: Joe Clifford Fixes: 5ea69c2e3614 ("kvm: factor out alignment of memory section") Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-3-david@redhat.com> Tested-by: Joe Clifford Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 90c88b517d..fae1eca983 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -717,8 +717,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, return; } + /* use aligned delta to align the ram address */ ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + - (section->offset_within_address_space - start_addr); + (start_addr - section->offset_within_address_space); mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!add) { From e377e87ca6a3d3d4f79bc2fdf2ad3283d39d4104 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:42:58 +0200 Subject: [PATCH 18/29] kvm: tolerate non-existing slot for log_start/log_stop/log_sync If we want to trap every access to a section, we might not have a slot for it. So let's just tolerate a missing slot.
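[Editor's illustration, not part of the patch: the tolerant lookup pattern this change adopts, reusing the two helpers visible in the hunk below. A section whose accesses are all meant to trap was never installed as a KVM memslot, so an empty lookup is expected rather than fatal.]

    /* Sketch only: finding no slot means every access already traps,
     * so there are no dirty-logging flags to update. */
    static int kvm_update_flags_sketch(KVMMemoryListener *kml,
                                       MemoryRegionSection *section,
                                       hwaddr start_addr, hwaddr size)
    {
        KVMSlot *mem = kvm_lookup_matching_slot(kml, start_addr, size);

        if (!mem) {
            return 0;   /* no slot: nothing to do */
        }
        return kvm_slot_update_flags(kml, mem, section->mr);
    }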
Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-4-david@redhat.com> Tested-by: Joe Clifford Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index fae1eca983..f5fa3e24bd 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -394,8 +394,8 @@ static int kvm_section_update_flags(KVMMemoryListener *kml, mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!mem) { - fprintf(stderr, "%s: error finding slot\n", __func__); - abort(); + /* We don't have a slot if we want to trap every access. */ + return 0; } return kvm_slot_update_flags(kml, mem, section->mr); @@ -470,8 +470,8 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, if (size) { mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!mem) { - fprintf(stderr, "%s: error finding slot\n", __func__); - abort(); + /* We don't have a slot if we want to trap every access. */ + return 0; } /* XXX bad kernel interface alert From 1c4fdabaf734f6519fdef670df14285e491ef52c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:42:59 +0200 Subject: [PATCH 19/29] kvm: fix error message when failing to unregister slot "overlapping" is a leftover, let's drop it. Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-5-david@redhat.com> Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index f5fa3e24bd..559c544501 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -734,7 +734,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, mem->memory_size = 0; err = kvm_set_user_memory_region(kml, mem); if (err) { - fprintf(stderr, "%s: error unregistering overlapping slot: %s\n", + fprintf(stderr, "%s: error unregistering slot: %s\n", __func__, strerror(-err)); abort(); } From 90ed4bcc3a15749c3c341e2a684f8f84e2251b67 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:43:00 +0200 Subject: [PATCH 20/29] kvm: region_add and region_del are not called on updates Attributes are never updated in place via region_add()/region_del(): an attribute change leads to a delete first, followed by a new add. If this were ever not the case, we would get an error when trying to register the new slot.
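[Editor's illustration, not part of the patch: what such an "update" looks like from the listener's point of view. The calls below are real memory-API functions; the comment restates the claim made in the commit message above.]

    /* Sketch only: flipping a region attribute regenerates the flat
     * view, and the view diff is reported to listeners as a removal of
     * the old range followed by an insertion of the new one, i.e.
     * region_del(old) then region_add(new), never an in-place update. */
    static void make_region_readonly(MemoryRegion *mr)
    {
        memory_region_transaction_begin();
        memory_region_set_readonly(mr, true);
        memory_region_transaction_commit();   /* listeners fire here */
    }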
Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-6-david@redhat.com> Tested-by: Joe Clifford Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 559c544501..2835bb3801 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -721,8 +721,8 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + (start_addr - section->offset_within_address_space); - mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!add) { + mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!mem) { return; } @@ -741,12 +741,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, return; } - if (mem) { - /* update the slot */ - kvm_slot_update_flags(kml, mem, mr); - return; - } - /* register the new slot */ mem = kvm_alloc_slot(kml); mem->memory_size = size; From a6ffc4232ab649ea91bd951f8c4f9cc598a66fd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:43:01 +0200 Subject: [PATCH 21/29] kvm: simplify kvm_align_section() Use ROUND_UP and simplify the code a bit. Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-7-david@redhat.com> Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 2835bb3801..f290f487a5 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -197,26 +197,20 @@ static hwaddr kvm_align_section(MemoryRegionSection *section, hwaddr *start) { hwaddr size = int128_get64(section->size); - hwaddr delta; - - *start = section->offset_within_address_space; + hwaddr delta, aligned; /* kvm works in page size chunks, but the function may be called with sub-page size and unaligned start address. Pad the start address to next and truncate size to previous page boundary. */ - delta = qemu_real_host_page_size - (*start & ~qemu_real_host_page_mask); - delta &= ~qemu_real_host_page_mask; - *start += delta; + aligned = ROUND_UP(section->offset_within_address_space, + qemu_real_host_page_size); + delta = aligned - section->offset_within_address_space; + *start = aligned; if (delta > size) { return 0; } - size -= delta; - size &= qemu_real_host_page_mask; - if (*start & ~qemu_real_host_page_mask) { - return 0; - } - return size; + return (size - delta) & qemu_real_host_page_mask; } int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, From 279836f8190fd9d6428324414ee802c38c09fbc5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Oct 2017 16:43:02 +0200 Subject: [PATCH 22/29] memory: reuse section_from_flat_range() We can use section_from_flat_range() instead of manually initializing the MemoryRegionSection.
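[Editor's note: for readers without the tree at hand, the reused helper presumably just packages the same FlatRange fields that the open-coded initializer, deleted in the hunk below, used to fill in. A sketch under that assumption, not copied from the QEMU source:]

    /* Sketch of section_from_flat_range(), inferred from the fields
     * the removed initializer set. */
    static inline MemoryRegionSection
    section_from_flat_range(FlatRange *fr, FlatView *fv)
    {
        return (MemoryRegionSection) {
            .mr = fr->mr,
            .fv = fv,
            .offset_within_region = fr->offset_in_region,
            .size = fr->addr.size,
            .offset_within_address_space = int128_get64(fr->addr.start),
            .readonly = fr->readonly,
        };
    }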
Signed-off-by: David Hildenbrand Message-Id: <20171016144302.24284-8-david@redhat.com> Signed-off-by: Paolo Bonzini --- memory.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/memory.c b/memory.c index 3e1558a031..e26e5a3b1d 100644 --- a/memory.c +++ b/memory.c @@ -2599,14 +2599,8 @@ static void listener_add_address_space(MemoryListener *listener, view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { - MemoryRegionSection section = { - .mr = fr->mr, - .fv = view, - .offset_within_region = fr->offset_in_region, - .size = fr->addr.size, - .offset_within_address_space = int128_get64(fr->addr.start), - .readonly = fr->readonly, - }; + MemoryRegionSection section = section_from_flat_range(fr, view); + if (listener->region_add) { listener->region_add(listener, &section); } From ad52878f97610757390148fe5d5b4cc5ad15c585 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Fri, 13 Oct 2017 11:19:13 -0700 Subject: [PATCH 23/29] notdirty_mem_write: implement 8-byte accesses Aligned 8-byte memory writes by a 64-bit target on a 64-bit host should always turn into atomic 8-byte writes on the host; however, if we missed in the softmmu and the TLB line was marked as not dirty, we would end up tearing the 8-byte write into two 4-byte writes in access_with_adjusted_size(). Signed-off-by: Andrew Baumann Message-Id: <20171013181913.7556-1-Andrew.Baumann@microsoft.com> Signed-off-by: Paolo Bonzini --- exec.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/exec.c b/exec.c index 6579e7ac4f..b58bc4eff7 100644 --- a/exec.c +++ b/exec.c @@ -2376,6 +2376,9 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr, case 4: stl_p(qemu_map_ram_ptr(NULL, ram_addr), val); break; + case 8: + stq_p(qemu_map_ram_ptr(NULL, ram_addr), val); + break; default: abort(); } @@ -2406,6 +2409,16 @@ static const MemoryRegionOps notdirty_mem_ops = { .write = notdirty_mem_write, .valid.accepts = notdirty_mem_accepts, .endianness = DEVICE_NATIVE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, }; /* Generate a debug exception if a watchpoint has been hit. */ From 306526b5de6984a164548572fd04d898dd6adbaa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 17 Oct 2017 14:16:05 +0200 Subject: [PATCH 24/29] watch_mem_write: implement 8-byte accesses Aligned 8-byte memory writes by a 64-bit target on a 64-bit host should always turn into atomic 8-byte writes on the host; however, a write watchpoint would end up tearing the 8-byte write into two 4-byte writes in access_with_adjusted_size().
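[Editor's illustration, not QEMU code: why bumping max_access_size in .valid/.impl matters. A dispatcher that clamps each access to the declared maximum turns one 8-byte store into two 4-byte stores, which are no longer atomic as a pair:]

    #include <stdint.h>
    #include <string.h>

    /* Sketch of size-adjusted dispatch: with max_access_size = 4, an
     * 8-byte write is split into two separate host stores, losing
     * atomicity (little-endian host assumed for the byte order). */
    static void write_with_adjusted_size(uint8_t *dst, uint64_t val,
                                         unsigned size,
                                         unsigned max_access_size)
    {
        unsigned access_size = size < max_access_size ? size : max_access_size;
        unsigned i;

        for (i = 0; i < size; i += access_size) {
            uint64_t chunk = val >> (i * 8);
            memcpy(dst + i, &chunk, access_size);  /* one store per chunk */
        }
    }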
Reported-by: Andrew Baumann Signed-off-by: Paolo Bonzini --- exec.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/exec.c b/exec.c index b58bc4eff7..db5ae23118 100644 --- a/exec.c +++ b/exec.c @@ -2503,6 +2503,9 @@ static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata, case 4: data = address_space_ldl(as, addr, attrs, &res); break; + case 8: + data = address_space_ldq(as, addr, attrs, &res); + break; default: abort(); } *pdata = data; @@ -2528,6 +2531,9 @@ static MemTxResult watch_mem_write(void *opaque, hwaddr addr, case 4: address_space_stl(as, addr, val, attrs, &res); break; + case 8: + address_space_stq(as, addr, val, attrs, &res); + break; default: abort(); } return res; @@ -2537,6 +2543,16 @@ static const MemoryRegionOps watch_mem_ops = { .read_with_attrs = watch_mem_read, .write_with_attrs = watch_mem_write, .endianness = DEVICE_NATIVE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, }; static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, From b3f1c8c413bc83e4a2cc7a63e4eddf9fe6449052 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 17 Oct 2017 20:11:58 +0200 Subject: [PATCH 25/29] qemu-pr-helper: use new libmultipath API libmultipath has recently changed its API; the new API supports multi-threaded clients better. Unfortunately there is no backwards compatibility, so we just switch to the new one. Running QEMU compiled against the new library on a system with the old library will likely crash, while doing the opposite will cause QEMU not to start at all (because udev, get_multipath_config and put_multipath_config are undefined). Signed-off-by: Paolo Bonzini --- configure | 12 ++++++++++-- scsi/qemu-pr-helper.c | 17 ++++++++++++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 6587e8014b..7766e74125 100755 --- a/configure +++ b/configure @@ -3312,9 +3312,17 @@ if test "$mpath" != "no" ; then #include <mpath_persist.h> unsigned mpath_mx_alloc_len = 1024; int logsink; +static struct config *multipath_conf; +extern struct udev *udev; +extern struct config *get_multipath_config(void); +extern void put_multipath_config(struct config *conf); +struct udev *udev; +struct config *get_multipath_config(void) { return multipath_conf; } +void put_multipath_config(struct config *conf) { } + int main(void) { - struct udev *udev = udev_new(); - mpath_lib_init(udev); + udev = udev_new(); + multipath_conf = mpath_lib_init(); return 0; } EOF diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index d58184833f..dd9785143b 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -276,15 +276,26 @@ static void dm_init(void) /* Variables required by libmultipath and libmpathpersist.
*/ QEMU_BUILD_BUG_ON(PR_HELPER_DATA_SIZE > MPATH_MAX_PARAM_LEN); +static struct config *multipath_conf; unsigned mpath_mx_alloc_len = PR_HELPER_DATA_SIZE; int logsink; +struct udev *udev; + +extern struct config *get_multipath_config(void); +struct config *get_multipath_config(void) +{ + return multipath_conf; +} + +extern void put_multipath_config(struct config *conf); +void put_multipath_config(struct config *conf) +{ +} static void multipath_pr_init(void) { - static struct udev *udev; - udev = udev_new(); - mpath_lib_init(udev); + multipath_conf = mpath_lib_init(); } static int is_mpath(int fd) From 04162f8f4bcf8c9ae2422def4357289b44208c8c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 16 Oct 2017 17:23:13 -0500 Subject: [PATCH 26/29] qdev: store DeviceState's canonical path to use when unparenting device_unparent(dev, ...) is called when a device is unparented, either directly or as a result of a parent device being finalized, and handles some final cleanup for the device. Part of this is emitting a DEVICE_DELETED QMP event to notify management, which carries the device's path in the composition tree as provided by object_get_canonical_path(). object_get_canonical_path() assumes the device is still connected to the machine/root container, and will assert otherwise, but in some situations this isn't the case: If the parent is finalized as a result of object_unparent(), it will still be attached to the composition tree at the time any children are unparented as a result of that same call to object_unparent(). However, in some cases, object_unparent() will complete without finalizing the parent device, due to lingering references that won't be released until some time later. One such example is if the parent has MemoryRegion children (which take a ref on their parent), which in turn have AddressSpaces (which take a ref on their regions), since those AddressSpaces get cleaned up asynchronously by the RCU thread. In this case qdev:device_unparent() may be called for a child Device that no longer has a path to the root/machine container, causing object_get_canonical_path() to assert. Fix this by storing the canonical path during realize() so the information will still be available for device_unparent() in such cases. Cc: Michael S. Tsirkin Cc: Paolo Bonzini Signed-off-by: Michael Roth Signed-off-by: Greg Kurz Signed-off-by: Michael Roth Tested-by: Eric Auger Reviewed-by: David Gibson Message-Id: <20171016222315.407-2-mdroth@linux.vnet.ibm.com> [Clear dev->canonical_path at the post_realize_fail label, which is cleaner. Suggested by David Gibson.
- Paolo] Signed-off-by: Paolo Bonzini --- hw/core/qdev.c | 17 ++++++++++++++--- include/hw/qdev-core.h | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 606ab53c42..0019a49503 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -928,6 +928,13 @@ static void device_set_realized(Object *obj, bool value, Error **errp) goto post_realize_fail; } + /* + * always free/re-initialize here since the value cannot be cleaned up + * in device_unrealize due to its usage later on in the unplug path + */ + g_free(dev->canonical_path); + dev->canonical_path = object_get_canonical_path(OBJECT(dev)); + if (qdev_get_vmsd(dev)) { if (vmstate_register_with_alias_id(dev, -1, qdev_get_vmsd(dev), dev, dev->instance_id_alias, @@ -984,6 +991,8 @@ child_realize_fail: } post_realize_fail: + g_free(dev->canonical_path); + dev->canonical_path = NULL; if (dc->unrealize) { dc->unrealize(dev, NULL); } @@ -1102,10 +1111,12 @@ static void device_unparent(Object *obj) /* Only send event if the device had been completely realized */ if (dev->pending_deleted_event) { - gchar *path = object_get_canonical_path(OBJECT(dev)); + g_assert(dev->canonical_path); - qapi_event_send_device_deleted(!!dev->id, dev->id, path, &error_abort); - g_free(path); + qapi_event_send_device_deleted(!!dev->id, dev->id, dev->canonical_path, + &error_abort); + g_free(dev->canonical_path); + dev->canonical_path = NULL; } qemu_opts_del(dev->opts); diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 089146197f..0a71bf83f0 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -153,6 +153,7 @@ struct DeviceState { /*< public >*/ const char *id; + char *canonical_path; bool realized; bool pending_deleted_event; QemuOpts *opts; From 2fc06c4ac65594ad248e9a9150ebdde9ff5a1253 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 16 Oct 2017 17:23:14 -0500 Subject: [PATCH 27/29] Revert "qdev: Free QemuOpts when the QOM path goes away" This reverts commit abed886ec60cf239a03515cf0b30fb11fa964c44. This patch originally addressed an issue where a DEVICE_DELETED event could be emitted (in device_unparent()) before a Device's QemuOpts were cleaned up (in device_finalize()), leading to a "duplicate ID" error if management attempted to immediately add a device with the same ID in response to the DEVICE_DELETED event. An alternative will be implemented in a subsequent patch where we defer the DEVICE_DELETED event until device_finalize(), which would also prevent the race, so we revert the original fix in preparation. 
Signed-off-by: Michael Roth Reviewed-by: Greg Kurz Tested-by: Eric Auger Reviewed-by: David Gibson Message-Id: <20171016222315.407-3-mdroth@linux.vnet.ibm.com> Signed-off-by: Paolo Bonzini --- hw/core/qdev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 0019a49503..418cdac863 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -1069,6 +1069,7 @@ static void device_finalize(Object *obj) NamedGPIOList *ngl, *next; DeviceState *dev = DEVICE(obj); + qemu_opts_del(dev->opts); QLIST_FOREACH_SAFE(ngl, &dev->gpios, node, next) { QLIST_REMOVE(ngl, node); @@ -1118,9 +1119,6 @@ static void device_unparent(Object *obj) g_free(dev->canonical_path); dev->canonical_path = NULL; } - - qemu_opts_del(dev->opts); - dev->opts = NULL; } static void device_class_init(ObjectClass *class, void *data) From f7b879e072ae6839b1b1d1312f48fa7f256397e2 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Mon, 16 Oct 2017 17:23:15 -0500 Subject: [PATCH 28/29] qdev: defer DEVICE_DEL event until instance_finalize() DEVICE_DEL is currently emitted when a Device is unparented, as opposed to when it is finalized. The main design motivation for this seems to be that after unparent()/unrealize(), the Device is no longer visible to the guest, and thus the operation is complete from the perspective of management. However, there are cases where remaining host-side cleanup is also pertinent to management. The is generally handled by treating these resources as aspects of the "backend", which can be managed via separate interfaces/events, such as blockdev_add/del, netdev_add/del, object_add/del, etc, but some devices do not have this level of compartmentalization, namely vfio-pci, and possibly to lend themselves well to it. In the case of vfio-pci, the "backend" cleanup happens as part of the finalization of the vfio-pci device itself, in particular the cleanup of the VFIO group FD. Failing to wait for this cleanup can result in tools like libvirt attempting to rebind the device to the host while it's still being used by VFIO, which can result in host crashes or other misbehavior depending on the host driver. Deferring DEVICE_DEL still affords us the ability to manage backends explicitly, while also addressing cases like vfio-pci's, so we implement that approach here. An alternative proposal involving having VFIO emit a separate event to denote completion of host-side cleanup was discussed, but the prevailing opinion seems to be that it is not worth the added complexity, and leaves the issue open for other Device implementations to solve in the future. 
Signed-off-by: Michael Roth Reviewed-by: Greg Kurz Tested-by: Eric Auger Reviewed-by: David Gibson Message-Id: <20171016222315.407-4-mdroth@linux.vnet.ibm.com> Signed-off-by: Paolo Bonzini --- hw/core/qdev.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 418cdac863..11112951a5 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -1069,7 +1069,6 @@ static void device_finalize(Object *obj) NamedGPIOList *ngl, *next; DeviceState *dev = DEVICE(obj); - qemu_opts_del(dev->opts); QLIST_FOREACH_SAFE(ngl, &dev->gpios, node, next) { QLIST_REMOVE(ngl, node); @@ -1080,6 +1079,18 @@ static void device_finalize(Object *obj) * here */ } + + /* Only send event if the device had been completely realized */ + if (dev->pending_deleted_event) { + g_assert(dev->canonical_path); + + qapi_event_send_device_deleted(!!dev->id, dev->id, dev->canonical_path, + &error_abort); + g_free(dev->canonical_path); + dev->canonical_path = NULL; + } + + qemu_opts_del(dev->opts); } static void device_class_base_init(ObjectClass *class, void *data) @@ -1109,16 +1120,6 @@ static void device_unparent(Object *obj) object_unref(OBJECT(dev->parent_bus)); dev->parent_bus = NULL; } - - /* Only send event if the device had been completely realized */ - if (dev->pending_deleted_event) { - g_assert(dev->canonical_path); - - qapi_event_send_device_deleted(!!dev->id, dev->id, dev->canonical_path, - &error_abort); - g_free(dev->canonical_path); - dev->canonical_path = NULL; - } } static void device_class_init(ObjectClass *class, void *data) From 3da023b5827543ee4c022986ea2ad9d1274410b2 Mon Sep 17 00:00:00 2001 From: Mark Kanda Date: Mon, 16 Oct 2017 15:17:04 -0500 Subject: [PATCH 29/29] scsi: reject configurations with logical block size > physical block size The logical block size of a SCSI disk should never be larger than its physical block size. From an ATA/SCSI perspective, it makes no sense to have the logical block size greater than the physical block size, and it cannot even be effectively expressed in the command set. The whole point of adding the physical block size to the ATA/SCSI command set was to communicate a desire for a larger block size (than logical), while maintaining backwards compatibility with the legacy 512-byte block size. When logical_block_size > physical_block_size is configured, QEMU cannot express it in the READ CAPACITY(16) output; all it can do is set the physical block exponent to 0 (i.e. logical_block_size == physical_block_size). Reporting a proper error, however, is better. Signed-off-by: Mark Kanda Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Martin K. Petersen Message-Id: <1508185024-5840-1-git-send-email-mark.kanda@oracle.com> Signed-off-by: Paolo Bonzini --- hw/scsi/scsi-disk.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index a518080e7d..12431177a7 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2347,6 +2347,14 @@ static void scsi_realize(SCSIDevice *dev, Error **errp) blkconf_serial(&s->qdev.conf, &s->serial); blkconf_blocksizes(&s->qdev.conf); + + if (s->qdev.conf.logical_block_size > + s->qdev.conf.physical_block_size) { + error_setg(errp, + "logical_block_size > physical_block_size not supported"); + return; + } + if (dev->type == TYPE_DISK) { blkconf_geometry(&dev->conf, NULL, 65535, 255, 255, &err); if (err) {