Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

virtio, pci, pc: fixes, features

Bugfixes all over the place.
HMAT support.
New flags for vhost-user-blk utility.
Auto-tuning of seg max for virtio storage.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Mon 06 Jan 2020 17:05:05 GMT
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* remotes/mst/tags/for_upstream: (32 commits)
  intel_iommu: add present bit check for pasid table entries
  intel_iommu: a fix to vtd_find_as_from_bus_num()
  virtio-net: delete also control queue when TX/RX deleted
  virtio: reset region cache when on queue deletion
  virtio-mmio: update queue size on guest write
  tests: add virtio-scsi and virtio-blk seg_max_adjust test
  virtio: make seg_max virtqueue size dependent
  hw: fix using 4.2 compat in 5.0 machine types for i440fx/q35
  vhost-user-scsi: reset the device if supported
  vhost-user: add VHOST_USER_RESET_DEVICE to reset devices
  hw/pci/pci_host: Let pci_data_[read/write] use unsigned 'size' argument
  hw/pci/pci_host: Remove redundant PCI_DPRINTF()
  virtio-mmio: Clear v2 transport state on soft reset
  ACPI: add expected files for HMAT tests (acpihmat)
  tests/bios-tables-test: add test cases for ACPI HMAT
  tests/numa: Add case for QMP build HMAT
  hmat acpi: Build Memory Side Cache Information Structure(s)
  hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
  hmat acpi: Build Memory Proximity Domain Attributes Structure(s)
  numa: Extend CLI to provide memory side cache information
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Peter Maydell  2020-01-07 16:25:00 +00:00
commit 973d306dd6
45 changed files with 1796 additions and 133 deletions

@ -576,70 +576,90 @@ vub_new(char *blk_file)
return vdev_blk;
}
static int opt_fdnum = -1;
static char *opt_socket_path;
static char *opt_blk_file;
static gboolean opt_print_caps;
static gboolean opt_read_only;
static GOptionEntry entries[] = {
{ "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps,
"Print capabilities", NULL },
{ "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum,
"Use inherited fd socket", "FDNUM" },
{ "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path,
"Use UNIX socket path", "PATH" },
{"blk-file", 'b', 0, G_OPTION_ARG_FILENAME, &opt_blk_file,
"block device or file path", "PATH"},
{ "read-only", 'r', 0, G_OPTION_ARG_NONE, &opt_read_only,
"Enable read-only", NULL }
};
int main(int argc, char **argv)
{
int opt;
char *unix_socket = NULL;
char *blk_file = NULL;
bool enable_ro = false;
int lsock = -1, csock = -1;
VubDev *vdev_blk = NULL;
GError *error = NULL;
GOptionContext *context;
while ((opt = getopt(argc, argv, "b:rs:h")) != -1) {
switch (opt) {
case 'b':
blk_file = g_strdup(optarg);
break;
case 's':
unix_socket = g_strdup(optarg);
break;
case 'r':
enable_ro = true;
break;
case 'h':
default:
printf("Usage: %s [ -b block device or file, -s UNIX domain socket"
" | -r Enable read-only ] | [ -h ]\n", argv[0]);
return 0;
context = g_option_context_new(NULL);
g_option_context_add_main_entries(context, entries, NULL);
if (!g_option_context_parse(context, &argc, &argv, &error)) {
g_printerr("Option parsing failed: %s\n", error->message);
exit(EXIT_FAILURE);
}
if (opt_print_caps) {
g_print("{\n");
g_print(" \"type\": \"block\",\n");
g_print(" \"features\": [\n");
g_print(" \"read-only\",\n");
g_print(" \"blk-file\"\n");
g_print(" ]\n");
g_print("}\n");
exit(EXIT_SUCCESS);
}
if (!opt_blk_file) {
g_print("%s\n", g_option_context_get_help(context, true, NULL));
exit(EXIT_FAILURE);
}
if (opt_socket_path) {
lsock = unix_sock_new(opt_socket_path);
if (lsock < 0) {
exit(EXIT_FAILURE);
}
} else if (opt_fdnum < 0) {
g_print("%s\n", g_option_context_get_help(context, true, NULL));
exit(EXIT_FAILURE);
} else {
lsock = opt_fdnum;
}
if (!unix_socket || !blk_file) {
printf("Usage: %s [ -b block device or file, -s UNIX domain socket"
" | -r Enable read-only ] | [ -h ]\n", argv[0]);
return -1;
}
lsock = unix_sock_new(unix_socket);
if (lsock < 0) {
goto err;
}
csock = accept(lsock, (void *)0, (void *)0);
csock = accept(lsock, NULL, NULL);
if (csock < 0) {
fprintf(stderr, "Accept error %s\n", strerror(errno));
goto err;
g_printerr("Accept error %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
vdev_blk = vub_new(blk_file);
vdev_blk = vub_new(opt_blk_file);
if (!vdev_blk) {
goto err;
exit(EXIT_FAILURE);
}
if (enable_ro) {
if (opt_read_only) {
vdev_blk->enable_ro = true;
}
if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock,
vub_panic_cb, &vub_iface)) {
fprintf(stderr, "Failed to initialized libvhost-user-glib\n");
goto err;
g_printerr("Failed to initialize libvhost-user-glib\n");
exit(EXIT_FAILURE);
}
g_main_loop_run(vdev_blk->loop);
g_main_loop_unref(vdev_blk->loop);
g_option_context_free(context);
vug_deinit(&vdev_blk->parent);
err:
vub_free(vdev_blk);
if (csock >= 0) {
close(csock);
@ -647,8 +667,8 @@ err:
if (lsock >= 0) {
close(lsock);
}
g_free(unix_socket);
g_free(blk_file);
g_free(opt_socket_path);
g_free(opt_blk_file);
return 0;
}


@ -54,6 +54,37 @@
]
}
##
# @VHostUserBackendBlockFeature:
#
# List of vhost user "block" features.
#
# @read-only: The --read-only command line option is supported.
# @blk-file: The --blk-file command line option is supported.
#
# Since: 5.0
##
{
'enum': 'VHostUserBackendBlockFeature',
'data': [ 'read-only', 'blk-file' ]
}
##
# @VHostUserBackendCapabilitiesBlock:
#
# Capabilities reported by vhost user "block" backends
#
# @features: list of supported features.
#
# Since: 5.0
##
{
'struct': 'VHostUserBackendCapabilitiesBlock',
'data': {
'features': [ 'VHostUserBackendBlockFeature' ]
}
}
##
# @VHostUserBackendInputFeature:
#


@ -785,6 +785,7 @@ Protocol features
#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
#define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13
Master message types
--------------------
@ -1190,6 +1191,20 @@ Master message types
ancillary data. The GPU protocol is used to inform the master of
rendering state and updates. See vhost-user-gpu.rst for details.
``VHOST_USER_RESET_DEVICE``
:id: 34
:equivalent ioctl: N/A
:master payload: N/A
:slave payload: N/A
Ask the vhost user backend to disable all rings and reset all
internal device state to the initial state, ready to be
reinitialized. The backend retains ownership of the device
throughout the reset operation.
Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol
feature is set by the backend.
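
For illustration only (this sketch is neither part of the patch nor of the
vhost-user specification), a backend that advertises
``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` could service the message roughly as
below. Every type and helper name is a made-up stand-in; the code only
mirrors the spec text above: all rings are disabled, internal state returns
to its initial values, and the backend keeps ownership of the device.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* All names here are hypothetical; none of them exist in QEMU. */
    struct example_ring {
        bool enabled;
    };

    struct example_dev {
        struct example_ring rings[8];
        uint64_t acked_features;   /* feature bits negotiated so far */
        bool configured;
    };

    static void example_handle_reset_device(struct example_dev *dev)
    {
        size_t i;

        for (i = 0; i < sizeof(dev->rings) / sizeof(dev->rings[0]); i++) {
            dev->rings[i].enabled = false;   /* disable all rings */
        }
        dev->acked_features = 0;             /* back to the initial state */
        dev->configured = false;             /* ready to be reinitialized */
        /* Ownership is retained: no memory regions or fds are handed back. */
    }
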
Slave message types
-------------------
@ -1376,3 +1391,20 @@ Command line options:
Enable virgl rendering support.
(optional)
vhost-user-blk
--------------
Command line options:
--blk-file=PATH
Specify block device or file path.
(optional)
--read-only
Enable read-only.
(optional)


@ -7,6 +7,7 @@ config ACPI_X86
select ACPI_NVDIMM
select ACPI_CPU_HOTPLUG
select ACPI_MEMORY_HOTPLUG
select ACPI_HMAT
config ACPI_X86_ICH
bool
@ -23,6 +24,10 @@ config ACPI_NVDIMM
bool
depends on ACPI
config ACPI_HMAT
bool
depends on ACPI
config ACPI_PCI
bool
depends on ACPI && PCI
@ -33,5 +38,3 @@ config ACPI_VMGENID
depends on PC
config ACPI_HW_REDUCED
bool
depends on ACPI


@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o
common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o
common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
common-obj-$(call lnot,$(CONFIG_PC)) += acpi-x86-stub.o

hw/acpi/hmat.c (new file, 268 lines)

@ -0,0 +1,268 @@
/*
* HMAT ACPI Implementation
*
* Copyright(C) 2019 Intel Corporation.
*
* Author:
* Liu jingqi <jingqi.liu@linux.intel.com>
* Tao Xu <tao3.xu@intel.com>
*
* HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table
* (HMAT)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>
*/
#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/numa.h"
#include "hw/acpi/hmat.h"
/*
* ACPI 6.3:
* 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145
*/
static void build_hmat_mpda(GArray *table_data, uint16_t flags,
uint32_t initiator, uint32_t mem_node)
{
/* Memory Proximity Domain Attributes Structure */
/* Type */
build_append_int_noprefix(table_data, 0, 2);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/* Length */
build_append_int_noprefix(table_data, 40, 4);
/* Flags */
build_append_int_noprefix(table_data, flags, 2);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/* Proximity Domain for the Attached Initiator */
build_append_int_noprefix(table_data, initiator, 4);
/* Proximity Domain for the Memory */
build_append_int_noprefix(table_data, mem_node, 4);
/* Reserved */
build_append_int_noprefix(table_data, 0, 4);
/*
* Reserved:
* Previously defined as the Start Address of the System Physical
* Address Range. Deprecated since ACPI Spec 6.3.
*/
build_append_int_noprefix(table_data, 0, 8);
/*
* Reserved:
* Previously defined as the Range Length of the region in bytes.
* Deprecated since ACPI Spec 6.3.
*/
build_append_int_noprefix(table_data, 0, 8);
}
/*
* ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information
* Structure: Table 5-146
*/
static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
uint32_t num_initiator, uint32_t num_target,
uint32_t *initiator_list)
{
int i, index;
HMAT_LB_Data *lb_data;
uint16_t *entry_list;
uint32_t base;
/* Length in bytes for entire structure */
uint32_t lb_length
= 32 /* Table length upto and including Entry Base Unit */
+ 4 * num_initiator /* Initiator Proximity Domain List */
+ 4 * num_target /* Target Proximity Domain List */
+ 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */
/* Type */
build_append_int_noprefix(table_data, 1, 2);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/* Length */
build_append_int_noprefix(table_data, lb_length, 4);
/* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */
assert(!(hmat_lb->hierarchy >> 4));
build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1);
/* Data Type */
build_append_int_noprefix(table_data, hmat_lb->data_type, 1);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/* Number of Initiator Proximity Domains (s) */
build_append_int_noprefix(table_data, num_initiator, 4);
/* Number of Target Proximity Domains (t) */
build_append_int_noprefix(table_data, num_target, 4);
/* Reserved */
build_append_int_noprefix(table_data, 0, 4);
/* Entry Base Unit */
if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) {
/* Convert latency base from nanoseconds to picosecond */
base = hmat_lb->base * 1000;
} else {
/* Convert bandwidth base from Byte to Megabyte */
base = hmat_lb->base / MiB;
}
build_append_int_noprefix(table_data, base, 8);
/* Initiator Proximity Domain List */
for (i = 0; i < num_initiator; i++) {
build_append_int_noprefix(table_data, initiator_list[i], 4);
}
/* Target Proximity Domain List */
for (i = 0; i < num_target; i++) {
build_append_int_noprefix(table_data, i, 4);
}
/* Latency or Bandwidth Entries */
entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t));
for (i = 0; i < hmat_lb->list->len; i++) {
lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
index = lb_data->initiator * num_target + lb_data->target;
entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base);
}
for (i = 0; i < num_initiator * num_target; i++) {
build_append_int_noprefix(table_data, entry_list[i], 2);
}
g_free(entry_list);
}
/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */
static void build_hmat_cache(GArray *table_data, uint8_t total_levels,
NumaHmatCacheOptions *hmat_cache)
{
/*
* Cache Attributes: Bits [3:0] Total Cache Levels
* for this Memory Proximity Domain
*/
uint32_t cache_attr = total_levels;
/* Bits [7:4] : Cache Level described in this structure */
cache_attr |= (uint32_t) hmat_cache->level << 4;
/* Bits [11:8] - Cache Associativity */
cache_attr |= (uint32_t) hmat_cache->associativity << 8;
/* Bits [15:12] - Write Policy */
cache_attr |= (uint32_t) hmat_cache->policy << 12;
/* Bits [31:16] - Cache Line size in bytes */
cache_attr |= (uint32_t) hmat_cache->line << 16;
/* Type */
build_append_int_noprefix(table_data, 2, 2);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/* Length */
build_append_int_noprefix(table_data, 32, 4);
/* Proximity Domain for the Memory */
build_append_int_noprefix(table_data, hmat_cache->node_id, 4);
/* Reserved */
build_append_int_noprefix(table_data, 0, 4);
/* Memory Side Cache Size */
build_append_int_noprefix(table_data, hmat_cache->size, 8);
/* Cache Attributes */
build_append_int_noprefix(table_data, cache_attr, 4);
/* Reserved */
build_append_int_noprefix(table_data, 0, 2);
/*
* Number of SMBIOS handles (n)
* Linux kernel uses Memory Side Cache Information Structure
* without SMBIOS entries for now, so set Number of SMBIOS handles
* as 0.
*/
build_append_int_noprefix(table_data, 0, 2);
}
/* Build HMAT sub table structures */
static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state)
{
uint16_t flags;
uint32_t num_initiator = 0;
uint32_t initiator_list[MAX_NODES];
int i, hierarchy, type, cache_level, total_levels;
HMAT_LB_Info *hmat_lb;
NumaHmatCacheOptions *hmat_cache;
for (i = 0; i < numa_state->num_nodes; i++) {
flags = 0;
if (numa_state->nodes[i].initiator < MAX_NODES) {
flags |= HMAT_PROXIMITY_INITIATOR_VALID;
}
build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i);
}
for (i = 0; i < numa_state->num_nodes; i++) {
if (numa_state->nodes[i].has_cpu) {
initiator_list[num_initiator++] = i;
}
}
/*
* ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information
* Structure: Table 5-146
*/
for (hierarchy = HMAT_LB_MEM_MEMORY;
hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) {
for (type = HMAT_LB_DATA_ACCESS_LATENCY;
type <= HMAT_LB_DATA_WRITE_BANDWIDTH; type++) {
hmat_lb = numa_state->hmat_lb[hierarchy][type];
if (hmat_lb && hmat_lb->list->len) {
build_hmat_lb(table_data, hmat_lb, num_initiator,
numa_state->num_nodes, initiator_list);
}
}
}
/*
* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure:
* Table 5-147
*/
for (i = 0; i < numa_state->num_nodes; i++) {
total_levels = 0;
for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) {
if (numa_state->hmat_cache[i][cache_level]) {
total_levels++;
}
}
for (cache_level = 0; cache_level <= total_levels; cache_level++) {
hmat_cache = numa_state->hmat_cache[i][cache_level];
if (hmat_cache) {
build_hmat_cache(table_data, total_levels, hmat_cache);
}
}
}
}
void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state)
{
int hmat_start = table_data->len;
/* reserve space for HMAT header */
acpi_data_push(table_data, 40);
hmat_build_table_structs(table_data, numa_state);
build_header(linker, table_data,
(void *)(table_data->data + hmat_start),
"HMAT", table_data->len - hmat_start, 2, NULL, NULL);
}
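
As an editorial aside (not part of hmat.c), the Cache Attributes packing in
build_hmat_cache() above can be sanity-checked with a tiny standalone
program. The field values below are arbitrary examples, chosen only to show
where each field lands in the 32-bit word:

    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        /* Example inputs; illustrative values, not from any real config. */
        uint32_t total_levels  = 1;    /* bits [3:0]                       */
        uint32_t level         = 1;    /* bits [7:4]                       */
        uint32_t associativity = 1;    /* bits [11:8]                      */
        uint32_t policy        = 1;    /* bits [15:12]                     */
        uint32_t line          = 64;   /* bits [31:16], line size in bytes */

        uint32_t cache_attr = total_levels;
        cache_attr |= level << 4;
        cache_attr |= associativity << 8;
        cache_attr |= policy << 12;
        cache_attr |= line << 16;

        assert(cache_attr == 0x00401111);
        printf("cache_attr = 0x%08" PRIx32 "\n", cache_attr);
        return 0;
    }
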

hw/acpi/hmat.h (new file, 42 lines)

@ -0,0 +1,42 @@
/*
* HMAT ACPI Implementation Header
*
* Copyright(C) 2019 Intel Corporation.
*
* Author:
* Liu jingqi <jingqi.liu@linux.intel.com>
* Tao Xu <tao3.xu@intel.com>
*
* HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table
* (HMAT)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>
*/
#ifndef HMAT_H
#define HMAT_H
#include "hw/acpi/aml-build.h"
/*
* ACPI 6.3: 5.2.27.3 Memory Proximity Domain Attributes Structure,
* Table 5-145, Field "flag", Bit [0]: set to 1 to indicate that data in
* the Proximity Domain for the Attached Initiator field is valid.
* Other bits reserved.
*/
#define HMAT_PROXIMITY_INITIATOR_VALID 0x1
void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state);
#endif


@ -764,13 +764,16 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
{
VirtIOBlockReq *req;
MultiReqBuffer mrb = {};
bool suppress_notifications = virtio_queue_get_notification(vq);
bool progress = false;
aio_context_acquire(blk_get_aio_context(s->blk));
blk_io_plug(s->blk);
do {
virtio_queue_set_notification(vq, 0);
if (suppress_notifications) {
virtio_queue_set_notification(vq, 0);
}
while ((req = virtio_blk_get_request(s, vq))) {
progress = true;
@ -781,7 +784,9 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
}
}
virtio_queue_set_notification(vq, 1);
if (suppress_notifications) {
virtio_queue_set_notification(vq, 1);
}
} while (!virtio_queue_empty(vq));
if (mrb.num_reqs) {
@ -908,7 +913,8 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
blk_get_geometry(s->blk, &capacity);
memset(&blkcfg, 0, sizeof(blkcfg));
virtio_stq_p(vdev, &blkcfg.capacity, capacity);
virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
virtio_stl_p(vdev, &blkcfg.seg_max,
s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
@ -1133,6 +1139,11 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
error_setg(errp, "num-queues property must be larger than 0");
return;
}
if (conf->queue_size <= 2) {
error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
"must be > 2", conf->queue_size);
return;
}
if (!is_power_of_2(conf->queue_size) ||
conf->queue_size > VIRTQUEUE_MAX_SIZE) {
error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
@ -1262,6 +1273,7 @@ static Property virtio_blk_properties[] = {
true),
DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
IOThread *),
DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features,
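
To make the seg_max change concrete: two queue entries are held back for the
request header and completion/status descriptors, so the default queue-size
of 128 still reports 126, a queue-size of 256 now reports 254, and machine
types that turn seg-max-adjust off (as hw_compat_4_2 does) keep the old fixed
value. A standalone sketch of the arithmetic follows; the helper name is
invented, not a QEMU function:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Illustrative only: the seg_max value that virtio-blk (queue-size) and
     * virtio-scsi (virtqueue_size) report after this series.
     */
    static uint32_t example_seg_max(bool seg_max_adjust, uint16_t queue_size)
    {
        return seg_max_adjust ? (uint32_t)queue_size - 2 : 128 - 2;
    }

    int main(void)
    {
        assert(example_seg_max(true, 128) == 126);   /* default queue size  */
        assert(example_seg_max(true, 256) == 254);   /* tracks queue size   */
        assert(example_seg_max(false, 256) == 126);  /* 4.2 compat machines */
        return 0;
    }
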


@ -1126,9 +1126,17 @@ static void virtio_serial_device_unrealize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOSerial *vser = VIRTIO_SERIAL(dev);
int i;
QLIST_REMOVE(vser, next);
virtio_delete_queue(vser->c_ivq);
virtio_delete_queue(vser->c_ovq);
for (i = 0; i < vser->bus.max_nr_ports; i++) {
virtio_delete_queue(vser->ivqs[i]);
virtio_delete_queue(vser->ovqs[i]);
}
g_free(vser->ivqs);
g_free(vser->ovqs);
g_free(vser->ports_map);


@ -29,11 +29,15 @@
GlobalProperty hw_compat_4_2[] = {
{ "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
{ "virtio-blk-device", "seg-max-adjust", "off"},
{ "virtio-scsi-device", "seg_max_adjust", "off"},
{ "vhost-blk-device", "seg_max_adjust", "off"},
};
const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2);
GlobalProperty hw_compat_4_1[] = {
{ "virtio-pci", "x-pcie-flr-init", "off" },
{ "virtio-device", "use-disabled-flag", "false" },
};
const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1);
@ -429,6 +433,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp)
ms->nvdimms_state->is_enabled = value;
}
static bool machine_get_hmat(Object *obj, Error **errp)
{
MachineState *ms = MACHINE(obj);
return ms->numa_state->hmat_enabled;
}
static void machine_set_hmat(Object *obj, bool value, Error **errp)
{
MachineState *ms = MACHINE(obj);
ms->numa_state->hmat_enabled = value;
}
static char *machine_get_nvdimm_persistence(Object *obj, Error **errp)
{
MachineState *ms = MACHINE(obj);
@ -556,6 +574,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
const CpuInstanceProperties *props, Error **errp)
{
MachineClass *mc = MACHINE_GET_CLASS(machine);
NodeInfo *numa_info = machine->numa_state->nodes;
bool match = false;
int i;
@ -625,6 +644,17 @@ void machine_set_cpu_numa_node(MachineState *machine,
match = true;
slot->props.node_id = props->node_id;
slot->props.has_node_id = props->has_node_id;
if (machine->numa_state->hmat_enabled) {
if ((numa_info[props->node_id].initiator < MAX_NODES) &&
(props->node_id != numa_info[props->node_id].initiator)) {
error_setg(errp, "The initiator of CPU NUMA node %" PRId64
" should be itself", props->node_id);
return;
}
numa_info[props->node_id].has_cpu = true;
numa_info[props->node_id].initiator = props->node_id;
}
}
if (!match) {
@ -845,6 +875,13 @@ static void machine_initfn(Object *obj)
if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) {
ms->numa_state = g_new0(NumaState, 1);
object_property_add_bool(obj, "hmat",
machine_get_hmat, machine_set_hmat,
&error_abort);
object_property_set_description(obj, "hmat",
"Set on/off to enable/disable "
"ACPI Heterogeneous Memory Attribute "
"Table (HMAT)", NULL);
}
/* Register notifier when init is done for sysbus sanity checks */
@ -912,6 +949,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu)
return g_string_free(s, false);
}
static void numa_validate_initiator(NumaState *numa_state)
{
int i;
NodeInfo *numa_info = numa_state->nodes;
for (i = 0; i < numa_state->num_nodes; i++) {
if (numa_info[i].initiator == MAX_NODES) {
error_report("The initiator of NUMA node %d is missing, use "
"'-numa node,initiator' option to declare it", i);
exit(1);
}
if (!numa_info[numa_info[i].initiator].present) {
error_report("NUMA node %" PRIu16 " is missing, use "
"'-numa node' option to declare it first",
numa_info[i].initiator);
exit(1);
}
if (!numa_info[numa_info[i].initiator].has_cpu) {
error_report("The initiator of NUMA node %d is invalid", i);
exit(1);
}
}
}
static void machine_numa_finish_cpu_init(MachineState *machine)
{
int i;
@ -952,6 +1015,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine)
machine_set_cpu_numa_node(machine, &props, &error_fatal);
}
}
if (machine->numa_state->hmat_enabled) {
numa_validate_initiator(machine->numa_state);
}
if (s->len && !qtest_enabled()) {
warn_report("CPU(s) not present in any NUMA nodes: %s",
s->str);


@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
@ -129,6 +130,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
/*
* If not set the initiator, set it to MAX_NODES. And if
* HMAT is enabled and this node has no cpus, QEMU will raise error.
*/
numa_info[nodenr].initiator = MAX_NODES;
if (node->has_initiator) {
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
if (node->initiator >= MAX_NODES) {
error_report("The initiator id %" PRIu16 " expects an integer "
"between 0 and %d", node->initiator,
MAX_NODES - 1);
return;
}
numa_info[nodenr].initiator = node->initiator;
}
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
ms->numa_state->num_nodes++;
@ -171,6 +195,253 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
ms->numa_state->have_numa_distance = true;
}
void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
Error **errp)
{
int i, first_bit, last_bit;
uint64_t max_entry, temp_base, bitmap_copy;
NodeInfo *numa_info = numa_state->nodes;
HMAT_LB_Info *hmat_lb =
numa_state->hmat_lb[node->hierarchy][node->data_type];
HMAT_LB_Data lb_data = {};
HMAT_LB_Data *lb_temp;
/* Error checking */
if (node->initiator > numa_state->num_nodes) {
error_setg(errp, "Invalid initiator=%d, it should be less than %d",
node->initiator, numa_state->num_nodes);
return;
}
if (node->target > numa_state->num_nodes) {
error_setg(errp, "Invalid target=%d, it should be less than %d",
node->target, numa_state->num_nodes);
return;
}
if (!numa_info[node->initiator].has_cpu) {
error_setg(errp, "Invalid initiator=%d, it isn't an "
"initiator proximity domain", node->initiator);
return;
}
if (!numa_info[node->target].present) {
error_setg(errp, "The target=%d should point to an existing node",
node->target);
return;
}
if (!hmat_lb) {
hmat_lb = g_malloc0(sizeof(*hmat_lb));
numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data));
}
hmat_lb->hierarchy = node->hierarchy;
hmat_lb->data_type = node->data_type;
lb_data.initiator = node->initiator;
lb_data.target = node->target;
if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
/* Input latency data */
if (!node->has_latency) {
error_setg(errp, "Missing 'latency' option");
return;
}
if (node->has_bandwidth) {
error_setg(errp, "Invalid option 'bandwidth' since "
"the data type is latency");
return;
}
/* Detect duplicate configuration */
for (i = 0; i < hmat_lb->list->len; i++) {
lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
if (node->initiator == lb_temp->initiator &&
node->target == lb_temp->target) {
error_setg(errp, "Duplicate configuration of the latency for "
"initiator=%d and target=%d", node->initiator,
node->target);
return;
}
}
hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX;
if (node->latency) {
/* Calculate the temporary base and compressed latency */
max_entry = node->latency;
temp_base = 1;
while (QEMU_IS_ALIGNED(max_entry, 10)) {
max_entry /= 10;
temp_base *= 10;
}
/* Calculate the max compressed latency */
temp_base = MIN(hmat_lb->base, temp_base);
max_entry = node->latency / hmat_lb->base;
max_entry = MAX(hmat_lb->range_bitmap, max_entry);
/*
* For latency hmat_lb->range_bitmap record the max compressed
* latency which should be less than 0xFFFF (UINT16_MAX)
*/
if (max_entry >= UINT16_MAX) {
error_setg(errp, "Latency %" PRIu64 " between initiator=%d and "
"target=%d should not differ from previously entered "
"min or max values on more than %d", node->latency,
node->initiator, node->target, UINT16_MAX - 1);
return;
} else {
hmat_lb->base = temp_base;
hmat_lb->range_bitmap = max_entry;
}
/*
* Set lb_info_provided bit 0 as 1,
* latency information is provided
*/
numa_info[node->target].lb_info_provided |= BIT(0);
}
lb_data.data = node->latency;
} else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
/* Input bandwidth data */
if (!node->has_bandwidth) {
error_setg(errp, "Missing 'bandwidth' option");
return;
}
if (node->has_latency) {
error_setg(errp, "Invalid option 'latency' since "
"the data type is bandwidth");
return;
}
if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
"target=%d should be 1MB aligned", node->bandwidth,
node->initiator, node->target);
return;
}
/* Detect duplicate configuration */
for (i = 0; i < hmat_lb->list->len; i++) {
lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
if (node->initiator == lb_temp->initiator &&
node->target == lb_temp->target) {
error_setg(errp, "Duplicate configuration of the bandwidth for "
"initiator=%d and target=%d", node->initiator,
node->target);
return;
}
}
hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1;
if (node->bandwidth) {
/* Keep bitmap unchanged when bandwidth out of range */
bitmap_copy = hmat_lb->range_bitmap;
bitmap_copy |= node->bandwidth;
first_bit = ctz64(bitmap_copy);
temp_base = UINT64_C(1) << first_bit;
max_entry = node->bandwidth / temp_base;
last_bit = 64 - clz64(bitmap_copy);
/*
* For bandwidth, first_bit record the base unit of bandwidth bits,
* last_bit record the last bit of the max bandwidth. The max
* compressed bandwidth should be less than 0xFFFF (UINT16_MAX)
*/
if ((last_bit - first_bit) > UINT16_BITS ||
max_entry >= UINT16_MAX) {
error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d "
"and target=%d should not differ from previously "
"entered values on more than %d", node->bandwidth,
node->initiator, node->target, UINT16_MAX - 1);
return;
} else {
hmat_lb->base = temp_base;
hmat_lb->range_bitmap = bitmap_copy;
}
/*
* Set lb_info_provided bit 1 as 1,
* bandwidth information is provided
*/
numa_info[node->target].lb_info_provided |= BIT(1);
}
lb_data.data = node->bandwidth;
} else {
assert(0);
}
g_array_append_val(hmat_lb->list, lb_data);
}
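
The base-unit handling above is easier to follow with a simplified,
standalone illustration (this is not code from numa.c, and it deliberately
ignores the incremental re-basing against previously entered values and the
UINT16 range checks that parse_numa_hmat_lb() performs): each latency
contributes its largest power-of-10 divisor, the smallest of those becomes
the common base, and build_hmat_lb() later emits base * 1000 picoseconds as
the Entry Base Unit plus value / base for each table entry. With made-up
latencies of 10 ns and 200 ns this gives base 10 and entries 1 and 20:

    #include <assert.h>
    #include <inttypes.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Largest power of 10 that divides v (illustrative helper; v > 0). */
    static uint64_t pow10_base(uint64_t v)
    {
        uint64_t base = 1;

        while (v % 10 == 0) {
            v /= 10;
            base *= 10;
        }
        return base;
    }

    int main(void)
    {
        /* Example latencies in nanoseconds (made-up numbers). */
        uint64_t lat[] = { 10, 200 };
        uint64_t base = UINT64_MAX;
        size_t i;

        for (i = 0; i < sizeof(lat) / sizeof(lat[0]); i++) {
            uint64_t b = pow10_base(lat[i]);
            base = b < base ? b : base;   /* smallest common base wins */
        }
        assert(base == 10);

        for (i = 0; i < sizeof(lat) / sizeof(lat[0]); i++) {
            printf("entry[%zu] = %" PRIu64 " (base %" PRIu64 " ns, "
                   "%" PRIu64 " ps in the table)\n",
                   i, lat[i] / base, base, base * 1000);
        }
        return 0;
    }
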
void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
Error **errp)
{
int nb_numa_nodes = ms->numa_state->num_nodes;
NodeInfo *numa_info = ms->numa_state->nodes;
NumaHmatCacheOptions *hmat_cache = NULL;
if (node->node_id >= nb_numa_nodes) {
error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less "
"than %d", node->node_id, nb_numa_nodes);
return;
}
if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) {
error_setg(errp, "The latency and bandwidth information of "
"node-id=%" PRIu32 " should be provided before memory side "
"cache attributes", node->node_id);
return;
}
if (node->level < 1 || node->level >= HMAT_LB_LEVELS) {
error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 "
"and less than or equal to %d", node->level,
HMAT_LB_LEVELS - 1);
return;
}
assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX);
assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX);
if (ms->numa_state->hmat_cache[node->node_id][node->level]) {
error_setg(errp, "Duplicate configuration of the side cache for "
"node-id=%" PRIu32 " and level=%" PRIu8,
node->node_id, node->level);
return;
}
if ((node->level > 1) &&
ms->numa_state->hmat_cache[node->node_id][node->level - 1] &&
(node->size >=
ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) {
error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
" should be less than the size(%" PRIu64 ") of "
"level=%u", node->size, node->level,
ms->numa_state->hmat_cache[node->node_id]
[node->level - 1]->size,
node->level - 1);
return;
}
if ((node->level < HMAT_LB_LEVELS - 1) &&
ms->numa_state->hmat_cache[node->node_id][node->level + 1] &&
(node->size <=
ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) {
error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
" should be larger than the size(%" PRIu64 ") of "
"level=%u", node->size, node->level,
ms->numa_state->hmat_cache[node->node_id]
[node->level + 1]->size,
node->level + 1);
return;
}
hmat_cache = g_malloc0(sizeof(*hmat_cache));
memcpy(hmat_cache, node, sizeof(*hmat_cache));
ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache;
}
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
{
Error *err = NULL;
@ -208,6 +479,32 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
&err);
break;
case NUMA_OPTIONS_TYPE_HMAT_LB:
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
if (err) {
goto end;
}
break;
case NUMA_OPTIONS_TYPE_HMAT_CACHE:
if (!ms->numa_state->hmat_enabled) {
error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
"(HMAT) is disabled, enable it with -machine hmat=on "
"before using any of hmat specific options");
return;
}
parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err);
if (err) {
goto end;
}
break;
default:
abort();
}


@ -68,6 +68,7 @@
#include "hw/i386/intel_iommu.h"
#include "hw/acpi/ipmi.h"
#include "hw/acpi/hmat.h"
/* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
* -M pc-i440fx-2.0. Even if the actual amount of AML generated grows
@ -2835,6 +2836,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
acpi_add_table(table_offsets, tables_blob);
build_slit(tables_blob, tables->linker, machine);
}
if (machine->numa_state->hmat_enabled) {
acpi_add_table(table_offsets, tables_blob);
build_hmat(tables_blob, tables->linker, machine->numa_state);
}
}
if (acpi_get_mcfg(&mcfg)) {
acpi_add_table(table_offsets, tables_blob);


@ -686,9 +686,18 @@ static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
return true;
}
static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base,
uint32_t pasid,
VTDPASIDDirEntry *pdire)
static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
{
return pdire->val & 1;
}
/**
* Caller of this function should check the present bit if it wants
* to use the pdir entry for further usage, except for the fpd bit check.
*/
static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
uint32_t pasid,
VTDPASIDDirEntry *pdire)
{
uint32_t index;
dma_addr_t addr, entry_size;
@ -703,18 +712,22 @@ static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base,
return 0;
}
static int vtd_get_pasid_entry(IntelIOMMUState *s,
uint32_t pasid,
VTDPASIDDirEntry *pdire,
VTDPASIDEntry *pe)
static inline bool vtd_pe_present(VTDPASIDEntry *pe)
{
return pe->val[0] & VTD_PASID_ENTRY_P;
}
static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
uint32_t pasid,
dma_addr_t addr,
VTDPASIDEntry *pe)
{
uint32_t index;
dma_addr_t addr, entry_size;
dma_addr_t entry_size;
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
index = VTD_PASID_TABLE_INDEX(pasid);
entry_size = VTD_PASID_ENTRY_SIZE;
addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
addr = addr + index * entry_size;
if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) {
return -VTD_FR_PASID_TABLE_INV;
@ -732,25 +745,54 @@ static int vtd_get_pasid_entry(IntelIOMMUState *s,
return 0;
}
static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s,
dma_addr_t pasid_dir_base,
uint32_t pasid,
VTDPASIDEntry *pe)
/**
* Caller of this function should check the present bit if it wants
* to use the pasid entry for further usage, except for the fpd bit check.
*/
static int vtd_get_pe_from_pdire(IntelIOMMUState *s,
uint32_t pasid,
VTDPASIDDirEntry *pdire,
VTDPASIDEntry *pe)
{
dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe);
}
/**
* This function gets a pasid entry from a specified pasid
* table (includes dir and leaf table) with a specified pasid.
* Sanity check should be done to ensure return a present
* pasid entry to caller.
*/
static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
dma_addr_t pasid_dir_base,
uint32_t pasid,
VTDPASIDEntry *pe)
{
int ret;
VTDPASIDDirEntry pdire;
ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
ret = vtd_get_pdire_from_pdir_table(pasid_dir_base,
pasid, &pdire);
if (ret) {
return ret;
}
ret = vtd_get_pasid_entry(s, pasid, &pdire, pe);
if (!vtd_pdire_present(&pdire)) {
return -VTD_FR_PASID_TABLE_INV;
}
ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
if (ret) {
return ret;
}
return ret;
if (!vtd_pe_present(pe)) {
return -VTD_FR_PASID_TABLE_INV;
}
return 0;
}
static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
@ -763,7 +805,7 @@ static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
pasid = VTD_CE_GET_RID2PASID(ce);
pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe);
ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);
return ret;
}
@ -781,7 +823,11 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
pasid = VTD_CE_GET_RID2PASID(ce);
pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
/*
* No present bit check since fpd is meaningful even
* if the present bit is clear.
*/
ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire);
if (ret) {
return ret;
}
@ -791,7 +837,15 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
return 0;
}
ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe);
if (!vtd_pdire_present(&pdire)) {
return -VTD_FR_PASID_TABLE_INV;
}
/*
* No present bit check since fpd is meaningful even
* if the present bit is clear.
*/
ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe);
if (ret) {
return ret;
}
@ -948,6 +1002,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
return vtd_bus;
}
}
vtd_bus = NULL;
}
return vtd_bus;
}
@ -2610,16 +2665,15 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
switch (addr) {
/* Root Table Address Register, 64-bit */
case DMAR_RTADDR_REG:
val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
if (size == 4) {
val = s->root & ((1ULL << 32) - 1);
} else {
val = s->root;
val = val & ((1ULL << 32) - 1);
}
break;
case DMAR_RTADDR_REG_HI:
assert(size == 4);
val = s->root >> 32;
val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32;
break;
/* Invalidation Queue Address Register, 64-bit */


@ -479,6 +479,7 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing Disable */
/* PASID Granular Translation Type Mask */
#define VTD_PASID_ENTRY_P 1ULL
#define VTD_SM_PASID_ENTRY_PGTT (7ULL << 6)
#define VTD_SM_PASID_ENTRY_FLT (1ULL << 6)
#define VTD_SM_PASID_ENTRY_SLT (2ULL << 6)


@ -425,7 +425,6 @@ static void pc_i440fx_5_0_machine_options(MachineClass *m)
m->alias = "pc";
m->is_default = 1;
pcmc->default_cpu_version = 1;
compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len);
}
DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL,


@ -354,7 +354,6 @@ static void pc_q35_5_0_machine_options(MachineClass *m)
pc_q35_machine_options(m);
m->alias = "q35";
pcmc->default_cpu_version = 1;
compat_props_add(m->compat_props, hw_compat_4_2, hw_compat_4_2_len);
}
DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL,


@ -280,6 +280,7 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp)
{
VirtIOInputClass *vic = VIRTIO_INPUT_GET_CLASS(dev);
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOInput *vinput = VIRTIO_INPUT(dev);
Error *local_err = NULL;
if (vic->unrealize) {
@ -289,8 +290,8 @@ static void virtio_input_device_unrealize(DeviceState *dev, Error **errp)
return;
}
}
virtio_del_queue(vdev, 0);
virtio_del_queue(vdev, 1);
virtio_delete_queue(vinput->evt);
virtio_delete_queue(vinput->sts);
virtio_cleanup(vdev);
}


@ -3102,7 +3102,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp)
for (i = 0; i < max_queues; i++) {
virtio_net_del_queue(n, i);
}
/* delete also control vq */
virtio_del_queue(vdev, max_queues * 2);
qemu_announce_timer_del(&n->announce_timer, false);
g_free(n->vqs);
qemu_del_nic(n->nic);


@ -106,7 +106,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr,
return ret;
}
void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len)
void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len)
{
PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
@ -115,28 +115,21 @@ void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len)
return;
}
PCI_DPRINTF("%s: %s: addr=%02" PRIx32 " val=%08" PRIx32 " len=%d\n",
__func__, pci_dev->name, config_addr, val, len);
pci_host_config_write_common(pci_dev, config_addr, PCI_CONFIG_SPACE_SIZE,
val, len);
}
uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len)
uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len)
{
PCIDevice *pci_dev = pci_dev_find_by_addr(s, addr);
uint32_t config_addr = addr & (PCI_CONFIG_SPACE_SIZE - 1);
uint32_t val;
if (!pci_dev) {
return ~0x0;
}
val = pci_host_config_read_common(pci_dev, config_addr,
PCI_CONFIG_SPACE_SIZE, len);
PCI_DPRINTF("%s: %s: addr=%02"PRIx32" val=%08"PRIx32" len=%d\n",
__func__, pci_dev->name, config_addr, val, len);
return val;
return pci_host_config_read_common(pci_dev, config_addr,
PCI_CONFIG_SPACE_SIZE, len);
}
static void pci_host_config_write(void *opaque, hwaddr addr,
@ -167,8 +160,7 @@ static void pci_host_data_write(void *opaque, hwaddr addr,
uint64_t val, unsigned len)
{
PCIHostState *s = opaque;
PCI_DPRINTF("write addr " TARGET_FMT_plx " len %d val %x\n",
addr, len, (unsigned)val);
if (s->config_reg & (1u << 31))
pci_data_write(s->bus, s->config_reg | (addr & 3), val, len);
}
@ -177,14 +169,11 @@ static uint64_t pci_host_data_read(void *opaque,
hwaddr addr, unsigned len)
{
PCIHostState *s = opaque;
uint32_t val;
if (!(s->config_reg & (1U << 31))) {
return 0xffffffff;
}
val = pci_data_read(s->bus, s->config_reg | (addr & 3), len);
PCI_DPRINTF("read addr " TARGET_FMT_plx " len %d val %x\n",
addr, len, val);
return val;
return pci_data_read(s->bus, s->config_reg | (addr & 3), len);
}
const MemoryRegionOps pci_host_conf_le_ops = {


@ -275,6 +275,8 @@ static Property vhost_scsi_properties[] = {
DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1),
DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size,
128),
DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, conf.seg_max_adjust,
true),
DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors,
0xFFFF),
DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128),


@ -39,6 +39,10 @@ static const int user_feature_bits[] = {
VHOST_INVALID_FEATURE_BIT
};
enum VhostUserProtocolFeature {
VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
};
static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status)
{
VHostUserSCSI *s = (VHostUserSCSI *)vdev;
@ -62,6 +66,25 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status)
}
}
static void vhost_user_scsi_reset(VirtIODevice *vdev)
{
VHostSCSICommon *vsc = VHOST_SCSI_COMMON(vdev);
struct vhost_dev *dev = &vsc->dev;
/*
* Historically, reset was not implemented so only reset devices
* that are expecting it.
*/
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RESET_DEVICE)) {
return;
}
if (dev->vhost_ops->vhost_reset_device) {
dev->vhost_ops->vhost_reset_device(dev);
}
}
static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
}
@ -182,6 +205,7 @@ static void vhost_user_scsi_class_init(ObjectClass *klass, void *data)
vdc->get_features = vhost_scsi_common_get_features;
vdc->set_config = vhost_scsi_common_set_config;
vdc->set_status = vhost_user_scsi_set_status;
vdc->reset = vhost_user_scsi_reset;
fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path;
}


@ -597,12 +597,15 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
{
VirtIOSCSIReq *req, *next;
int ret = 0;
bool suppress_notifications = virtio_queue_get_notification(vq);
bool progress = false;
QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
do {
virtio_queue_set_notification(vq, 0);
if (suppress_notifications) {
virtio_queue_set_notification(vq, 0);
}
while ((req = virtio_scsi_pop_req(s, vq))) {
progress = true;
@ -622,7 +625,9 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
}
}
virtio_queue_set_notification(vq, 1);
if (suppress_notifications) {
virtio_queue_set_notification(vq, 1);
}
} while (ret != -EINVAL && !virtio_queue_empty(vq));
QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
@ -654,7 +659,8 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
VirtIOSCSICommon *s = VIRTIO_SCSI_COMMON(vdev);
virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues);
virtio_stl_p(vdev, &scsiconf->seg_max, 128 - 2);
virtio_stl_p(vdev, &scsiconf->seg_max,
s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2);
virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors);
virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun);
virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent));
@ -893,6 +899,11 @@ void virtio_scsi_common_realize(DeviceState *dev,
virtio_cleanup(vdev);
return;
}
if (s->conf.virtqueue_size <= 2) {
error_setg(errp, "invalid virtqueue_size property (= %" PRIu32 "), "
"must be > 2", s->conf.virtqueue_size);
return;
}
s->cmd_vqs = g_new0(VirtQueue *, s->conf.num_queues);
s->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE;
s->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE;
@ -949,6 +960,8 @@ static Property virtio_scsi_properties[] = {
DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1),
DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
parent_obj.conf.virtqueue_size, 128),
DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
parent_obj.conf.seg_max_adjust, true),
DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors,
0xFFFF),
DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSI, parent_obj.conf.cmd_per_lun,


@ -58,6 +58,7 @@ enum VhostUserProtocolFeature {
VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
VHOST_USER_PROTOCOL_F_MAX
};
@ -98,6 +99,7 @@ typedef enum VhostUserRequest {
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
VHOST_USER_RESET_DEVICE = 34,
VHOST_USER_MAX
} VhostUserRequest;
@ -890,10 +892,14 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
static int vhost_user_reset_device(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_RESET_OWNER,
.hdr.flags = VHOST_USER_VERSION,
};
msg.hdr.request = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RESET_DEVICE)
? VHOST_USER_RESET_DEVICE
: VHOST_USER_RESET_OWNER;
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
return -1;
}


@ -831,6 +831,13 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
}
balloon_stats_destroy_timer(s);
qemu_remove_balloon_handler(s);
virtio_delete_queue(s->ivq);
virtio_delete_queue(s->dvq);
virtio_delete_queue(s->svq);
if (s->free_page_vq) {
virtio_delete_queue(s->free_page_vq);
}
virtio_cleanup(vdev);
}


@ -65,6 +65,19 @@ static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy)
virtio_bus_stop_ioeventfd(&proxy->bus);
}
static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy)
{
int i;
if (proxy->legacy) {
return;
}
for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
proxy->vqs[i].enabled = 0;
}
}
static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size)
{
VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
@ -295,8 +308,9 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value,
break;
case VIRTIO_MMIO_QUEUE_NUM:
trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE);
virtio_queue_set_num(vdev, vdev->queue_sel, value);
if (proxy->legacy) {
virtio_queue_set_num(vdev, vdev->queue_sel, value);
virtio_queue_update_rings(vdev, vdev->queue_sel);
} else {
proxy->vqs[vdev->queue_sel].num = value;
@ -378,6 +392,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value,
if (vdev->status == 0) {
virtio_reset(vdev);
virtio_mmio_soft_reset(proxy);
}
break;
case VIRTIO_MMIO_QUEUE_DESC_LOW:


@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
pcie_cap_flr_write_config(pci_dev, address, val, len);
}
if (range_covers_byte(address, len, PCI_COMMAND) &&
!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
virtio_pci_stop_ioeventfd(proxy);
virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
if (range_covers_byte(address, len, PCI_COMMAND)) {
if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
virtio_set_disabled(vdev, true);
virtio_pci_stop_ioeventfd(proxy);
virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
} else {
virtio_set_disabled(vdev, false);
}
}
if (proxy->config_cap &&
@ -1256,6 +1260,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
break;
case VIRTIO_PCI_COMMON_Q_SIZE:
proxy->vqs[vdev->queue_sel].num = val;
virtio_queue_set_num(vdev, vdev->queue_sel,
proxy->vqs[vdev->queue_sel].num);
break;
case VIRTIO_PCI_COMMON_Q_MSIX:
msix_vector_unuse(&proxy->pci_dev,


@ -432,6 +432,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable)
}
}
bool virtio_queue_get_notification(VirtQueue *vq)
{
return vq->notification;
}
void virtio_queue_set_notification(VirtQueue *vq, int enable)
{
vq->notification = enable;
@ -546,7 +551,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter)
* Called within rcu_read_lock(). */
static int virtio_queue_empty_rcu(VirtQueue *vq)
{
if (unlikely(vq->vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
return 1;
}
@ -565,7 +570,7 @@ static int virtio_queue_split_empty(VirtQueue *vq)
{
bool empty;
if (unlikely(vq->vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
return 1;
}
@ -783,7 +788,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
virtqueue_unmap_sg(vq, elem, len);
if (unlikely(vq->vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
return;
}
@ -839,7 +844,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
if (unlikely(vq->vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
vq->inuse -= count;
return;
}
@ -1602,7 +1607,7 @@ err_undo_map:
void *virtqueue_pop(VirtQueue *vq, size_t sz)
{
if (unlikely(vq->vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
return NULL;
}
@ -1698,7 +1703,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq)
{
struct VirtIODevice *vdev = vq->vdev;
if (unlikely(vdev->broken)) {
if (virtio_device_disabled(vq->vdev)) {
return 0;
}
@ -1816,7 +1821,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
if (unlikely(vdev->broken)) {
if (virtio_device_disabled(vdev)) {
return;
}
@ -1920,6 +1925,7 @@ void virtio_reset(void *opaque)
vdev->guest_features = 0;
vdev->queue_sel = 0;
vdev->status = 0;
vdev->disabled = false;
atomic_set(&vdev->isr, 0);
vdev->config_vector = VIRTIO_NO_VECTOR;
virtio_notify_vector(vdev, vdev->config_vector);
@ -2330,17 +2336,24 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
return &vdev->vq[i];
}
void virtio_delete_queue(VirtQueue *vq)
{
vq->vring.num = 0;
vq->vring.num_default = 0;
vq->handle_output = NULL;
vq->handle_aio_output = NULL;
g_free(vq->used_elems);
vq->used_elems = NULL;
virtio_virtqueue_reset_region_cache(vq);
}
void virtio_del_queue(VirtIODevice *vdev, int n)
{
if (n < 0 || n >= VIRTIO_QUEUE_MAX) {
abort();
}
vdev->vq[n].vring.num = 0;
vdev->vq[n].vring.num_default = 0;
vdev->vq[n].handle_output = NULL;
vdev->vq[n].handle_aio_output = NULL;
g_free(vdev->vq[n].used_elems);
virtio_delete_queue(&vdev->vq[n]);
}
static void virtio_set_isr(VirtIODevice *vdev, int value)
@ -2553,6 +2566,13 @@ static bool virtio_started_needed(void *opaque)
return vdev->started;
}
static bool virtio_disabled_needed(void *opaque)
{
VirtIODevice *vdev = opaque;
return vdev->disabled;
}
static const VMStateDescription vmstate_virtqueue = {
.name = "virtqueue_state",
.version_id = 1,
@ -2718,6 +2738,17 @@ static const VMStateDescription vmstate_virtio_started = {
}
};
static const VMStateDescription vmstate_virtio_disabled = {
.name = "virtio/disabled",
.version_id = 1,
.minimum_version_id = 1,
.needed = &virtio_disabled_needed,
.fields = (VMStateField[]) {
VMSTATE_BOOL(disabled, VirtIODevice),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_virtio = {
.name = "virtio",
.version_id = 1,
@ -2735,6 +2766,7 @@ static const VMStateDescription vmstate_virtio = {
&vmstate_virtio_extra_state,
&vmstate_virtio_started,
&vmstate_virtio_packed_virtqueues,
&vmstate_virtio_disabled,
NULL
}
};
@ -3384,17 +3416,12 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque)
{
EventNotifier *n = opaque;
VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
bool progress;
if (!vq->vring.desc || virtio_queue_empty(vq)) {
return false;
}
progress = virtio_queue_notify_aio_vq(vq);
/* In case the handler function re-enabled notifications */
virtio_queue_set_notification(vq, 0);
return progress;
return virtio_queue_notify_aio_vq(vq);
}
static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
@ -3569,6 +3596,7 @@ static void virtio_device_instance_finalize(Object *obj)
static Property virtio_properties[] = {
DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true),
DEFINE_PROP_END_OF_LIST(),
};


@ -62,8 +62,8 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr,
uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr,
uint32_t limit, uint32_t len);
void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, int len);
uint32_t pci_data_read(PCIBus *s, uint32_t addr, int len);
void pci_data_write(PCIBus *s, uint32_t addr, uint32_t val, unsigned len);
uint32_t pci_data_read(PCIBus *s, uint32_t addr, unsigned len);
extern const MemoryRegionOps pci_host_conf_le_ops;
extern const MemoryRegionOps pci_host_conf_be_ops;


@ -38,6 +38,7 @@ struct VirtIOBlkConf
uint32_t request_merging;
uint16_t num_queues;
uint16_t queue_size;
bool seg_max_adjust;
uint32_t max_discard_sectors;
uint32_t max_write_zeroes_sectors;
bool x_enable_wce_if_config_wce;


@ -48,6 +48,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
struct VirtIOSCSIConf {
uint32_t num_queues;
uint32_t virtqueue_size;
bool seg_max_adjust;
uint32_t max_sectors;
uint32_t cmd_per_lun;
#ifdef CONFIG_VHOST_SCSI


@ -100,6 +100,8 @@ struct VirtIODevice
uint16_t device_id;
bool vm_running;
bool broken; /* device in invalid state, needs reset */
bool use_disabled_flag; /* allow use of 'disable' flag when needed */
bool disabled; /* device in temporarily disabled state */
bool use_started;
bool started;
bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
@ -183,6 +185,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
void virtio_del_queue(VirtIODevice *vdev, int n);
void virtio_delete_queue(VirtQueue *vq);
void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
unsigned int len);
void virtqueue_flush(VirtQueue *vq, unsigned int count);
@ -224,6 +228,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id);
void virtio_notify_config(VirtIODevice *vdev);
bool virtio_queue_get_notification(VirtQueue *vq);
void virtio_queue_set_notification(VirtQueue *vq, int enable);
int virtio_queue_ready(VirtQueue *vq);
@ -378,4 +383,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started)
vdev->started = started;
}
}
static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable)
{
if (vdev->use_disabled_flag) {
vdev->disabled = disable;
}
}
static inline bool virtio_device_disabled(VirtIODevice *vdev)
{
return unlikely(vdev->disabled || vdev->broken);
}
#endif


@ -14,10 +14,35 @@ struct CPUArchId;
#define NUMA_DISTANCE_MAX 254
#define NUMA_DISTANCE_UNREACHABLE 255
/* the value of AcpiHmatLBInfo flags */
enum {
HMAT_LB_MEM_MEMORY = 0,
HMAT_LB_MEM_CACHE_1ST_LEVEL = 1,
HMAT_LB_MEM_CACHE_2ND_LEVEL = 2,
HMAT_LB_MEM_CACHE_3RD_LEVEL = 3,
HMAT_LB_LEVELS /* must be the last entry */
};
/* the value of AcpiHmatLBInfo data type */
enum {
HMAT_LB_DATA_ACCESS_LATENCY = 0,
HMAT_LB_DATA_READ_LATENCY = 1,
HMAT_LB_DATA_WRITE_LATENCY = 2,
HMAT_LB_DATA_ACCESS_BANDWIDTH = 3,
HMAT_LB_DATA_READ_BANDWIDTH = 4,
HMAT_LB_DATA_WRITE_BANDWIDTH = 5,
HMAT_LB_TYPES /* must be the last entry */
};
#define UINT16_BITS 16
struct NodeInfo {
uint64_t node_mem;
struct HostMemoryBackend *node_memdev;
bool present;
bool has_cpu;
uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
};
@ -26,6 +51,31 @@ struct NumaNodeMem {
uint64_t node_plugged_mem;
};
struct HMAT_LB_Data {
uint8_t initiator;
uint8_t target;
uint64_t data;
};
typedef struct HMAT_LB_Data HMAT_LB_Data;
struct HMAT_LB_Info {
/* Indicates whether this describes memory or a given level of memory side cache. */
uint8_t hierarchy;
/* The type of data: access/read/write latency or bandwidth. */
uint8_t data_type;
/* The range bitmap of bandwidth for calculating common base */
uint64_t range_bitmap;
/* The common base unit for latencies or bandwidths */
uint64_t base;
/* Array to store the latencies or bandwidths */
GArray *list;
};
typedef struct HMAT_LB_Info HMAT_LB_Info;
struct NumaState {
/* Number of NUMA nodes */
int num_nodes;
@ -33,13 +83,26 @@ struct NumaState {
/* Allow setting NUMA distance for different NUMA nodes */
bool have_numa_distance;
/* Detect if HMAT support is enabled. */
bool hmat_enabled;
/* NUMA nodes information */
NodeInfo nodes[MAX_NODES];
/* NUMA nodes HMAT Locality Latency and Bandwidth Information */
HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
/* Memory Side Cache Information Structure */
NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS];
};
typedef struct NumaState NumaState;
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
void parse_numa_opts(MachineState *ms);
void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
Error **errp);
void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
Error **errp);
void numa_complete_configuration(MachineState *ms);
void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
extern QemuOptsList qemu_numa_opts;

View File

@ -426,10 +426,14 @@
#
# @cpu: property based CPU(s) to node mapping (Since: 2.10)
#
# @hmat-lb: memory latency and bandwidth information (Since: 5.0)
#
# @hmat-cache: memory side cache information (Since: 5.0)
#
# Since: 2.1
##
{ 'enum': 'NumaOptionsType',
'data': [ 'node', 'dist', 'cpu' ] }
'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] }
##
# @NumaOptions:
@ -444,7 +448,9 @@
'data': {
'node': 'NumaNodeOptions',
'dist': 'NumaDistOptions',
'cpu': 'NumaCpuOptions' }}
'cpu': 'NumaCpuOptions',
'hmat-lb': 'NumaHmatLBOptions',
'hmat-cache': 'NumaHmatCacheOptions' }}
##
# @NumaNodeOptions:
@ -463,6 +469,13 @@
# @memdev: memory backend object. If specified for one node,
# it must be specified for all nodes.
#
# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145,
# points to the nodeid which has the memory controller
# responsible for this NUMA node. This field provides
# additional information as to the initiator node that
# is closest (as in directly attached) to this node, and
# therefore has the best performance (since 5.0)
#
# Since: 2.1
##
{ 'struct': 'NumaNodeOptions',
@ -470,7 +483,8 @@
'*nodeid': 'uint16',
'*cpus': ['uint16'],
'*mem': 'size',
'*memdev': 'str' }}
'*memdev': 'str',
'*initiator': 'uint16' }}
##
# @NumaDistOptions:
@ -549,6 +563,166 @@
'base': 'CpuInstanceProperties',
'data' : {} }
##
# @HmatLBMemoryHierarchy:
#
# The memory hierarchy in the System Locality Latency and Bandwidth
# Information Structure of HMAT (Heterogeneous Memory Attribute Table)
#
# For more information about @HmatLBMemoryHierarchy, see chapter
# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec.
#
# @memory: the structure represents the memory performance
#
# @first-level: first level of memory side cache
#
# @second-level: second level of memory side cache
#
# @third-level: third level of memory side cache
#
# Since: 5.0
##
{ 'enum': 'HmatLBMemoryHierarchy',
'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] }
##
# @HmatLBDataType:
#
# Data type in the System Locality Latency and Bandwidth
# Information Structure of HMAT (Heterogeneous Memory Attribute Table)
#
# For more information about @HmatLBDataType, see chapter
# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec.
#
# @access-latency: access latency (nanoseconds)
#
# @read-latency: read latency (nanoseconds)
#
# @write-latency: write latency (nanoseconds)
#
# @access-bandwidth: access bandwidth (Bytes per second)
#
# @read-bandwidth: read bandwidth (Bytes per second)
#
# @write-bandwidth: write bandwidth (Bytes per second)
#
# Since: 5.0
##
{ 'enum': 'HmatLBDataType',
'data': [ 'access-latency', 'read-latency', 'write-latency',
'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] }
##
# @NumaHmatLBOptions:
#
# Set the system locality latency and bandwidth information
# between Initiator and Target proximity Domains.
#
# For more information about @NumaHmatLBOptions, see chapter
# 5.2.27.4: Table 5-146 of ACPI 6.3 spec.
#
# @initiator: the Initiator Proximity Domain.
#
# @target: the Target Proximity Domain.
#
# @hierarchy: the Memory Hierarchy. Indicates whether the structure
# describes memory performance or memory side cache performance.
#
# @data-type: the type of data: access/read/write latency or
# access/read/write bandwidth (hit latency/bandwidth when
# @hierarchy is a memory side cache level).
#
# @latency: the latency from the @initiator to the @target proximity
# domain, in nanoseconds.
#
# @bandwidth: the bandwidth between the @initiator and @target proximity
# domains, in bytes per second.
#
# Since: 5.0
##
{ 'struct': 'NumaHmatLBOptions',
'data': {
'initiator': 'uint16',
'target': 'uint16',
'hierarchy': 'HmatLBMemoryHierarchy',
'data-type': 'HmatLBDataType',
'*latency': 'uint64',
'*bandwidth': 'size' }}
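For illustration only, a preconfig-time QMP invocation using this struct might
look as follows (the argument values mirror the HMAT qtests added later in this
series; the empty-return reply is the usual success response):
-> { "execute": "set-numa-node",
     "arguments": { "type": "hmat-lb", "initiator": 0, "target": 0,
                    "hierarchy": "memory", "data-type": "access-latency",
                    "latency": 1 } }
<- { "return": {} }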
##
# @HmatCacheAssociativity:
#
# Cache associativity in the Memory Side Cache Information Structure
# of HMAT
#
# For more information about @HmatCacheAssociativity, see chapter
# 5.2.27.5: Table 5-147 of ACPI 6.3 spec.
#
# @none: None (no memory side cache in this proximity domain,
# or cache associativity unknown)
#
# @direct: Direct Mapped
#
# @complex: Complex Cache Indexing (implementation specific)
#
# Since: 5.0
##
{ 'enum': 'HmatCacheAssociativity',
'data': [ 'none', 'direct', 'complex' ] }
##
# @HmatCacheWritePolicy:
#
# Cache write policy in the Memory Side Cache Information Structure
# of HMAT
#
# For more information about @HmatCacheWritePolicy, see chapter
# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec.
#
# @none: None (no memory side cache in this proximity domain,
# or cache write policy unknown)
#
# @write-back: Write Back (WB)
#
# @write-through: Write Through (WT)
#
# Since: 5.0
##
{ 'enum': 'HmatCacheWritePolicy',
'data': [ 'none', 'write-back', 'write-through' ] }
##
# @NumaHmatCacheOptions:
#
# Set the memory side cache information for a given memory domain.
#
# For more information about @NumaHmatCacheOptions, see chapter
# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec.
#
# @node-id: the memory proximity domain to which the memory belongs.
#
# @size: the size of memory side cache in bytes.
#
# @level: the cache level described in this structure.
#
# @associativity: the cache associativity,
# none/direct-mapped/complex(complex cache indexing).
#
# @policy: the write policy, none/write-back/write-through.
#
# @line: the cache line size in bytes.
#
# Since: 5.0
##
{ 'struct': 'NumaHmatCacheOptions',
'data': {
'node-id': 'uint32',
'size': 'size',
'level': 'uint8',
'associativity': 'HmatCacheAssociativity',
'policy': 'HmatCacheWritePolicy',
'line': 'uint16' }}
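Similarly, a sketch of configuring a memory side cache through QMP (again an
illustration only, mirroring the values exercised by the qtests below):
-> { "execute": "set-numa-node",
     "arguments": { "type": "hmat-cache", "node-id": 0, "size": 10240,
                    "level": 1, "associativity": "direct",
                    "policy": "write-back", "line": 8 } }
<- { "return": {} }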
##
# @HostMemPolicy:
#

View File

@ -40,7 +40,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
" suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
" nvdimm=on|off controls NVDIMM support (default=off)\n"
" enforce-config-section=on|off enforce configuration section migration (default=off)\n"
" memory-encryption=@var{} memory encryption object to use (default=none)\n",
" memory-encryption=@var{} memory encryption object to use (default=none)\n"
" hmat=on|off controls ACPI HMAT support (default=off)\n",
QEMU_ARCH_ALL)
STEXI
@item -machine [type=]@var{name}[,prop=@var{value}[,...]]
@ -94,6 +95,9 @@ NOTE: this parameter is deprecated. Please use @option{-global}
@option{migration.send-configuration}=@var{on|off} instead.
@item memory-encryption=@var{}
Memory encryption object to use. The default is none.
@item hmat=on|off
Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support.
The default is off.
@end table
ETEXI
@ -168,19 +172,24 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi
ETEXI
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
"-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
"-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
"-numa dist,src=source,dst=destination,val=distance\n"
"-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
"-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
"-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n"
"-numa hmat-cache,node-id=node,size=size,level=level[,associativity=none|direct|complex][,policy=none|write-back|write-through][,line=size]\n",
QEMU_ARCH_ALL)
STEXI
@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
@itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
@itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{type}[,latency=@var{lat}][,bandwidth=@var{bw}]
@itemx -numa hmat-cache,node-id=@var{node},size=@var{size},level=@var{level}[,associativity=@var{str}][,policy=@var{str}][,line=@var{size}]
@findex -numa
Define a NUMA node and assign RAM and VCPUs to it.
Set the NUMA distance from a source node to a destination node.
Set the ACPI Heterogeneous Memory Attributes for the given nodes.
Legacy VCPU assignment uses @samp{cpus} option where
@var{firstcpu} and @var{lastcpu} are CPU indexes. Each
@ -222,6 +231,27 @@ split equally between them.
@samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
if one node uses @samp{memdev}, all of them have to use it.
@samp{initiator} is an additional option that points to the @var{initiator}
NUMA node that has the best performance (the lowest latency or largest
bandwidth) to this NUMA @var{node}. Note that this option can only be set
when the machine property 'hmat' is set to 'on'.
The following example creates a machine with 2 NUMA nodes: node 0 has CPUs and
node 1 has only memory, with node 0 as its initiator. Note that because
node 0 has CPUs, its initiator defaults to itself and must be itself.
@example
-machine hmat=on \
-m 2G,slots=2,maxmem=4G \
-object memory-backend-ram,size=1G,id=m0 \
-object memory-backend-ram,size=1G,id=m1 \
-numa node,nodeid=0,memdev=m0 \
-numa node,nodeid=1,memdev=m1,initiator=0 \
-smp 2,sockets=2,maxcpus=2 \
-numa cpu,node-id=0,socket-id=0 \
-numa cpu,node-id=0,socket-id=1
@end example
@var{source} and @var{destination} are NUMA node IDs.
@var{distance} is the NUMA distance from @var{source} to @var{destination}.
The distance from a node to itself is always 10. If any pair of nodes is
@ -238,6 +268,59 @@ specified resources, it just assigns existing resources to NUMA
nodes. This means that one still has to use the @option{-m},
@option{-smp} options to allocate RAM and VCPUs respectively.
Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
between initiator and target NUMA nodes in the ACPI Heterogeneous Memory Attribute Table (HMAT).
An initiator NUMA node can generate memory requests and usually has one or more processors;
a target NUMA node contains addressable memory.
In the @samp{hmat-lb} option, the @var{node} values are NUMA node IDs. @var{hierarchy} is the memory
hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the structure
represents the memory performance; if @var{hierarchy} is 'first-level|second-level|third-level',
the structure represents the aggregated performance of memory side caches for each domain.
The @var{type} of 'data-type' is the type of data represented by this structure instance:
if 'hierarchy' is 'memory', 'data-type' is the 'access|read|write' latency or 'access|read|write'
bandwidth of the target memory; if 'hierarchy' is 'first-level|second-level|third-level',
'data-type' is the 'access|read|write' hit latency or 'access|read|write' hit bandwidth of the
target memory side cache.
@var{lat} is the latency value in nanoseconds. @var{bw} is the bandwidth value;
the accepted form is NUM[M|G|T], meaning the bandwidth is NUM bytes per second
(or MB/s, GB/s or TB/s depending on the suffix used).
Note that a latency or bandwidth value of 0 means the corresponding latency or
bandwidth information is not provided.
In the @samp{hmat-cache} option, @var{node-id} is the ID of the NUMA node the memory belongs to.
@var{size} is the size of the memory side cache in bytes. @var{level} is the cache
level described in this structure; note that cache level 0 should not be used
with the @samp{hmat-cache} option. @var{associativity} is the cache associativity;
the possible values are 'none/direct(direct-mapped)/complex(complex cache indexing)'.
@var{policy} is the write policy. @var{line} is the cache line size in bytes.
For example, the following options describe 2 NUMA nodes. Node 0 has 2 CPUs and
RAM, node 1 has only RAM. The processors in node 0 access memory in node 0
with an access latency of 5 nanoseconds and an access bandwidth of 200 MB/s;
they access memory in NUMA node 1 with an access latency of 10 nanoseconds
and an access bandwidth of 100 MB/s.
For the memory side cache information, NUMA nodes 0 and 1 both have one level of
memory cache of size 10 KiB, with a write-back policy and a cache line size of 8 bytes:
@example
-machine hmat=on \
-m 2G \
-object memory-backend-ram,size=1G,id=m0 \
-object memory-backend-ram,size=1G,id=m1 \
-smp 2 \
-numa node,nodeid=0,memdev=m0 \
-numa node,nodeid=1,memdev=m1,initiator=0 \
-numa cpu,node-id=0,socket-id=0 \
-numa cpu,node-id=0,socket-id=1 \
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M \
-numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8 \
-numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8
@end example
ETEXI
DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,

View File

@ -0,0 +1,134 @@
#!/usr/bin/env python
#
# Test virtio-scsi and virtio-blk queue settings for all machine types
#
# Copyright (c) 2019 Virtuozzo International GmbH
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import sys
import os
import re
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python'))
from qemu.machine import QEMUMachine
from avocado_qemu import Test
# List of machine types and virtqueue properties to test
VIRTIO_SCSI_PROPS = {'seg_max_adjust': 'seg_max_adjust'}
VIRTIO_BLK_PROPS = {'seg_max_adjust': 'seg-max-adjust'}
DEV_TYPES = {'virtio-scsi-pci': VIRTIO_SCSI_PROPS,
'virtio-blk-pci': VIRTIO_BLK_PROPS}
VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'],
'virtio-blk-pci': ['-device',
'virtio-blk-pci,id=scsi0,drive=drive0',
'-drive',
'driver=null-co,id=drive0,if=none']}
class VirtioMaxSegSettingsCheck(Test):
@staticmethod
def make_pattern(props):
pattern_items = [r'{0} = \w+'.format(prop) for prop in props]
return '|'.join(pattern_items)
def query_virtqueue(self, vm, dev_type_name):
query_ok = False
error = None
props = None
output = vm.command('human-monitor-command',
command_line = 'info qtree')
props_list = DEV_TYPES[dev_type_name].values()
pattern = self.make_pattern(props_list)
res = re.findall(pattern, output)
if len(res) != len(props_list):
props_list = set(props_list)
res = set(res)
not_found = props_list.difference(res)
not_found = ', '.join(not_found)
error = '({0}): The following properties not found: {1}'\
.format(dev_type_name, not_found)
else:
query_ok = True
props = dict()
for prop in res:
p = prop.split(' = ')
props[p[0]] = p[1]
return query_ok, props, error
def check_mt(self, mt, dev_type_name):
with QEMUMachine(self.qemu_bin) as vm:
vm.set_machine(mt["name"])
for s in VM_DEV_PARAMS[dev_type_name]:
vm.add_args(s)
vm.launch()
query_ok, props, error = self.query_virtqueue(vm, dev_type_name)
if not query_ok:
self.fail('machine type {0}: {1}'.format(mt['name'], error))
for prop_name, prop_val in props.items():
expected_val = mt[prop_name]
self.assertEqual(expected_val, prop_val)
@staticmethod
def seg_max_adjust_enabled(mt):
# machine types >= 5.0 should have seg_max_adjust = true
# others seg_max_adjust = false
mt = mt.split("-")
# machine types with a one-word name or a name like pc-x.x
if len(mt) <= 2:
return False
# machine types like pc-<chip_name>-x.x[.x]
ver = mt[2]
ver = ver.split(".")
# versions >= 5.0 go with seg_max_adjust enabled
major = int(ver[0])
if major >= 5:
return True
return False
def test_machine_types(self):
# collect all machine types except 'none', 'isapc', 'microvm'
with QEMUMachine(self.qemu_bin) as vm:
vm.launch()
machines = [m['name'] for m in vm.command('query-machines')]
vm.shutdown()
machines.remove('none')
machines.remove('isapc')
machines.remove('microvm')
for dev_type in DEV_TYPES:
# create the list of machine types and their parameters.
mtypes = list()
for m in machines:
if self.seg_max_adjust_enabled(m):
enabled = 'true'
else:
enabled = 'false'
mtypes.append({'name': m,
DEV_TYPES[dev_type]['seg_max_adjust']: enabled})
# test each machine type for a device type
for mt in mtypes:
self.check_mt(mt, dev_type)

View File

@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void)
}
static void test_acpi_tcg_acpi_hmat(const char *machine)
{
test_data data;
memset(&data, 0, sizeof(data));
data.machine = machine;
data.variant = ".acpihmat";
test_acpi_one(" -machine hmat=on"
" -smp 2,sockets=2"
" -m 128M,slots=2,maxmem=1G"
" -object memory-backend-ram,size=64M,id=m0"
" -object memory-backend-ram,size=64M,id=m1"
" -numa node,nodeid=0,memdev=m0"
" -numa node,nodeid=1,memdev=m1,initiator=0"
" -numa cpu,node-id=0,socket-id=0"
" -numa cpu,node-id=0,socket-id=1"
" -numa hmat-lb,initiator=0,target=0,hierarchy=memory,"
"data-type=access-latency,latency=1"
" -numa hmat-lb,initiator=0,target=0,hierarchy=memory,"
"data-type=access-bandwidth,bandwidth=65534M"
" -numa hmat-lb,initiator=0,target=1,hierarchy=memory,"
"data-type=access-latency,latency=65534"
" -numa hmat-lb,initiator=0,target=1,hierarchy=memory,"
"data-type=access-bandwidth,bandwidth=32767M"
" -numa hmat-cache,node-id=0,size=10K,level=1,"
"associativity=direct,policy=write-back,line=8"
" -numa hmat-cache,node-id=1,size=10K,level=1,"
"associativity=direct,policy=write-back,line=8",
&data);
free_test_data(&data);
}
static void test_acpi_q35_tcg_acpi_hmat(void)
{
test_acpi_tcg_acpi_hmat(MACHINE_Q35);
}
static void test_acpi_piix4_tcg_acpi_hmat(void)
{
test_acpi_tcg_acpi_hmat(MACHINE_PC);
}
static void test_acpi_virt_tcg(void)
{
test_data data = {
@ -991,6 +1033,8 @@ int main(int argc, char *argv[])
qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem);
qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm);
qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm);
qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat);
qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat);
} else if (strcmp(arch, "aarch64") == 0) {
qtest_add_func("acpi/virt", test_acpi_virt_tcg);
qtest_add_func("acpi/virt/numamem", test_acpi_virt_tcg_numamem);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data)
qtest_quit(qs);
}
static void pc_hmat_build_cfg(const void *data)
{
QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on "
"-smp 2,sockets=2 "
"-m 128M,slots=2,maxmem=1G "
"-object memory-backend-ram,size=64M,id=m0 "
"-object memory-backend-ram,size=64M,id=m1 "
"-numa node,nodeid=0,memdev=m0 "
"-numa node,nodeid=1,memdev=m1,initiator=0 "
"-numa cpu,node-id=0,socket-id=0 "
"-numa cpu,node-id=0,socket-id=1",
data ? (char *)data : "");
/* Fail: Initiator should be less than the number of nodes */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }")));
/* Fail: Target should be less than the number of nodes */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }")));
/* Fail: Initiator should contain cpu */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }")));
/* Fail: Data-type mismatch */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"write-latency\","
" 'bandwidth': 524288000 } }")));
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\","
" 'latency': 5 } }")));
/* Fail: Bandwidth should be 1MB (1048576) aligned */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\","
" 'bandwidth': 1048575 } }")));
/* Configuring HMAT bandwidth and latency details */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 1 } }"))); /* 1 ns */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 5 } }"))); /* Fail: Duplicate configuration */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\","
" 'bandwidth': 68717379584 } }"))); /* 65534 MB/s */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 65534 } }"))); /* 65534 ns */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1,"
" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\","
" 'bandwidth': 34358689792 } }"))); /* 32767 MB/s */
/* Fail: node_id should be less than the number of nodes */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 2, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* Fail: level should be less than HMAT_LB_LEVELS (4) */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 4, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* Fail: associativity option should be 'none', if level is 0 */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 0, 'associativity': \"direct\", 'policy': \"none\","
" 'line': 0 } }")));
/* Fail: policy option should be 'none', if level is 0 */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 0, 'associativity': \"none\", 'policy': \"write-back\","
" 'line': 0 } }")));
/* Fail: line option should be 0, if level is 0 */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 0, 'associativity': \"none\", 'policy': \"none\","
" 'line': 8 } }")));
/* Configuring HMAT memory side cache attributes */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }"))); /* Fail: Duplicate configuration */
/* Fail: The size of level 2 should be smaller than that of level 1 */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 2, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* Fail: The size of level 0 should be larger than that of level 1 */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 0, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 1, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* let machine initialization complete and run */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs,
"{ 'execute': 'x-exit-preconfig' }")));
qtest_qmp_eventwait(qs, "RESUME");
qtest_quit(qs);
}
static void pc_hmat_off_cfg(const void *data)
{
QTestState *qs = qtest_initf("%s -nodefaults --preconfig "
"-smp 2,sockets=2 "
"-m 128M,slots=2,maxmem=1G "
"-object memory-backend-ram,size=64M,id=m0 "
"-object memory-backend-ram,size=64M,id=m1 "
"-numa node,nodeid=0,memdev=m0",
data ? (char *)data : "");
/*
* Fail: Enable HMAT with -machine hmat=on
* before using any of hmat specific options
*/
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\","
" 'initiator': 0 } }")));
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\" } }")));
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 1 } }")));
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* let machine initialization complete and run */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs,
"{ 'execute': 'x-exit-preconfig' }")));
qtest_qmp_eventwait(qs, "RESUME");
qtest_quit(qs);
}
static void pc_hmat_erange_cfg(const void *data)
{
QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on "
"-smp 2,sockets=2 "
"-m 128M,slots=2,maxmem=1G "
"-object memory-backend-ram,size=64M,id=m0 "
"-object memory-backend-ram,size=64M,id=m1 "
"-numa node,nodeid=0,memdev=m0 "
"-numa node,nodeid=1,memdev=m1,initiator=0 "
"-numa cpu,node-id=0,socket-id=0 "
"-numa cpu,node-id=0,socket-id=1",
data ? (char *)data : "");
/* Can't store the compressed latency */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 1 } }"))); /* 1 ns */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1,"
" 'hierarchy': \"memory\", 'data-type': \"access-latency\","
" 'latency': 65535 } }"))); /* 65535 ns */
/* Test the 0 input (bandwidth not provided) */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0,"
" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\","
" 'bandwidth': 0 } }"))); /* 0 MB/s */
/* Fail: bandwidth should be provided before memory side cache attributes */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240,"
" 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\","
" 'line': 8 } }")));
/* Can't store the compressed bandwidth */
g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node',"
" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1,"
" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\","
" 'bandwidth': 68718428160 } }"))); /* 65535 MB/s */
/* let machine initialization complete and run */
g_assert_false(qmp_rsp_is_err(qtest_qmp(qs,
"{ 'execute': 'x-exit-preconfig' }")));
qtest_qmp_eventwait(qs, "RESUME");
qtest_quit(qs);
}
int main(int argc, char **argv)
{
const char *args = NULL;
@ -346,6 +556,9 @@ int main(int argc, char **argv)
if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) {
qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu);
qtest_add_data_func("/numa/pc/dynamic/cpu", args, pc_dynamic_cpu_cfg);
qtest_add_data_func("/numa/pc/hmat/build", args, pc_hmat_build_cfg);
qtest_add_data_func("/numa/pc/hmat/off", args, pc_hmat_off_cfg);
qtest_add_data_func("/numa/pc/hmat/erange", args, pc_hmat_erange_cfg);
}
if (!strcmp(arch, "ppc64")) {