diff --git a/MAINTAINERS b/MAINTAINERS index 4a0bc1e71b..5b335aa948 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1787,6 +1787,12 @@ F: include/sysemu/replay.h F: docs/replay.txt F: stubs/replay.c +IOVA Tree +M: Peter Xu +S: Maintained +F: include/qemu/iova-tree.h +F: util/iova-tree.c + Usermode Emulation ------------------ Overall diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index beeed0c43f..54e643d871 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -323,13 +323,15 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, &msg, 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); - do { - if (vmsg->data) { - rc = write(conn_fd, vmsg->data, vmsg->size); - } else { - rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); - } - } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + if (vmsg->size) { + do { + if (vmsg->data) { + rc = write(conn_fd, vmsg->data, vmsg->size); + } else { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); + } + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } if (rc <= 0) { vu_panic(dev, "Error while writing: %s", strerror(errno)); diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 67dac8155a..a6a132a492 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -311,6 +311,12 @@ vub_get_features(VuDev *dev) 1ull << VHOST_USER_F_PROTOCOL_FEATURES; } +static uint64_t +vub_get_protocol_features(VuDev *dev) +{ + return 1ull << VHOST_USER_PROTOCOL_F_CONFIG; +} + static int vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len) { @@ -373,6 +379,7 @@ vub_set_config(VuDev *vu_dev, const uint8_t *data, static const VuDevIface vub_iface = { .get_features = vub_get_features, .queue_set_started = vub_queue_set_started, + .get_protocol_features = vub_get_protocol_features, .get_config = vub_get_config, .set_config = vub_set_config, }; diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt index bb99a0257e..4d53e5c7d9 100644 --- a/docs/specs/pci-ids.txt +++ b/docs/specs/pci-ids.txt @@ -62,6 +62,7 @@ PCI devices (other than virtio): 1b36:000a PCI-PCI bridge (multiseat) 1b36:000b PCIe Expander Bridge (-device pxb-pcie) 1b36:000d PCI xhci usb host adapter +1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c All these devices are documented in docs/specs. diff --git a/docs/virtio-balloon-stats.txt b/docs/virtio-balloon-stats.txt index 7a66d25da5..9985e1dffc 100644 --- a/docs/virtio-balloon-stats.txt +++ b/docs/virtio-balloon-stats.txt @@ -34,6 +34,8 @@ which will return a dictionary containing: - stat-total-memory - stat-available-memory - stat-disk-caches + - stat-htlb-pgalloc + - stat-htlb-pgfail o A key named last-update, which contains the last stats update timestamp in seconds. Since this timestamp is generated by the host, diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index fb31de9416..b5a09b7908 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +260,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -722,23 +748,117 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); +/** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device + * @aw: maximum address width + * @domain: domain ID of the page walk + */ +typedef struct { + VTDAddressSpace *as; + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; + uint16_t domain_id; +} vtd_page_walk_info; + +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) +{ + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + + assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. + */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); + return hook_fn(entry, private); +} + /** * vtd_page_walk_level - walk over specific level for IOVA range * * @addr: base GPA addr to start the walk * @start: IOVA range start address * @end: IOVA range end address (start <= addr < end) - * @hook_fn: hook func to be called when detected page - * @private: private data to be passed into hook func * @read: whether parent level has read permission * @write: whether parent level has write permission - * @notify_unmap: whether we should notify invalid entries - * @aw: maximum address width + * @info: constant information for the page walk */ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, - uint64_t end, vtd_page_walk_hook hook_fn, - void *private, uint32_t level, bool read, - bool write, bool notify_unmap, uint8_t aw) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -781,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - if (vtd_is_last_slpte(slpte, level)) { + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ entry.target_as = &address_space_memory; entry.iova = iova & subpage_mask; - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - entry.addr_mask = ~subpage_mask; entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - if (!entry_valid && !notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr, - entry.addr_mask, entry.perm); - if (hook_fn) { - ret = hook_fn(&entry, private); - if (ret < 0) { - return ret; - } - } - } else { - if (!entry_valid) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap, aw); - if (ret < 0) { - return ret; - } + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -827,28 +944,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function - * @aw: maximum address width + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap, uint8_t aw) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce, aw)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce, aw)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce, aw); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap, aw); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -907,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. */ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1088,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1105,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1164,7 +1333,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1185,6 +1354,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1192,6 +1362,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1230,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + vtd_sync_shadow_page_table(vtd_as); } } static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1295,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1303,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1354,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -1411,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -2326,8 +2515,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set caching-mode=1 for intel-iommu to enable " @@ -2335,22 +2522,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } - if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2719,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2771,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2815,17 +2995,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -2857,8 +3039,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); @@ -2930,8 +3123,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3070,7 +3265,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 7dac319403..0bf1c60a06 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -26,7 +26,7 @@ #include "qapi/error.h" #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #define TYPE_KVM_CLOCK "kvmclock" #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 22d44648af..e14d06ec83 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b672..4087aca25e 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index a36a1195e4..02f9576588 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp) /* if *disabled* show SMRAM to all CPUs */ memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region", - mch->pci_address_space, 0xa0000, 0x20000); - memory_region_add_subregion_overlap(mch->system_memory, 0xa0000, + mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); + memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, &mch->smram_region, 1); memory_region_set_enabled(&mch->smram_region, true); memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000, &mch->open_high_smram, 1); memory_region_set_enabled(&mch->open_high_smram, false); @@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32); memory_region_set_enabled(&mch->smram, true); memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->low_smram, true); - memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram); + memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE, + &mch->low_smram); memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->high_smram, true); memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram); diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 1422ff03ab..07bcbe9e85 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 vhost_section(const char *name, int r) "%s:%d" +vhost_iotlb_miss(void *dev, int step) "%p step %d" # hw/virtio/vhost-user.c vhost_user_postcopy_end_entry(void) "" diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 38da8692bb..ca554d4ff1 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -852,14 +852,44 @@ static void slave_read(void *opaque) VhostUserHeader hdr = { 0, }; VhostUserPayload payload = { 0, }; int size, ret = 0; + struct iovec iov; + struct msghdr msgh; + int fd = -1; + char control[CMSG_SPACE(sizeof(fd))]; + struct cmsghdr *cmsg; + size_t fdsize; + + memset(&msgh, 0, sizeof(msgh)); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); /* Read header */ - size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE); + iov.iov_base = &hdr; + iov.iov_len = VHOST_USER_HDR_SIZE; + + size = recvmsg(u->slave_fd, &msgh, 0); if (size != VHOST_USER_HDR_SIZE) { error_report("Failed to read from slave."); goto err; } + if (msgh.msg_flags & MSG_CTRUNC) { + error_report("Truncated message."); + goto err; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fdsize = cmsg->cmsg_len - CMSG_LEN(0); + memcpy(&fd, CMSG_DATA(cmsg), fdsize); + break; + } + } + if (hdr.size > VHOST_USER_PAYLOAD_SIZE) { error_report("Failed to read msg header." " Size %d exceeds the maximum %zu.", hdr.size, @@ -883,9 +913,15 @@ static void slave_read(void *opaque) break; default: error_report("Received unexpected msg type."); + if (fd != -1) { + close(fd); + } ret = -EINVAL; } + /* Message handlers need to make sure that fd will be consumed. */ + fd = -1; + /* * REPLY_ACK feature handling. Other reply types has to be managed * directly in their request handlers. @@ -918,6 +954,9 @@ err: qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL); close(u->slave_fd); u->slave_fd = -1; + if (fd != -1) { + close(fd); + } return; } @@ -1076,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) error_setg(errp, "%s: Failed to get ufd", __func__); return -1; } - fcntl(ufd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(ufd); /* register ufd with userfault thread */ u->postcopy_fd.fd = ufd; @@ -1316,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev) static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) { - VhostUserMsg msg = { 0 }; + VhostUserMsg msg = { }; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9d5850a7d7..b08290036d 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) rcu_read_lock(); + trace_vhost_iotlb_miss(dev, 1); + iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, iova, write); if (iotlb.target_as != NULL) { ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, &uaddr, &len); if (ret) { + trace_vhost_iotlb_miss(dev, 3); error_report("Fail to lookup the translated address " "%"PRIx64, iotlb.translated_addr); goto out; @@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, iotlb.perm); if (ret) { + trace_vhost_iotlb_miss(dev, 4); error_report("Fail to update device iotlb"); goto out; } } + + trace_vhost_iotlb_miss(dev, 2); + out: rcu_read_unlock(); diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index f456cea2e7..1f7a87f094 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches", + [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc", + [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail", [VIRTIO_BALLOON_S_NR] = NULL }; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1e8ab7bbc5..5eb0c323ca 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1037,6 +1037,27 @@ assign_error: return r; } +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + static void virtio_pci_vmstate_change(DeviceState *d, bool running) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; k->vmstate_change = virtio_pci_vmstate_change; k->pre_plugged = virtio_pci_pre_plugged; k->device_plugged = virtio_pci_device_plugged; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 006d3d1148..1debb0147b 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) return &vq->host_notifier; } +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 45ec8919b6..fbfedcb1c0 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -27,6 +27,7 @@ #include "hw/i386/ioapic.h" #include "hw/pci/msi.h" #include "hw/sysbus.h" +#include "qemu/iova-tree.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ @@ -67,7 +68,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry; typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress; typedef struct VTDIrq VTDIrq; typedef struct VTD_MSIMessage VTD_MSIMessage; -typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode; /* Context-Entry */ struct VTDContextEntry { @@ -93,6 +93,10 @@ struct VTDAddressSpace { MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; + QLIST_ENTRY(VTDAddressSpace) next; + /* Superset of notifier flags that this address space has */ + IOMMUNotifierFlag notifier_flags; + IOVATree *iova_tree; /* Traces mapped IOVA ranges */ }; struct VTDBus { @@ -253,11 +257,6 @@ struct VTD_MSIMessage { /* When IR is enabled, all MSI/MSI-X data bits should be zero */ #define VTD_IR_MSI_DATA (0) -struct IntelIOMMUNotifierNode { - VTDAddressSpace *vtd_as; - QLIST_ENTRY(IntelIOMMUNotifierNode) next; -}; - /* The iommu (DMAR) device state struct */ struct IntelIOMMUState { X86IOMMUState x86_iommu; @@ -295,7 +294,7 @@ struct IntelIOMMUState { GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ /* list of registered notifiers */ - QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list; + QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ @@ -305,6 +304,12 @@ struct IntelIOMMUState { OnOffAuto intr_eim; /* Toggle for EIM cabability */ bool buggy_eim; /* Force buggy EIM unless eim=off */ uint8_t aw_bits; /* Host/IOVA address width (in bits) */ + + /* + * Protects IOMMU states in general. Currently it protects the + * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. + */ + QemuMutex iommu_lock; }; /* Find the VTD Address space associated with the given bus pointer, diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 7fd87c4e1c..74c60332e1 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -48,7 +48,7 @@ #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) -#define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_LABEL_SIZE_PROP "label-size" #define NVDIMM_UNARMED_PROP "unarmed" struct NVDIMMDevice { diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index a9c3ee5aa2..990d6fcbde 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -101,6 +101,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e +#define PCI_DEVICE_ID_REDHAT_MDPY 0x000f #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h index ced3d2d2b0..7fec9dc929 100644 --- a/include/hw/virtio/virtio-bus.h +++ b/include/hw/virtio/virtio-bus.h @@ -52,6 +52,8 @@ typedef struct VirtioBusClass { bool (*has_extra_state)(DeviceState *d); bool (*query_guest_notifiers)(DeviceState *d); int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign); + int (*set_host_notifier_mr)(DeviceState *d, int n, + MemoryRegion *mr, bool assign); void (*vmstate_change)(DeviceState *d, bool running); /* * Expose the features the transport layer supports before diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 098bdaaea3..9c1fa07d6d 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -239,6 +239,8 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align); void virtio_queue_notify(VirtIODevice *vdev, int n); uint16_t virtio_queue_vector(VirtIODevice *vdev, int n); void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector); +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign); int virtio_set_status(VirtIODevice *vdev, uint8_t val); void virtio_reset(void *opaque); void virtio_update_irq(VirtIODevice *vdev); diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h new file mode 100644 index 0000000000..b061932097 --- /dev/null +++ b/include/qemu/iova-tree.h @@ -0,0 +1,134 @@ +/* + * An very simplified iova tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ +#ifndef IOVA_TREE_H +#define IOVA_TREE_H + +/* + * Currently the iova tree will only allow to keep ranges + * information, and no extra user data is allowed for each element. A + * benefit is that we can merge adjacent ranges internally within the + * tree. It can save a lot of memory when the ranges are splitted but + * mostly continuous. + * + * Note that current implementation does not provide any thread + * protections. Callers of the iova tree should be responsible + * for the thread safety issue. + */ + +#include "qemu/osdep.h" +#include "exec/memory.h" +#include "exec/hwaddr.h" + +#define IOVA_OK (0) +#define IOVA_ERR_INVALID (-1) /* Invalid parameters */ +#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ + +typedef struct IOVATree IOVATree; +typedef struct DMAMap { + hwaddr iova; + hwaddr translated_addr; + hwaddr size; /* Inclusive */ + IOMMUAccessFlags perm; +} QEMU_PACKED DMAMap; +typedef gboolean (*iova_tree_iterator)(DMAMap *map); + +/** + * iova_tree_new: + * + * Create a new iova tree. + * + * Returns: the tree pointer when succeeded, or NULL if error. + */ +IOVATree *iova_tree_new(void); + +/** + * iova_tree_insert: + * + * @tree: the iova tree to insert + * @map: the mapping to insert + * + * Insert an iova range to the tree. If there is overlapped + * ranges, IOVA_ERR_OVERLAP will be returned. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_insert(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_remove: + * + * @tree: the iova tree to remove range from + * @map: the map range to remove + * + * Remove mappings from the tree that are covered by the map range + * provided. The range does not need to be exactly what has inserted, + * all the mappings that are included in the provided range will be + * removed from the tree. Here map->translated_addr is meaningless. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_remove(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find: + * + * @tree: the iova tree to search from + * @map: the mapping to search + * + * Search for a mapping in the iova tree that overlaps with the + * mapping range specified. Only the first found mapping will be + * returned. + * + * Return: DMAMap pointer if found, or NULL if not found. Note that + * the returned DMAMap pointer is maintained internally. User should + * only read the content but never modify or free the content. Also, + * user is responsible to make sure the pointer is valid (say, no + * concurrent deletion in progress). + */ +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find_address: + * + * @tree: the iova tree to search from + * @iova: the iova address to find + * + * Similar to iova_tree_find(), but it tries to find mapping with + * range iova=iova & size=0. + * + * Return: same as iova_tree_find(). + */ +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova); + +/** + * iova_tree_foreach: + * + * @tree: the iova tree to iterate on + * @iterator: the interator for the mappings, return true to stop + * + * Iterate over the iova tree. + * + * Return: 1 if found any overlap, 0 if not, <0 if error. + */ +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); + +/** + * iova_tree_destroy: + * + * @tree: the iova tree to destroy + * + * Destroy an existing iova tree. + * + * Return: None. + */ +void iova_tree_destroy(IOVATree *tree); + +#endif diff --git a/linux-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h similarity index 80% rename from linux-headers/asm-x86/kvm_para.h rename to include/standard-headers/asm-x86/kvm_para.h index 4c58184395..53a85ae3ed 100644 --- a/linux-headers/asm-x86/kvm_para.h +++ b/include/standard-headers/asm-x86/kvm_para.h @@ -2,16 +2,17 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H -#include -#include +#include "standard-headers/linux/types.h" /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. +/* This CPUID returns two feature bitmaps in eax, edx. Before enabling + * a particular paravirtualization, the appropriate feature bit should + * be checked in eax. The performance hint feature bit should be checked + * in edx. */ #define KVM_CPUID_FEATURES 0x40000001 #define KVM_FEATURE_CLOCKSOURCE 0 @@ -28,6 +29,8 @@ #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_HINTS_DEDICATED 0 + /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. */ @@ -45,12 +48,12 @@ #define MSR_KVM_PV_EOI_EN 0x4b564d04 struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u8 preempted; - __u8 u8_pad[3]; - __u32 pad[11]; + uint64_t steal; + uint32_t version; + uint32_t flags; + uint8_t preempted; + uint8_t uint8_t_pad[3]; + uint32_t pad[11]; }; #define KVM_VCPU_PREEMPTED (1 << 0) @@ -58,11 +61,11 @@ struct kvm_steal_time { #define KVM_CLOCK_PAIRING_WALLCLOCK 0 struct kvm_clock_pairing { - __s64 sec; - __s64 nsec; - __u64 tsc; - __u32 flags; - __u32 pad[9]; + int64_t sec; + int64_t nsec; + uint64_t tsc; + uint32_t flags; + uint32_t pad[9]; }; #define KVM_STEAL_ALIGNMENT_BITS 5 @@ -82,14 +85,14 @@ struct kvm_clock_pairing { /* Payload for KVM_HC_MMU_OP */ struct kvm_mmu_op_header { - __u32 op; - __u32 pad; + uint32_t op; + uint32_t pad; }; struct kvm_mmu_op_write_pte { struct kvm_mmu_op_header header; - __u64 pte_phys; - __u64 pte_val; + uint64_t pte_phys; + uint64_t pte_val; }; struct kvm_mmu_op_flush_tlb { @@ -98,16 +101,16 @@ struct kvm_mmu_op_flush_tlb { struct kvm_mmu_op_release_pt { struct kvm_mmu_op_header header; - __u64 pt_phys; + uint64_t pt_phys; }; #define KVM_PV_REASON_PAGE_NOT_PRESENT 1 #define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; - __u32 enabled; + uint32_t reason; + uint8_t pad[60]; + uint32_t enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 7b0a41b8fc..e446805ae9 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -53,7 +53,9 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ -#define VIRTIO_BALLOON_S_NR 8 +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ +#define VIRTIO_BALLOON_S_NR 10 /* * Memory statistics structure. diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 23669c4d5a..0b64b8e067 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -22,7 +22,6 @@ #ifdef NEED_CPU_H # ifdef CONFIG_KVM # include -# include # define CONFIG_KVM_IS_POSSIBLE # endif #else diff --git a/linux-headers/asm-arm/bitsperlong.h b/linux-headers/asm-arm/bitsperlong.h new file mode 100644 index 0000000000..6dc0bb0c13 --- /dev/null +++ b/linux-headers/asm-arm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-arm/kvm_para.h b/linux-headers/asm-arm/kvm_para.h deleted file mode 100644 index baacc4996d..0000000000 --- a/linux-headers/asm-arm/kvm_para.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#include diff --git a/linux-headers/asm-arm64/bitsperlong.h b/linux-headers/asm-arm64/bitsperlong.h new file mode 100644 index 0000000000..485d60bee2 --- /dev/null +++ b/linux-headers/asm-arm64/bitsperlong.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef __ASM_BITSPERLONG_H +#define __ASM_BITSPERLONG_H + +#define __BITS_PER_LONG 64 + +#include + +#endif /* __ASM_BITSPERLONG_H */ diff --git a/linux-headers/asm-arm64/kvm_para.h b/linux-headers/asm-arm64/kvm_para.h deleted file mode 100644 index 14fab8f0b9..0000000000 --- a/linux-headers/asm-arm64/kvm_para.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/linux-headers/asm-generic/bitsperlong.h b/linux-headers/asm-generic/bitsperlong.h new file mode 100644 index 0000000000..0aac245b6b --- /dev/null +++ b/linux-headers/asm-generic/bitsperlong.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_GENERIC_BITS_PER_LONG +#define __ASM_GENERIC_BITS_PER_LONG + +/* + * There seems to be no way of detecting this automatically from user + * space, so 64 bit architectures should override this in their + * bitsperlong.h. In particular, an architecture that supports + * both 32 and 64 bit user space must not rely on CONFIG_64BIT + * to decide it, but rather check a compiler provided macro. + */ +#ifndef __BITS_PER_LONG +#define __BITS_PER_LONG 32 +#endif + +#endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/linux-headers/asm-generic/kvm_para.h b/linux-headers/asm-generic/kvm_para.h deleted file mode 100644 index 486f0af73c..0000000000 --- a/linux-headers/asm-generic/kvm_para.h +++ /dev/null @@ -1,4 +0,0 @@ -/* - * There isn't anything here, but the file must not be empty or patch - * will delete it. - */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h new file mode 100644 index 0000000000..8bcb186c6f --- /dev/null +++ b/linux-headers/asm-generic/unistd.h @@ -0,0 +1,781 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#include + +/* + * This file contains the system call numbers, based on the + * layout of the x86-64 architecture, which embeds the + * pointer to the syscall in the table. + * + * As a basic principle, no duplication of functionality + * should be added, e.g. we don't use lseek when llseek + * is present. New architectures should use this file + * and implement the less feature-full calls in user space. + */ + +#ifndef __SYSCALL +#define __SYSCALL(x, y) +#endif + +#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT) +#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32) +#else +#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) +#endif + +#ifdef __SYSCALL_COMPAT +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp) +#else +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64) +#endif + +#define __NR_io_setup 0 +__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup) +#define __NR_io_destroy 1 +__SYSCALL(__NR_io_destroy, sys_io_destroy) +#define __NR_io_submit 2 +__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) +#define __NR_io_cancel 3 +__SYSCALL(__NR_io_cancel, sys_io_cancel) +#define __NR_io_getevents 4 +__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) + +/* fs/xattr.c */ +#define __NR_setxattr 5 +__SYSCALL(__NR_setxattr, sys_setxattr) +#define __NR_lsetxattr 6 +__SYSCALL(__NR_lsetxattr, sys_lsetxattr) +#define __NR_fsetxattr 7 +__SYSCALL(__NR_fsetxattr, sys_fsetxattr) +#define __NR_getxattr 8 +__SYSCALL(__NR_getxattr, sys_getxattr) +#define __NR_lgetxattr 9 +__SYSCALL(__NR_lgetxattr, sys_lgetxattr) +#define __NR_fgetxattr 10 +__SYSCALL(__NR_fgetxattr, sys_fgetxattr) +#define __NR_listxattr 11 +__SYSCALL(__NR_listxattr, sys_listxattr) +#define __NR_llistxattr 12 +__SYSCALL(__NR_llistxattr, sys_llistxattr) +#define __NR_flistxattr 13 +__SYSCALL(__NR_flistxattr, sys_flistxattr) +#define __NR_removexattr 14 +__SYSCALL(__NR_removexattr, sys_removexattr) +#define __NR_lremovexattr 15 +__SYSCALL(__NR_lremovexattr, sys_lremovexattr) +#define __NR_fremovexattr 16 +__SYSCALL(__NR_fremovexattr, sys_fremovexattr) + +/* fs/dcache.c */ +#define __NR_getcwd 17 +__SYSCALL(__NR_getcwd, sys_getcwd) + +/* fs/cookies.c */ +#define __NR_lookup_dcookie 18 +__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) + +/* fs/eventfd.c */ +#define __NR_eventfd2 19 +__SYSCALL(__NR_eventfd2, sys_eventfd2) + +/* fs/eventpoll.c */ +#define __NR_epoll_create1 20 +__SYSCALL(__NR_epoll_create1, sys_epoll_create1) +#define __NR_epoll_ctl 21 +__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) +#define __NR_epoll_pwait 22 +__SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait) + +/* fs/fcntl.c */ +#define __NR_dup 23 +__SYSCALL(__NR_dup, sys_dup) +#define __NR_dup3 24 +__SYSCALL(__NR_dup3, sys_dup3) +#define __NR3264_fcntl 25 +__SC_COMP_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl, compat_sys_fcntl64) + +/* fs/inotify_user.c */ +#define __NR_inotify_init1 26 +__SYSCALL(__NR_inotify_init1, sys_inotify_init1) +#define __NR_inotify_add_watch 27 +__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) +#define __NR_inotify_rm_watch 28 +__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) + +/* fs/ioctl.c */ +#define __NR_ioctl 29 +__SC_COMP(__NR_ioctl, sys_ioctl, compat_sys_ioctl) + +/* fs/ioprio.c */ +#define __NR_ioprio_set 30 +__SYSCALL(__NR_ioprio_set, sys_ioprio_set) +#define __NR_ioprio_get 31 +__SYSCALL(__NR_ioprio_get, sys_ioprio_get) + +/* fs/locks.c */ +#define __NR_flock 32 +__SYSCALL(__NR_flock, sys_flock) + +/* fs/namei.c */ +#define __NR_mknodat 33 +__SYSCALL(__NR_mknodat, sys_mknodat) +#define __NR_mkdirat 34 +__SYSCALL(__NR_mkdirat, sys_mkdirat) +#define __NR_unlinkat 35 +__SYSCALL(__NR_unlinkat, sys_unlinkat) +#define __NR_symlinkat 36 +__SYSCALL(__NR_symlinkat, sys_symlinkat) +#define __NR_linkat 37 +__SYSCALL(__NR_linkat, sys_linkat) +#ifdef __ARCH_WANT_RENAMEAT +/* renameat is superseded with flags by renameat2 */ +#define __NR_renameat 38 +__SYSCALL(__NR_renameat, sys_renameat) +#endif /* __ARCH_WANT_RENAMEAT */ + +/* fs/namespace.c */ +#define __NR_umount2 39 +__SYSCALL(__NR_umount2, sys_umount) +#define __NR_mount 40 +__SC_COMP(__NR_mount, sys_mount, compat_sys_mount) +#define __NR_pivot_root 41 +__SYSCALL(__NR_pivot_root, sys_pivot_root) + +/* fs/nfsctl.c */ +#define __NR_nfsservctl 42 +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) + +/* fs/open.c */ +#define __NR3264_statfs 43 +__SC_COMP_3264(__NR3264_statfs, sys_statfs64, sys_statfs, \ + compat_sys_statfs64) +#define __NR3264_fstatfs 44 +__SC_COMP_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs, \ + compat_sys_fstatfs64) +#define __NR3264_truncate 45 +__SC_COMP_3264(__NR3264_truncate, sys_truncate64, sys_truncate, \ + compat_sys_truncate64) +#define __NR3264_ftruncate 46 +__SC_COMP_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate, \ + compat_sys_ftruncate64) + +#define __NR_fallocate 47 +__SC_COMP(__NR_fallocate, sys_fallocate, compat_sys_fallocate) +#define __NR_faccessat 48 +__SYSCALL(__NR_faccessat, sys_faccessat) +#define __NR_chdir 49 +__SYSCALL(__NR_chdir, sys_chdir) +#define __NR_fchdir 50 +__SYSCALL(__NR_fchdir, sys_fchdir) +#define __NR_chroot 51 +__SYSCALL(__NR_chroot, sys_chroot) +#define __NR_fchmod 52 +__SYSCALL(__NR_fchmod, sys_fchmod) +#define __NR_fchmodat 53 +__SYSCALL(__NR_fchmodat, sys_fchmodat) +#define __NR_fchownat 54 +__SYSCALL(__NR_fchownat, sys_fchownat) +#define __NR_fchown 55 +__SYSCALL(__NR_fchown, sys_fchown) +#define __NR_openat 56 +__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) +#define __NR_close 57 +__SYSCALL(__NR_close, sys_close) +#define __NR_vhangup 58 +__SYSCALL(__NR_vhangup, sys_vhangup) + +/* fs/pipe.c */ +#define __NR_pipe2 59 +__SYSCALL(__NR_pipe2, sys_pipe2) + +/* fs/quota.c */ +#define __NR_quotactl 60 +__SYSCALL(__NR_quotactl, sys_quotactl) + +/* fs/readdir.c */ +#define __NR_getdents64 61 +__SYSCALL(__NR_getdents64, sys_getdents64) + +/* fs/read_write.c */ +#define __NR3264_lseek 62 +__SC_3264(__NR3264_lseek, sys_llseek, sys_lseek) +#define __NR_read 63 +__SYSCALL(__NR_read, sys_read) +#define __NR_write 64 +__SYSCALL(__NR_write, sys_write) +#define __NR_readv 65 +__SC_COMP(__NR_readv, sys_readv, compat_sys_readv) +#define __NR_writev 66 +__SC_COMP(__NR_writev, sys_writev, compat_sys_writev) +#define __NR_pread64 67 +__SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64) +#define __NR_pwrite64 68 +__SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64) +#define __NR_preadv 69 +__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv) +#define __NR_pwritev 70 +__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) + +/* fs/sendfile.c */ +#define __NR3264_sendfile 71 +__SYSCALL(__NR3264_sendfile, sys_sendfile64) + +/* fs/select.c */ +#define __NR_pselect6 72 +__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) +#define __NR_ppoll 73 +__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) + +/* fs/signalfd.c */ +#define __NR_signalfd4 74 +__SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4) + +/* fs/splice.c */ +#define __NR_vmsplice 75 +__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice) +#define __NR_splice 76 +__SYSCALL(__NR_splice, sys_splice) +#define __NR_tee 77 +__SYSCALL(__NR_tee, sys_tee) + +/* fs/stat.c */ +#define __NR_readlinkat 78 +__SYSCALL(__NR_readlinkat, sys_readlinkat) +#define __NR3264_fstatat 79 +__SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat) +#define __NR3264_fstat 80 +__SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat) + +/* fs/sync.c */ +#define __NR_sync 81 +__SYSCALL(__NR_sync, sys_sync) +#define __NR_fsync 82 +__SYSCALL(__NR_fsync, sys_fsync) +#define __NR_fdatasync 83 +__SYSCALL(__NR_fdatasync, sys_fdatasync) +#ifdef __ARCH_WANT_SYNC_FILE_RANGE2 +#define __NR_sync_file_range2 84 +__SC_COMP(__NR_sync_file_range2, sys_sync_file_range2, \ + compat_sys_sync_file_range2) +#else +#define __NR_sync_file_range 84 +__SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ + compat_sys_sync_file_range) +#endif + +/* fs/timerfd.c */ +#define __NR_timerfd_create 85 +__SYSCALL(__NR_timerfd_create, sys_timerfd_create) +#define __NR_timerfd_settime 86 +__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ + compat_sys_timerfd_settime) +#define __NR_timerfd_gettime 87 +__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ + compat_sys_timerfd_gettime) + +/* fs/utimes.c */ +#define __NR_utimensat 88 +__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) + +/* kernel/acct.c */ +#define __NR_acct 89 +__SYSCALL(__NR_acct, sys_acct) + +/* kernel/capability.c */ +#define __NR_capget 90 +__SYSCALL(__NR_capget, sys_capget) +#define __NR_capset 91 +__SYSCALL(__NR_capset, sys_capset) + +/* kernel/exec_domain.c */ +#define __NR_personality 92 +__SYSCALL(__NR_personality, sys_personality) + +/* kernel/exit.c */ +#define __NR_exit 93 +__SYSCALL(__NR_exit, sys_exit) +#define __NR_exit_group 94 +__SYSCALL(__NR_exit_group, sys_exit_group) +#define __NR_waitid 95 +__SC_COMP(__NR_waitid, sys_waitid, compat_sys_waitid) + +/* kernel/fork.c */ +#define __NR_set_tid_address 96 +__SYSCALL(__NR_set_tid_address, sys_set_tid_address) +#define __NR_unshare 97 +__SYSCALL(__NR_unshare, sys_unshare) + +/* kernel/futex.c */ +#define __NR_futex 98 +__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) +#define __NR_set_robust_list 99 +__SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ + compat_sys_set_robust_list) +#define __NR_get_robust_list 100 +__SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ + compat_sys_get_robust_list) + +/* kernel/hrtimer.c */ +#define __NR_nanosleep 101 +__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) + +/* kernel/itimer.c */ +#define __NR_getitimer 102 +__SC_COMP(__NR_getitimer, sys_getitimer, compat_sys_getitimer) +#define __NR_setitimer 103 +__SC_COMP(__NR_setitimer, sys_setitimer, compat_sys_setitimer) + +/* kernel/kexec.c */ +#define __NR_kexec_load 104 +__SC_COMP(__NR_kexec_load, sys_kexec_load, compat_sys_kexec_load) + +/* kernel/module.c */ +#define __NR_init_module 105 +__SYSCALL(__NR_init_module, sys_init_module) +#define __NR_delete_module 106 +__SYSCALL(__NR_delete_module, sys_delete_module) + +/* kernel/posix-timers.c */ +#define __NR_timer_create 107 +__SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) +#define __NR_timer_gettime 108 +__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) +#define __NR_timer_getoverrun 109 +__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) +#define __NR_timer_settime 110 +__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) +#define __NR_timer_delete 111 +__SYSCALL(__NR_timer_delete, sys_timer_delete) +#define __NR_clock_settime 112 +__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) +#define __NR_clock_gettime 113 +__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) +#define __NR_clock_getres 114 +__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) +#define __NR_clock_nanosleep 115 +__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ + compat_sys_clock_nanosleep) + +/* kernel/printk.c */ +#define __NR_syslog 116 +__SYSCALL(__NR_syslog, sys_syslog) + +/* kernel/ptrace.c */ +#define __NR_ptrace 117 +__SYSCALL(__NR_ptrace, sys_ptrace) + +/* kernel/sched/core.c */ +#define __NR_sched_setparam 118 +__SYSCALL(__NR_sched_setparam, sys_sched_setparam) +#define __NR_sched_setscheduler 119 +__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) +#define __NR_sched_getscheduler 120 +__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) +#define __NR_sched_getparam 121 +__SYSCALL(__NR_sched_getparam, sys_sched_getparam) +#define __NR_sched_setaffinity 122 +__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \ + compat_sys_sched_setaffinity) +#define __NR_sched_getaffinity 123 +__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \ + compat_sys_sched_getaffinity) +#define __NR_sched_yield 124 +__SYSCALL(__NR_sched_yield, sys_sched_yield) +#define __NR_sched_get_priority_max 125 +__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) +#define __NR_sched_get_priority_min 126 +__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) +#define __NR_sched_rr_get_interval 127 +__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ + compat_sys_sched_rr_get_interval) + +/* kernel/signal.c */ +#define __NR_restart_syscall 128 +__SYSCALL(__NR_restart_syscall, sys_restart_syscall) +#define __NR_kill 129 +__SYSCALL(__NR_kill, sys_kill) +#define __NR_tkill 130 +__SYSCALL(__NR_tkill, sys_tkill) +#define __NR_tgkill 131 +__SYSCALL(__NR_tgkill, sys_tgkill) +#define __NR_sigaltstack 132 +__SC_COMP(__NR_sigaltstack, sys_sigaltstack, compat_sys_sigaltstack) +#define __NR_rt_sigsuspend 133 +__SC_COMP(__NR_rt_sigsuspend, sys_rt_sigsuspend, compat_sys_rt_sigsuspend) +#define __NR_rt_sigaction 134 +__SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) +#define __NR_rt_sigprocmask 135 +__SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) +#define __NR_rt_sigpending 136 +__SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) +#define __NR_rt_sigtimedwait 137 +__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ + compat_sys_rt_sigtimedwait) +#define __NR_rt_sigqueueinfo 138 +__SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ + compat_sys_rt_sigqueueinfo) +#define __NR_rt_sigreturn 139 +__SC_COMP(__NR_rt_sigreturn, sys_rt_sigreturn, compat_sys_rt_sigreturn) + +/* kernel/sys.c */ +#define __NR_setpriority 140 +__SYSCALL(__NR_setpriority, sys_setpriority) +#define __NR_getpriority 141 +__SYSCALL(__NR_getpriority, sys_getpriority) +#define __NR_reboot 142 +__SYSCALL(__NR_reboot, sys_reboot) +#define __NR_setregid 143 +__SYSCALL(__NR_setregid, sys_setregid) +#define __NR_setgid 144 +__SYSCALL(__NR_setgid, sys_setgid) +#define __NR_setreuid 145 +__SYSCALL(__NR_setreuid, sys_setreuid) +#define __NR_setuid 146 +__SYSCALL(__NR_setuid, sys_setuid) +#define __NR_setresuid 147 +__SYSCALL(__NR_setresuid, sys_setresuid) +#define __NR_getresuid 148 +__SYSCALL(__NR_getresuid, sys_getresuid) +#define __NR_setresgid 149 +__SYSCALL(__NR_setresgid, sys_setresgid) +#define __NR_getresgid 150 +__SYSCALL(__NR_getresgid, sys_getresgid) +#define __NR_setfsuid 151 +__SYSCALL(__NR_setfsuid, sys_setfsuid) +#define __NR_setfsgid 152 +__SYSCALL(__NR_setfsgid, sys_setfsgid) +#define __NR_times 153 +__SC_COMP(__NR_times, sys_times, compat_sys_times) +#define __NR_setpgid 154 +__SYSCALL(__NR_setpgid, sys_setpgid) +#define __NR_getpgid 155 +__SYSCALL(__NR_getpgid, sys_getpgid) +#define __NR_getsid 156 +__SYSCALL(__NR_getsid, sys_getsid) +#define __NR_setsid 157 +__SYSCALL(__NR_setsid, sys_setsid) +#define __NR_getgroups 158 +__SYSCALL(__NR_getgroups, sys_getgroups) +#define __NR_setgroups 159 +__SYSCALL(__NR_setgroups, sys_setgroups) +#define __NR_uname 160 +__SYSCALL(__NR_uname, sys_newuname) +#define __NR_sethostname 161 +__SYSCALL(__NR_sethostname, sys_sethostname) +#define __NR_setdomainname 162 +__SYSCALL(__NR_setdomainname, sys_setdomainname) +#define __NR_getrlimit 163 +__SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) +#define __NR_setrlimit 164 +__SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) +#define __NR_getrusage 165 +__SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) +#define __NR_umask 166 +__SYSCALL(__NR_umask, sys_umask) +#define __NR_prctl 167 +__SYSCALL(__NR_prctl, sys_prctl) +#define __NR_getcpu 168 +__SYSCALL(__NR_getcpu, sys_getcpu) + +/* kernel/time.c */ +#define __NR_gettimeofday 169 +__SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) +#define __NR_settimeofday 170 +__SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) +#define __NR_adjtimex 171 +__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) + +/* kernel/timer.c */ +#define __NR_getpid 172 +__SYSCALL(__NR_getpid, sys_getpid) +#define __NR_getppid 173 +__SYSCALL(__NR_getppid, sys_getppid) +#define __NR_getuid 174 +__SYSCALL(__NR_getuid, sys_getuid) +#define __NR_geteuid 175 +__SYSCALL(__NR_geteuid, sys_geteuid) +#define __NR_getgid 176 +__SYSCALL(__NR_getgid, sys_getgid) +#define __NR_getegid 177 +__SYSCALL(__NR_getegid, sys_getegid) +#define __NR_gettid 178 +__SYSCALL(__NR_gettid, sys_gettid) +#define __NR_sysinfo 179 +__SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) + +/* ipc/mqueue.c */ +#define __NR_mq_open 180 +__SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) +#define __NR_mq_unlink 181 +__SYSCALL(__NR_mq_unlink, sys_mq_unlink) +#define __NR_mq_timedsend 182 +__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) +#define __NR_mq_timedreceive 183 +__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ + compat_sys_mq_timedreceive) +#define __NR_mq_notify 184 +__SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) +#define __NR_mq_getsetattr 185 +__SC_COMP(__NR_mq_getsetattr, sys_mq_getsetattr, compat_sys_mq_getsetattr) + +/* ipc/msg.c */ +#define __NR_msgget 186 +__SYSCALL(__NR_msgget, sys_msgget) +#define __NR_msgctl 187 +__SC_COMP(__NR_msgctl, sys_msgctl, compat_sys_msgctl) +#define __NR_msgrcv 188 +__SC_COMP(__NR_msgrcv, sys_msgrcv, compat_sys_msgrcv) +#define __NR_msgsnd 189 +__SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) + +/* ipc/sem.c */ +#define __NR_semget 190 +__SYSCALL(__NR_semget, sys_semget) +#define __NR_semctl 191 +__SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) +#define __NR_semtimedop 192 +__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) +#define __NR_semop 193 +__SYSCALL(__NR_semop, sys_semop) + +/* ipc/shm.c */ +#define __NR_shmget 194 +__SYSCALL(__NR_shmget, sys_shmget) +#define __NR_shmctl 195 +__SC_COMP(__NR_shmctl, sys_shmctl, compat_sys_shmctl) +#define __NR_shmat 196 +__SC_COMP(__NR_shmat, sys_shmat, compat_sys_shmat) +#define __NR_shmdt 197 +__SYSCALL(__NR_shmdt, sys_shmdt) + +/* net/socket.c */ +#define __NR_socket 198 +__SYSCALL(__NR_socket, sys_socket) +#define __NR_socketpair 199 +__SYSCALL(__NR_socketpair, sys_socketpair) +#define __NR_bind 200 +__SYSCALL(__NR_bind, sys_bind) +#define __NR_listen 201 +__SYSCALL(__NR_listen, sys_listen) +#define __NR_accept 202 +__SYSCALL(__NR_accept, sys_accept) +#define __NR_connect 203 +__SYSCALL(__NR_connect, sys_connect) +#define __NR_getsockname 204 +__SYSCALL(__NR_getsockname, sys_getsockname) +#define __NR_getpeername 205 +__SYSCALL(__NR_getpeername, sys_getpeername) +#define __NR_sendto 206 +__SYSCALL(__NR_sendto, sys_sendto) +#define __NR_recvfrom 207 +__SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) +#define __NR_setsockopt 208 +__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +#define __NR_getsockopt 209 +__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +#define __NR_shutdown 210 +__SYSCALL(__NR_shutdown, sys_shutdown) +#define __NR_sendmsg 211 +__SC_COMP(__NR_sendmsg, sys_sendmsg, compat_sys_sendmsg) +#define __NR_recvmsg 212 +__SC_COMP(__NR_recvmsg, sys_recvmsg, compat_sys_recvmsg) + +/* mm/filemap.c */ +#define __NR_readahead 213 +__SC_COMP(__NR_readahead, sys_readahead, compat_sys_readahead) + +/* mm/nommu.c, also with MMU */ +#define __NR_brk 214 +__SYSCALL(__NR_brk, sys_brk) +#define __NR_munmap 215 +__SYSCALL(__NR_munmap, sys_munmap) +#define __NR_mremap 216 +__SYSCALL(__NR_mremap, sys_mremap) + +/* security/keys/keyctl.c */ +#define __NR_add_key 217 +__SYSCALL(__NR_add_key, sys_add_key) +#define __NR_request_key 218 +__SYSCALL(__NR_request_key, sys_request_key) +#define __NR_keyctl 219 +__SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl) + +/* arch/example/kernel/sys_example.c */ +#define __NR_clone 220 +__SYSCALL(__NR_clone, sys_clone) +#define __NR_execve 221 +__SC_COMP(__NR_execve, sys_execve, compat_sys_execve) + +#define __NR3264_mmap 222 +__SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) +/* mm/fadvise.c */ +#define __NR3264_fadvise64 223 +__SC_COMP(__NR3264_fadvise64, sys_fadvise64_64, compat_sys_fadvise64_64) + +/* mm/, CONFIG_MMU only */ +#ifndef __ARCH_NOMMU +#define __NR_swapon 224 +__SYSCALL(__NR_swapon, sys_swapon) +#define __NR_swapoff 225 +__SYSCALL(__NR_swapoff, sys_swapoff) +#define __NR_mprotect 226 +__SYSCALL(__NR_mprotect, sys_mprotect) +#define __NR_msync 227 +__SYSCALL(__NR_msync, sys_msync) +#define __NR_mlock 228 +__SYSCALL(__NR_mlock, sys_mlock) +#define __NR_munlock 229 +__SYSCALL(__NR_munlock, sys_munlock) +#define __NR_mlockall 230 +__SYSCALL(__NR_mlockall, sys_mlockall) +#define __NR_munlockall 231 +__SYSCALL(__NR_munlockall, sys_munlockall) +#define __NR_mincore 232 +__SYSCALL(__NR_mincore, sys_mincore) +#define __NR_madvise 233 +__SYSCALL(__NR_madvise, sys_madvise) +#define __NR_remap_file_pages 234 +__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) +#define __NR_mbind 235 +__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +#define __NR_get_mempolicy 236 +__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +#define __NR_set_mempolicy 237 +__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +#define __NR_migrate_pages 238 +__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +#define __NR_move_pages 239 +__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +#endif + +#define __NR_rt_tgsigqueueinfo 240 +__SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ + compat_sys_rt_tgsigqueueinfo) +#define __NR_perf_event_open 241 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_accept4 242 +__SYSCALL(__NR_accept4, sys_accept4) +#define __NR_recvmmsg 243 +__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) + +/* + * Architectures may provide up to 16 syscalls of their own + * starting with this value. + */ +#define __NR_arch_specific_syscall 244 + +#define __NR_wait4 260 +__SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) +#define __NR_prlimit64 261 +__SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_fanotify_init 262 +__SYSCALL(__NR_fanotify_init, sys_fanotify_init) +#define __NR_fanotify_mark 263 +__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) +#define __NR_name_to_handle_at 264 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 265 +__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ + compat_sys_open_by_handle_at) +#define __NR_clock_adjtime 266 +__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) +#define __NR_syncfs 267 +__SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_setns 268 +__SYSCALL(__NR_setns, sys_setns) +#define __NR_sendmmsg 269 +__SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg) +#define __NR_process_vm_readv 270 +__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \ + compat_sys_process_vm_readv) +#define __NR_process_vm_writev 271 +__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ + compat_sys_process_vm_writev) +#define __NR_kcmp 272 +__SYSCALL(__NR_kcmp, sys_kcmp) +#define __NR_finit_module 273 +__SYSCALL(__NR_finit_module, sys_finit_module) +#define __NR_sched_setattr 274 +__SYSCALL(__NR_sched_setattr, sys_sched_setattr) +#define __NR_sched_getattr 275 +__SYSCALL(__NR_sched_getattr, sys_sched_getattr) +#define __NR_renameat2 276 +__SYSCALL(__NR_renameat2, sys_renameat2) +#define __NR_seccomp 277 +__SYSCALL(__NR_seccomp, sys_seccomp) +#define __NR_getrandom 278 +__SYSCALL(__NR_getrandom, sys_getrandom) +#define __NR_memfd_create 279 +__SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) +#define __NR_execveat 281 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) +#define __NR_userfaultfd 282 +__SYSCALL(__NR_userfaultfd, sys_userfaultfd) +#define __NR_membarrier 283 +__SYSCALL(__NR_membarrier, sys_membarrier) +#define __NR_mlock2 284 +__SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_copy_file_range 285 +__SYSCALL(__NR_copy_file_range, sys_copy_file_range) +#define __NR_preadv2 286 +__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2) +#define __NR_pwritev2 287 +__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2) +#define __NR_pkey_mprotect 288 +__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect) +#define __NR_pkey_alloc 289 +__SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) +#define __NR_pkey_free 290 +__SYSCALL(__NR_pkey_free, sys_pkey_free) +#define __NR_statx 291 +__SYSCALL(__NR_statx, sys_statx) + +#undef __NR_syscalls +#define __NR_syscalls 292 + +/* + * 32 bit systems traditionally used different + * syscalls for off_t and loff_t arguments, while + * 64 bit systems only need the off_t version. + * For new 32 bit platforms, there is no need to + * implement the old 32 bit off_t syscalls, so + * they take different names. + * Here we map the numbers so that both versions + * use the same syscall table layout. + */ +#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT) +#define __NR_fcntl __NR3264_fcntl +#define __NR_statfs __NR3264_statfs +#define __NR_fstatfs __NR3264_fstatfs +#define __NR_truncate __NR3264_truncate +#define __NR_ftruncate __NR3264_ftruncate +#define __NR_lseek __NR3264_lseek +#define __NR_sendfile __NR3264_sendfile +#define __NR_newfstatat __NR3264_fstatat +#define __NR_fstat __NR3264_fstat +#define __NR_mmap __NR3264_mmap +#define __NR_fadvise64 __NR3264_fadvise64 +#ifdef __NR3264_stat +#define __NR_stat __NR3264_stat +#define __NR_lstat __NR3264_lstat +#endif +#else +#define __NR_fcntl64 __NR3264_fcntl +#define __NR_statfs64 __NR3264_statfs +#define __NR_fstatfs64 __NR3264_fstatfs +#define __NR_truncate64 __NR3264_truncate +#define __NR_ftruncate64 __NR3264_ftruncate +#define __NR_llseek __NR3264_lseek +#define __NR_sendfile64 __NR3264_sendfile +#define __NR_fstatat64 __NR3264_fstatat +#define __NR_fstat64 __NR3264_fstat +#define __NR_mmap2 __NR3264_mmap +#define __NR_fadvise64_64 __NR3264_fadvise64 +#ifdef __NR3264_stat +#define __NR_stat64 __NR3264_stat +#define __NR_lstat64 __NR3264_lstat +#endif +#endif diff --git a/linux-headers/asm-mips/bitsperlong.h b/linux-headers/asm-mips/bitsperlong.h new file mode 100644 index 0000000000..7268380d8d --- /dev/null +++ b/linux-headers/asm-mips/bitsperlong.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_MIPS_BITSPERLONG_H +#define __ASM_MIPS_BITSPERLONG_H + +#define __BITS_PER_LONG _MIPS_SZLONG + +#include + +#endif /* __ASM_MIPS_BITSPERLONG_H */ diff --git a/linux-headers/asm-mips/kvm.h b/linux-headers/asm-mips/kvm.h index 6985eb59b0..edcf717c43 100644 --- a/linux-headers/asm-mips/kvm.h +++ b/linux-headers/asm-mips/kvm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -19,6 +20,10 @@ * Some parts derived from the x86 version of this file. */ +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * @@ -52,9 +57,14 @@ struct kvm_fpu { * Register set = 0: GP registers from kvm_regs (see definitions below). * * Register set = 1: CP0 registers. - * bits[15..8] - Must be zero. - * bits[7..3] - Register 'rd' index. - * bits[2..0] - Register 'sel' index. + * bits[15..8] - COP0 register set. + * + * COP0 register set = 0: Main CP0 registers. + * bits[7..3] - Register 'rd' index. + * bits[2..0] - Register 'sel' index. + * + * COP0 register set = 1: MAARs. + * bits[7..0] - MAAR index. * * Register set = 2: KVM specific registers (see definitions below). * @@ -112,6 +122,15 @@ struct kvm_fpu { #define KVM_REG_MIPS_PC (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34) +/* + * KVM_REG_MIPS_CP0 - Coprocessor 0 registers. + */ + +#define KVM_REG_MIPS_MAAR (KVM_REG_MIPS_CP0 | (1 << 8)) +#define KVM_REG_MIPS_CP0_MAAR(n) (KVM_REG_MIPS_MAAR | \ + KVM_REG_SIZE_U64 | (n)) + + /* * KVM_REG_MIPS_KVM - KVM specific control registers. */ diff --git a/linux-headers/asm-mips/kvm_para.h b/linux-headers/asm-mips/kvm_para.h deleted file mode 100644 index dbb2464f3b..0000000000 --- a/linux-headers/asm-mips/kvm_para.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef _ASM_MIPS_KVM_PARA_H -#define _ASM_MIPS_KVM_PARA_H - - -#endif /* _ASM_MIPS_KVM_PARA_H */ diff --git a/linux-headers/asm-mips/sgidefs.h b/linux-headers/asm-mips/sgidefs.h new file mode 100644 index 0000000000..26143e3b7c --- /dev/null +++ b/linux-headers/asm-mips/sgidefs.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1996, 1999, 2001 Ralf Baechle + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) 2001 MIPS Technologies, Inc. + */ +#ifndef __ASM_SGIDEFS_H +#define __ASM_SGIDEFS_H + +/* + * Using a Linux compiler for building Linux seems logic but not to + * everybody. + */ +#ifndef __linux__ +#error Use a Linux compiler or give up. +#endif + +/* + * Definitions for the ISA levels + * + * With the introduction of MIPS32 / MIPS64 instruction sets definitions + * MIPS ISAs are no longer subsets of each other. Therefore comparisons + * on these symbols except with == may result in unexpected results and + * are forbidden! + */ +#define _MIPS_ISA_MIPS1 1 +#define _MIPS_ISA_MIPS2 2 +#define _MIPS_ISA_MIPS3 3 +#define _MIPS_ISA_MIPS4 4 +#define _MIPS_ISA_MIPS5 5 +#define _MIPS_ISA_MIPS32 6 +#define _MIPS_ISA_MIPS64 7 + +/* + * Subprogram calling convention + */ +#define _MIPS_SIM_ABI32 1 +#define _MIPS_SIM_NABI32 2 +#define _MIPS_SIM_ABI64 3 + +#endif /* __ASM_SGIDEFS_H */ diff --git a/linux-headers/asm-mips/unistd.h b/linux-headers/asm-mips/unistd.h index 2a2020938e..9bfef7f764 100644 --- a/linux-headers/asm-mips/unistd.h +++ b/linux-headers/asm-mips/unistd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -377,16 +378,27 @@ #define __NR_memfd_create (__NR_Linux + 354) #define __NR_bpf (__NR_Linux + 355) #define __NR_execveat (__NR_Linux + 356) +#define __NR_userfaultfd (__NR_Linux + 357) +#define __NR_membarrier (__NR_Linux + 358) +#define __NR_mlock2 (__NR_Linux + 359) +#define __NR_copy_file_range (__NR_Linux + 360) +#define __NR_preadv2 (__NR_Linux + 361) +#define __NR_pwritev2 (__NR_Linux + 362) +#define __NR_pkey_mprotect (__NR_Linux + 363) +#define __NR_pkey_alloc (__NR_Linux + 364) +#define __NR_pkey_free (__NR_Linux + 365) +#define __NR_statx (__NR_Linux + 366) + /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 356 +#define __NR_Linux_syscalls 366 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 356 +#define __NR_O32_Linux_syscalls 366 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -711,16 +723,26 @@ #define __NR_memfd_create (__NR_Linux + 314) #define __NR_bpf (__NR_Linux + 315) #define __NR_execveat (__NR_Linux + 316) +#define __NR_userfaultfd (__NR_Linux + 317) +#define __NR_membarrier (__NR_Linux + 318) +#define __NR_mlock2 (__NR_Linux + 319) +#define __NR_copy_file_range (__NR_Linux + 320) +#define __NR_preadv2 (__NR_Linux + 321) +#define __NR_pwritev2 (__NR_Linux + 322) +#define __NR_pkey_mprotect (__NR_Linux + 323) +#define __NR_pkey_alloc (__NR_Linux + 324) +#define __NR_pkey_free (__NR_Linux + 325) +#define __NR_statx (__NR_Linux + 326) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 316 +#define __NR_Linux_syscalls 326 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 316 +#define __NR_64_Linux_syscalls 326 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -1049,15 +1071,25 @@ #define __NR_memfd_create (__NR_Linux + 318) #define __NR_bpf (__NR_Linux + 319) #define __NR_execveat (__NR_Linux + 320) +#define __NR_userfaultfd (__NR_Linux + 321) +#define __NR_membarrier (__NR_Linux + 322) +#define __NR_mlock2 (__NR_Linux + 323) +#define __NR_copy_file_range (__NR_Linux + 324) +#define __NR_preadv2 (__NR_Linux + 325) +#define __NR_pwritev2 (__NR_Linux + 326) +#define __NR_pkey_mprotect (__NR_Linux + 327) +#define __NR_pkey_alloc (__NR_Linux + 328) +#define __NR_pkey_free (__NR_Linux + 329) +#define __NR_statx (__NR_Linux + 330) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 320 +#define __NR_Linux_syscalls 330 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 320 +#define __NR_N32_Linux_syscalls 330 #endif /* _ASM_UNISTD_H */ diff --git a/linux-headers/asm-powerpc/bitsperlong.h b/linux-headers/asm-powerpc/bitsperlong.h new file mode 100644 index 0000000000..46ece3ecff --- /dev/null +++ b/linux-headers/asm-powerpc/bitsperlong.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_POWERPC_BITSPERLONG_H +#define __ASM_POWERPC_BITSPERLONG_H + +#if defined(__powerpc64__) +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include + +#endif /* __ASM_POWERPC_BITSPERLONG_H */ diff --git a/linux-headers/asm-powerpc/epapr_hcalls.h b/linux-headers/asm-powerpc/epapr_hcalls.h deleted file mode 100644 index 6cca559993..0000000000 --- a/linux-headers/asm-powerpc/epapr_hcalls.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ -/* - * ePAPR hcall interface - * - * Copyright 2008-2011 Freescale Semiconductor, Inc. - * - * Author: Timur Tabi - * - * This file is provided under a dual BSD/GPL license. When using or - * redistributing this file, you may do so under either license. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Freescale Semiconductor nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * - * ALTERNATIVELY, this software may be distributed under the terms of the - * GNU General Public License ("GPL") as published by the Free Software - * Foundation, either version 2 of that License or (at your option) any - * later version. - * - * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _ASM_POWERPC_EPAPR_HCALLS_H -#define _ASM_POWERPC_EPAPR_HCALLS_H - -#define EV_BYTE_CHANNEL_SEND 1 -#define EV_BYTE_CHANNEL_RECEIVE 2 -#define EV_BYTE_CHANNEL_POLL 3 -#define EV_INT_SET_CONFIG 4 -#define EV_INT_GET_CONFIG 5 -#define EV_INT_SET_MASK 6 -#define EV_INT_GET_MASK 7 -#define EV_INT_IACK 9 -#define EV_INT_EOI 10 -#define EV_INT_SEND_IPI 11 -#define EV_INT_SET_TASK_PRIORITY 12 -#define EV_INT_GET_TASK_PRIORITY 13 -#define EV_DOORBELL_SEND 14 -#define EV_MSGSND 15 -#define EV_IDLE 16 - -/* vendor ID: epapr */ -#define EV_LOCAL_VENDOR_ID 0 /* for private use */ -#define EV_EPAPR_VENDOR_ID 1 -#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ -#define EV_IBM_VENDOR_ID 3 /* IBM */ -#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ -#define EV_ENEA_VENDOR_ID 5 /* Enea */ -#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ -#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ -#define EV_KVM_VENDOR_ID 42 /* KVM */ - -/* The max number of bytes that a byte channel can send or receive per call */ -#define EV_BYTE_CHANNEL_MAX_BYTES 16 - - -#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) -#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) - -/* epapr return codes */ -#define EV_SUCCESS 0 -#define EV_EPERM 1 /* Operation not permitted */ -#define EV_ENOENT 2 /* Entry Not Found */ -#define EV_EIO 3 /* I/O error occurred */ -#define EV_EAGAIN 4 /* The operation had insufficient - * resources to complete and should be - * retried - */ -#define EV_ENOMEM 5 /* There was insufficient memory to - * complete the operation */ -#define EV_EFAULT 6 /* Bad guest address */ -#define EV_ENODEV 7 /* No such device */ -#define EV_EINVAL 8 /* An argument supplied to the hcall - was out of range or invalid */ -#define EV_INTERNAL 9 /* An internal error occurred */ -#define EV_CONFIG 10 /* A configuration error was detected */ -#define EV_INVALID_STATE 11 /* The object is in an invalid state */ -#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ -#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ - -#endif /* _ASM_POWERPC_EPAPR_HCALLS_H */ diff --git a/linux-headers/asm-powerpc/kvm_para.h b/linux-headers/asm-powerpc/kvm_para.h deleted file mode 100644 index 9beb49cc10..0000000000 --- a/linux-headers/asm-powerpc/kvm_para.h +++ /dev/null @@ -1,98 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - * Copyright IBM Corp. 2008 - * - * Authors: Hollis Blanchard - */ - -#ifndef __POWERPC_KVM_PARA_H__ -#define __POWERPC_KVM_PARA_H__ - -#include - -/* - * Additions to this struct must only occur at the end, and should be - * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present - * (albeit not necessarily relevant to the current target hardware platform). - * - * Struct fields are always 32 or 64 bit aligned, depending on them being 32 - * or 64 bit wide respectively. - * - * See Documentation/virtual/kvm/ppc-pv.txt - */ -struct kvm_vcpu_arch_shared { - __u64 scratch1; - __u64 scratch2; - __u64 scratch3; - __u64 critical; /* Guest may not get interrupts if == r1 */ - __u64 sprg0; - __u64 sprg1; - __u64 sprg2; - __u64 sprg3; - __u64 srr0; - __u64 srr1; - __u64 dar; /* dear on BookE */ - __u64 msr; - __u32 dsisr; - __u32 int_pending; /* Tells the guest if we have an interrupt */ - __u32 sr[16]; - __u32 mas0; - __u32 mas1; - __u64 mas7_3; - __u64 mas2; - __u32 mas4; - __u32 mas6; - __u32 esr; - __u32 pir; - - /* - * SPRG4-7 are user-readable, so we can only keep these consistent - * between the shared area and the real registers when there's an - * intervening exit to KVM. This also applies to SPRG3 on some - * chips. - * - * This suffices for access by guest userspace, since in PR-mode - * KVM, an exit must occur when changing the guest's MSR[PR]. - * If the guest kernel writes to SPRG3-7 via the shared area, it - * must also use the shared area for reading while in kernel space. - */ - __u64 sprg4; - __u64 sprg5; - __u64 sprg6; - __u64 sprg7; -}; - -#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ - -#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num) - -#include - -#define KVM_FEATURE_MAGIC_PAGE 1 - -/* Magic page flags from host to guest */ - -#define KVM_MAGIC_FEAT_SR (1 << 0) - -/* MASn, ESR, PIR, and high SPRGs */ -#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1) - -/* Magic page flags from guest to host */ - -#define MAGIC_PAGE_FLAG_NOT_MAPPED_NX (1 << 0) - - -#endif /* __POWERPC_KVM_PARA_H__ */ diff --git a/linux-headers/asm-s390/bitsperlong.h b/linux-headers/asm-s390/bitsperlong.h new file mode 100644 index 0000000000..cceaf47b02 --- /dev/null +++ b/linux-headers/asm-s390/bitsperlong.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_BITSPERLONG_H +#define __ASM_S390_BITSPERLONG_H + +#ifndef __s390x__ +#define __BITS_PER_LONG 32 +#else +#define __BITS_PER_LONG 64 +#endif + +#include + +#endif /* __ASM_S390_BITSPERLONG_H */ + diff --git a/linux-headers/asm-s390/kvm_para.h b/linux-headers/asm-s390/kvm_para.h deleted file mode 100644 index b9ab584adf..0000000000 --- a/linux-headers/asm-s390/kvm_para.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * User API definitions for paravirtual devices on s390 - * - * Copyright IBM Corp. 2008 - * - * Author(s): Christian Borntraeger - */ diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 1ae66a263b..d0f97cd0a4 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -360,5 +360,6 @@ #define __NR_s390_guarded_storage 378 #define __NR_statx 379 #define __NR_s390_sthyi 380 +#define __NR_kexec_file_load 381 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index 8aa9d046a9..23ffb97746 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -327,5 +327,6 @@ #define __NR_s390_guarded_storage 378 #define __NR_statx 379 #define __NR_s390_sthyi 380 +#define __NR_kexec_file_load 381 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/bitsperlong.h b/linux-headers/asm-x86/bitsperlong.h new file mode 100644 index 0000000000..5d72c84588 --- /dev/null +++ b/linux-headers/asm-x86/bitsperlong.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X86_BITSPERLONG_H +#define __ASM_X86_BITSPERLONG_H + +#if defined(__x86_64__) && !defined(__ILP32__) +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include + +#endif /* __ASM_X86_BITSPERLONG_H */ + diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h deleted file mode 100644 index 8bcd0aa853..0000000000 --- a/linux-headers/linux/kvm_para.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_KVM_PARA_H -#define __LINUX_KVM_PARA_H - -/* - * This header file provides a method for making a hypercall to the host - * Architectures should define: - * - kvm_hypercall0, kvm_hypercall1... - * - kvm_arch_para_features - * - kvm_para_available - */ - -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 -#define KVM_EFAULT EFAULT -#define KVM_E2BIG E2BIG -#define KVM_EPERM EPERM -#define KVM_EOPNOTSUPP 95 - -#define KVM_HC_VAPIC_POLL_IRQ 1 -#define KVM_HC_MMU_OP 2 -#define KVM_HC_FEATURES 3 -#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 -#define KVM_HC_KICK_CPU 5 -#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 -#define KVM_HC_MIPS_EXIT_VM 7 -#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 -#define KVM_HC_CLOCK_PAIRING 9 - -/* - * hypercalls use architecture specific - */ -#include - -#endif /* __LINUX_KVM_PARA_H */ diff --git a/net/vhost-user.c b/net/vhost-user.c index e0f16c895b..fa28aad12d 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -20,38 +20,38 @@ #include "qemu/option.h" #include "trace.h" -typedef struct VhostUserState { +typedef struct NetVhostUserState { NetClientState nc; CharBackend chr; /* only queue index 0 */ VHostNetState *vhost_net; guint watch; uint64_t acked_features; bool started; -} VhostUserState; +} NetVhostUserState; VHostNetState *vhost_user_get_vhost_net(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); return s->vhost_net; } uint64_t vhost_user_get_acked_features(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); return s->acked_features; } static void vhost_user_stop(int queues, NetClientState *ncs[]) { - VhostUserState *s; + NetVhostUserState *s; int i; for (i = 0; i < queues; i++) { assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); - s = DO_UPCAST(VhostUserState, nc, ncs[i]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); if (s->vhost_net) { /* save acked features */ @@ -68,7 +68,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be) { VhostNetOptions options; struct vhost_net *net = NULL; - VhostUserState *s; + NetVhostUserState *s; int max_queues; int i; @@ -77,7 +77,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be) for (i = 0; i < queues; i++) { assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); - s = DO_UPCAST(VhostUserState, nc, ncs[i]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); options.net_backend = ncs[i]; options.opaque = be; @@ -123,7 +123,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf, without GUEST_ANNOUNCE capability. */ if (size == 60) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); int r; static int display_rarp_failure = 1; char mac_addr[6]; @@ -146,7 +146,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf, static void vhost_user_cleanup(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); if (s->vhost_net) { vhost_net_cleanup(s->vhost_net); @@ -180,7 +180,7 @@ static bool vhost_user_has_ufo(NetClientState *nc) static NetClientInfo net_vhost_user_info = { .type = NET_CLIENT_DRIVER_VHOST_USER, - .size = sizeof(VhostUserState), + .size = sizeof(NetVhostUserState), .receive = vhost_user_receive, .cleanup = vhost_user_cleanup, .has_vnet_hdr = vhost_user_has_vnet_hdr, @@ -190,7 +190,7 @@ static NetClientInfo net_vhost_user_info = { static gboolean net_vhost_user_watch(GIOChannel *chan, GIOCondition cond, void *opaque) { - VhostUserState *s = opaque; + NetVhostUserState *s = opaque; qemu_chr_fe_disconnect(&s->chr); @@ -203,7 +203,7 @@ static void chr_closed_bh(void *opaque) { const char *name = opaque; NetClientState *ncs[MAX_QUEUE_NUM]; - VhostUserState *s; + NetVhostUserState *s; Error *err = NULL; int queues; @@ -212,7 +212,7 @@ static void chr_closed_bh(void *opaque) MAX_QUEUE_NUM); assert(queues < MAX_QUEUE_NUM); - s = DO_UPCAST(VhostUserState, nc, ncs[0]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); qmp_set_link(name, false, &err); vhost_user_stop(queues, ncs); @@ -229,7 +229,7 @@ static void net_vhost_user_event(void *opaque, int event) { const char *name = opaque; NetClientState *ncs[MAX_QUEUE_NUM]; - VhostUserState *s; + NetVhostUserState *s; Chardev *chr; Error *err = NULL; int queues; @@ -239,7 +239,7 @@ static void net_vhost_user_event(void *opaque, int event) MAX_QUEUE_NUM); assert(queues < MAX_QUEUE_NUM); - s = DO_UPCAST(VhostUserState, nc, ncs[0]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); chr = qemu_chr_fe_get_driver(&s->chr); trace_vhost_user_event(chr->label, event); switch (event) { @@ -283,7 +283,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, { Error *err = NULL; NetClientState *nc, *nc0 = NULL; - VhostUserState *s; + NetVhostUserState *s; int i; assert(name); @@ -296,7 +296,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, nc->queue_index = i; if (!nc0) { nc0 = nc; - s = DO_UPCAST(VhostUserState, nc, nc); + s = DO_UPCAST(NetVhostUserState, nc, nc); if (!qemu_chr_fe_init(&s->chr, chr, &err)) { error_report_err(err); return -1; @@ -305,7 +305,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, } - s = DO_UPCAST(VhostUserState, nc, nc0); + s = DO_UPCAST(NetVhostUserState, nc, nc0); do { if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) { error_report_err(err); diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index b65c03f0ae..947dec2852 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -43,6 +43,7 @@ cp_portable() { -e 'limits' \ -e 'linux/kernel' \ -e 'linux/sysinfo' \ + -e 'asm-generic/kvm_para' \ > /dev/null then echo "Unexpected #include in input file $f". @@ -83,11 +84,6 @@ for arch in $ARCHLIST; do continue fi - # Blacklist architectures which have KVM headers but are actually dead - if [ "$arch" = "ia64" -o "$arch" = "mips" ]; then - continue - fi - if [ "$arch" = x86 ]; then arch_var=SRCARCH else @@ -98,11 +94,12 @@ for arch in $ARCHLIST; do rm -rf "$output/linux-headers/asm-$arch" mkdir -p "$output/linux-headers/asm-$arch" - for header in kvm.h kvm_para.h unistd.h; do + for header in kvm.h unistd.h bitsperlong.h; do cp "$tmpdir/include/asm/$header" "$output/linux-headers/asm-$arch" done - if [ $arch = powerpc ]; then - cp "$tmpdir/include/asm/epapr_hcalls.h" "$output/linux-headers/asm-powerpc/" + + if [ $arch = mips ]; then + cp "$tmpdir/include/asm/sgidefs.h" "$output/linux-headers/asm-mips/" fi rm -rf "$output/include/standard-headers/asm-$arch" @@ -121,20 +118,23 @@ for arch in $ARCHLIST; do cp "$tmpdir/include/asm/unistd_32.h" "$output/linux-headers/asm-x86/" cp "$tmpdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/" cp "$tmpdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/" + cp_portable "$tmpdir/include/asm/kvm_para.h" "$output/include/standard-headers/asm-$arch" fi done rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" -for header in kvm.h kvm_para.h vfio.h vfio_ccw.h vhost.h \ +for header in kvm.h vfio.h vfio_ccw.h vhost.h \ psci.h psp-sev.h userfaultfd.h; do cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" done + rm -rf "$output/linux-headers/asm-generic" mkdir -p "$output/linux-headers/asm-generic" -for header in kvm_para.h; do +for header in unistd.h bitsperlong.h; do cp "$tmpdir/include/asm-generic/$header" "$output/linux-headers/asm-generic" done + if [ -L "$linux/source" ]; then cp "$linux/source/COPYING" "$output/linux-headers" else diff --git a/target/i386/cpu.c b/target/i386/cpu.c index d95310ffd4..94260412e2 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -40,9 +40,7 @@ #include "qom/qom-qobject.h" #include "sysemu/arch_init.h" -#if defined(CONFIG_KVM) -#include -#endif +#include "standard-headers/asm-x86/kvm_para.h" #include "sysemu/sysemu.h" #include "hw/qdev-properties.h" diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 8ac13f6c2c..664504610e 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -688,8 +688,6 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) /* Speculation Control */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) /* Speculative Store Bypass Disable */ -#define KVM_HINTS_DEDICATED (1U << 0) - #define CPUID_8000_0008_EBX_IBPB (1U << 12) /* Indirect Branch Prediction Barrier */ #define CPUID_XSAVE_XSAVEOPT (1U << 0) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 0c656a91a4..6511329d11 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -18,7 +18,7 @@ #include #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #include "qemu-common.h" #include "cpu.h" @@ -387,7 +387,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, ret &= ~(1U << KVM_FEATURE_PV_UNHALT); } } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { - ret |= KVM_HINTS_DEDICATED; + ret |= 1U << KVM_HINTS_DEDICATED; found = 1; } diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h index 1de9876cd9..e5df24cad1 100644 --- a/target/i386/kvm_i386.h +++ b/target/i386/kvm_i386.h @@ -30,12 +30,6 @@ #define kvm_pic_in_kernel() 0 #define kvm_ioapic_in_kernel() 0 -/* These constants must never be used at runtime if kvm_enabled() is false. - * They exist so we don't need #ifdefs around KVM-specific code that already - * checks kvm_enabled() properly. - */ -#define KVM_CPUID_FEATURES 0 - #endif /* CONFIG_KVM */ bool kvm_allows_irq0_override(void); diff --git a/util/Makefile.objs b/util/Makefile.objs index 728c3541db..e1c3fed4dc 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -47,4 +47,5 @@ util-obj-y += qht.o util-obj-y += range.o util-obj-y += stats64.o util-obj-y += systemd.o +util-obj-y += iova-tree.o util-obj-$(CONFIG_LINUX) += vfio-helpers.o diff --git a/util/iova-tree.c b/util/iova-tree.c new file mode 100644 index 0000000000..2d9cebfc89 --- /dev/null +++ b/util/iova-tree.c @@ -0,0 +1,114 @@ +/* + * IOVA tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ + +#include +#include "qemu/iova-tree.h" + +struct IOVATree { + GTree *tree; +}; + +static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) +{ + const DMAMap *m1 = a, *m2 = b; + + if (m1->iova > m2->iova + m2->size) { + return 1; + } + + if (m1->iova + m1->size < m2->iova) { + return -1; + } + + /* Overlapped */ + return 0; +} + +IOVATree *iova_tree_new(void) +{ + IOVATree *iova_tree = g_new0(IOVATree, 1); + + /* We don't have values actually, no need to free */ + iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL); + + return iova_tree; +} + +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map) +{ + return g_tree_lookup(tree->tree, map); +} + +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova) +{ + DMAMap map = { .iova = iova, .size = 0 }; + + return iova_tree_find(tree, &map); +} + +static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range) +{ + /* Key and value are sharing the same range data */ + g_tree_insert(gtree, range, range); +} + +int iova_tree_insert(IOVATree *tree, DMAMap *map) +{ + DMAMap *new; + + if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* We don't allow to insert range that overlaps with existings */ + if (iova_tree_find(tree, map)) { + return IOVA_ERR_OVERLAP; + } + + new = g_new0(DMAMap, 1); + memcpy(new, map, sizeof(*new)); + iova_tree_insert_internal(tree->tree, new); + + return IOVA_OK; +} + +static gboolean iova_tree_traverse(gpointer key, gpointer value, + gpointer data) +{ + iova_tree_iterator iterator = data; + DMAMap *map = key; + + g_assert(key == value); + + return iterator(map); +} + +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator) +{ + g_tree_foreach(tree->tree, iova_tree_traverse, iterator); +} + +int iova_tree_remove(IOVATree *tree, DMAMap *map) +{ + DMAMap *overlap; + + while ((overlap = iova_tree_find(tree, map))) { + g_tree_remove(tree->tree, overlap); + } + + return IOVA_OK; +} + +void iova_tree_destroy(IOVATree *tree) +{ + g_tree_destroy(tree->tree); + g_free(tree); +}