From cd9b22685e4ccd728550d51fbe108c473f89df4f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:37:50 -0600 Subject: [PATCH 1/7] vfio: Convert type1 iommu to use rbtree We need to keep track of all the DMA mappings of an iommu container so that they can be automatically unmapped when the user releases the file descriptor. We currently do this using a simple list, where we merge entries with contiguous iovas and virtual addresses. Using a tree for this is a bit more efficient and allows us to use common code instead of inventing our own. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 190 ++++++++++++++++---------------- 1 file changed, 96 insertions(+), 94 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6f3fbc48a6c7..0e863b3ddcab 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/pci.h> /* pci_bus_type */ +#include <linux/rbtree.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -50,13 +51,13 @@ MODULE_PARM_DESC(allow_unsafe_interrupts, struct vfio_iommu { struct iommu_domain *domain; struct mutex lock; - struct list_head dma_list; + struct rb_root dma_list; struct list_head group_list; bool cache; }; struct vfio_dma { - struct list_head next; + struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ long npage; /* Number of pages */ @@ -75,6 +76,49 @@ struct vfio_group { #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) +static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *node = iommu->dma_list.rb_node; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start + size <= dma->iova) + node = node->rb_left; + else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + node = node->rb_right; + else + return dma; + } + + return NULL; +} + +static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +{ + struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; + struct vfio_dma *dma; + + while (*link) { + parent = *link; + dma = rb_entry(parent, struct vfio_dma, node); + + if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &iommu->dma_list); +} + +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +{ + rb_erase(&old->node, &iommu->dma_list); +} + struct vwork { struct mm_struct *mm; long npage; @@ -289,31 +333,8 @@ static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, return 0; } -static inline bool ranges_overlap(dma_addr_t start1, size_t size1, - dma_addr_t start2, size_t size2) -{ - if (start1 < start2) - return (start2 - start1 < size1); - else if (start2 < start1) - return (start1 - start2 < size2); - return (size1 > 0 && size2 > 0); -} - -static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, - dma_addr_t start, size_t size) -{ - struct vfio_dma *dma; - - list_for_each_entry(dma, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - start, size)) - return dma; - } - return NULL; -} - -static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) +static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, + size_t size, struct vfio_dma *dma) { struct vfio_dma *split; long
npage_lo, npage_hi; @@ -322,10 +343,9 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, if (start <= dma->iova && start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); - npage_lo = dma->npage; + vfio_remove_dma(iommu, dma); kfree(dma); - return npage_lo; + return 0; } /* Overlap low address of existing range */ @@ -339,7 +359,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, dma->iova += overlap; dma->vaddr += overlap; dma->npage -= npage_lo; - return npage_lo; + return 0; } /* Overlap high address of existing range */ @@ -351,7 +371,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, vfio_dma_unmap(iommu, start, npage_hi, dma->prot); dma->npage -= npage_hi; - return npage_hi; + return 0; } /* Split existing */ @@ -370,16 +390,16 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, split->iova = start + size; split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; split->prot = dma->prot; - list_add(&split->next, &iommu->dma_list); - return size >> PAGE_SHIFT; + vfio_insert_dma(iommu, split); + return 0; } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { - long ret = 0, npage = unmap->size >> PAGE_SHIFT; - struct vfio_dma *dma, *tmp; uint64_t mask; + struct vfio_dma *dma; + int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -393,25 +413,19 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); - list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - unmap->iova, unmap->size)) { - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); - if (ret > 0) - npage -= ret; - if (ret < 0 || npage == 0) - break; - } - } + while (!ret && (dma = vfio_find_dma(iommu, + unmap->iova, unmap->size))) + ret = vfio_remove_dma_overlap(iommu, unmap->iova, + unmap->size, dma); + mutex_unlock(&iommu->lock); - return ret > 0 ? 
0 : (int)ret; + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma, *pdma = NULL; + struct vfio_dma *dma; dma_addr_t iova = map->iova; unsigned long locked, lock_limit, vaddr = map->vaddr; size_t size = map->size; @@ -452,6 +466,10 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!npage) return -EINVAL; + dma = kzalloc(sizeof *dma, GFP_KERNEL); + if (!dma) + return -ENOMEM; + mutex_lock(&iommu->lock); if (vfio_find_dma(iommu, iova, size)) { @@ -473,62 +491,45 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (ret) goto out_lock; + dma->npage = npage; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + /* Check if we abut a region below - nothing below 0 */ if (iova) { - dma = vfio_find_dma(iommu, iova - 1, 1); - if (dma && dma->prot == prot && - dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { - - dma->npage += npage; - iova = dma->iova; - vaddr = dma->vaddr; + struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { + vfio_remove_dma(iommu, tmp); + dma->npage += tmp->npage; + dma->iova = iova = tmp->iova; + dma->vaddr = vaddr = tmp->vaddr; + kfree(tmp); npage = dma->npage; size = NPAGE_TO_SIZE(npage); - - pdma = dma; } } /* Check if we abut a region above - nothing above ~0 + 1 */ if (iova + size) { - dma = vfio_find_dma(iommu, iova + size, 1); - if (dma && dma->prot == prot && - dma->vaddr == vaddr + size) { - - dma->npage += npage; - dma->iova = iova; - dma->vaddr = vaddr; - - /* - * If merged above and below, remove previously - * merged entry. New entry covers it. - */ - if (pdma) { - list_del(&pdma->next); - kfree(pdma); - } - pdma = dma; + struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + dma->npage += tmp->npage; + kfree(tmp); + npage = dma->npage; + size = NPAGE_TO_SIZE(npage); } } - /* Isolated, new region */ - if (!pdma) { - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) { - ret = -ENOMEM; - vfio_dma_unmap(iommu, iova, npage, prot); - goto out_lock; - } - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - list_add(&dma->next, &iommu->dma_list); - } + vfio_insert_dma(iommu, dma); out_lock: mutex_unlock(&iommu->lock); + if (ret) + kfree(dma); return ret; } @@ -606,7 +607,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&iommu->group_list); - INIT_LIST_HEAD(&iommu->dma_list); + iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); /* @@ -640,7 +641,7 @@ static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_group *group, *group_tmp; - struct vfio_dma *dma, *dma_tmp; + struct rb_node *node; list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { iommu_detach_group(iommu->domain, group->iommu_group); @@ -648,9 +649,10 @@ static void vfio_iommu_type1_release(void *iommu_data) kfree(group); } - list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { + while ((node = rb_first(&iommu->dma_list))) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); + vfio_remove_dma(iommu, dma); kfree(dma); } From 166fd7d94afdac040b28c473e45241820ca522a2 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:38:02 -0600 
Subject: [PATCH 2/7] vfio: hugepage support for vfio_iommu_type1 We currently send all mappings to the iommu in PAGE_SIZE chunks, which prevents the iommu from enabling support for larger page sizes. We still need to pin pages, which means we step through them in PAGE_SIZE chunks, but we can batch up contiguous physical memory chunks to allow the iommu the opportunity to use larger pages. The approach here is a bit different from the one currently used for legacy KVM device assignment. Rather than looking at the vma page size and using that as the maximum size to pass to the iommu, we instead simply look at whether the next page is physically contiguous. This means we might ask the iommu to map a 4MB region, while legacy KVM might limit itself to a maximum of 2MB. Splitting our mapping path also allows us to be smarter about locked memory because we can more easily unwind if the user attempts to exceed the limit. Therefore, rather than assuming that a mapping will result in locked memory, we test each page as it is pinned to determine whether it locks RAM vs an mmap'd MMIO region. This should result in better locking granularity and fewer locked page fudge factors in userspace. The unmap path uses the same algorithm as legacy KVM. We don't want to track the pfn for each mapping ourselves, but we need the pfn in order to unpin pages. We therefore ask the iommu for the iova to physical address translation, ask it to unmap a page, and see how many pages were actually unmapped. iommus supporting large pages will often return something bigger than a page here, which we know will be physically contiguous and we can unpin a batch of pfns. iommus not supporting large mappings won't see an improvement in batching here as they only unmap a page at a time. With this change, we also make a clarification to the API for mapping and unmapping DMA. We can only guarantee unmaps at the same granularity as used for the original mapping. In other words, unmapping a subregion of a previous mapping is not guaranteed and may result in a larger or smaller unmapping than requested. The size field in the unmapping structure is updated to reflect this. Previously this was unmodified on unmap, always returning the requested unmap size. This is now updated to return the actual unmap size on success, allowing userspace to appropriately track mappings.
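As a userspace illustration of what this enables (a sketch, not part of this patch; the container file descriptor, headers, and error handling are assumed), backing a mapping with huge pages gives the iommu a physically contiguous chunk it can cover with a single large page:

	/* assumes <sys/mman.h>, <sys/ioctl.h>, <stdint.h>, <linux/vfio.h> */
	size_t len = 2 * 1024 * 1024;		/* one 2MB huge page */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf != MAP_FAILED) {
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
			.vaddr = (uintptr_t)buf,
			.iova  = 0,
			.size  = len,
		};

		/*
		 * type1 pins all 512 pages, notices they are physically
		 * contiguous, and can hand the iommu one 2MB mapping
		 * instead of 512 PAGE_SIZE mappings.
		 */
		if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
			perror("VFIO_IOMMU_MAP_DMA");
	}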
Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 521 +++++++++++++++++++++----------- include/uapi/linux/vfio.h | 8 +- 2 files changed, 343 insertions(+), 186 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e863b3ddcab..6654a7eb42d3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -60,7 +60,7 @@ struct vfio_dma { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - long npage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -74,8 +74,6 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) - static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, dma_addr_t start, size_t size) { @@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, if (start + size <= dma->iova) node = node->rb_left; - else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + else if (start >= dma->iova + dma->size) node = node->rb_right; else return dma; @@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) parent = *link; dma = rb_entry(parent, struct vfio_dma, node); - if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + if (new->iova + new->size <= dma->iova) link = &(*link)->rb_left; else link = &(*link)->rb_right; @@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - unsigned long pfn; - - pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; - if (pfn) { - iommu_unmap(iommu->domain, iova, PAGE_SIZE); - unlocked += put_pfn(pfn, prot); - } - } - return unlocked; -} - -static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long unlocked; - - unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); - vfio_lock_acct(-unlocked); -} - static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) return ret; } -/* Map DMA region */ -static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, - unsigned long vaddr, long npage, int prot) +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. 
+ */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { - dma_addr_t start = iova; - long i, locked = 0; - int ret; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; - /* Verify that pages are not already mapped */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE) - if (iommu_iova_to_phys(iommu->domain, iova)) - return -EBUSY; + if (!current->mm) + return -ENODEV; - iova = start; + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; - if (iommu->cache) - prot |= IOMMU_CACHE; + if (is_invalid_reserved_pfn(*pfn_base)) + return 1; - /* - * XXX We break mappings into pages and use get_user_pages_fast to - * pin the pages in memory. It's been suggested that mlock might - * provide a more efficient mechanism, but nothing prevents the - * user from munlocking the pages, which could then allow the user - * access to random host memory. We also have no guarantee from the - * IOMMU API that the iommu driver can unmap sub-pages of previous - * mappings. This means we might lose an entire range if a single - * page within it is unmapped. Single page mappings are inefficient, - * but provide the most flexibility for now. - */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) { - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + if (ret) + break; + + if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; } - /* - * Only add actual locked pages to accounting - * XXX We're effectively marking a page locked for every - * IOVA page even though it's possible the user could be - * backing multiple IOVAs with the same vaddr. This over- - * penalizes the user process, but we currently have no - * easy way to do this properly. - */ - if (!is_invalid_reserved_pfn(pfn)) - locked++; - - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); - if (ret) { - /* Back out mappings on error */ + if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { put_pfn(pfn, prot); - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; } } - vfio_lock_acct(locked); + + vfio_lock_acct(i); + + return i; +} + +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) +{ + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; +} + +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + dma_addr_t iova, size_t *size) +{ + dma_addr_t start = iova, end = iova + *size; + long unlocked = 0; + + while (iova < end) { + size_t unmapped; + phys_addr_t phys; + + /* + * We use the IOMMU to track the physical address. This + * saves us from having a lot more entries in our mapping + * tree. The downside is that we don't track the size + * used to do the mapping. 
We request unmap of a single + * page, but expect IOMMUs that support large pages to + * unmap a larger chunk. + */ + phys = iommu_iova_to_phys(iommu->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); + if (!unmapped) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; + } + + vfio_lock_acct(-unlocked); + + *size = iova - start; + return 0; } static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) + size_t *size, struct vfio_dma *dma) { + size_t offset, overlap, tmp; struct vfio_dma *split; - long npage_lo, npage_hi; + int ret; + + /* + * Existing dma region is completely covered, unmap all. This is + * the likely case since userspace tends to map and unmap buffers + * in one shot rather than multiple mappings within a buffer. + */ + if (likely(start <= dma->iova && + start + *size >= dma->iova + dma->size)) { + *size = dma->size; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); + if (ret) + return ret; + + /* + * Did we remove more than we have? Should never happen + * since a vfio_dma is contiguous in iova and vaddr. + */ + WARN_ON(*size != dma->size); - /* Existing dma region is completely covered, unmap all */ - if (start <= dma->iova && - start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); vfio_remove_dma(iommu, dma); kfree(dma); return 0; @@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, /* Overlap low address of existing range */ if (start <= dma->iova) { - size_t overlap; + overlap = start + *size - dma->iova; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); + if (ret) + return ret; - overlap = start + size - dma->iova; - npage_lo = overlap >> PAGE_SHIFT; + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); - dma->iova += overlap; - dma->vaddr += overlap; - dma->npage -= npage_lo; + /* + * Check, we may have removed the whole vfio_dma. If not, + * fix up and re-insert. + */ + if (overlap < dma->size) { + dma->iova += overlap; + dma->vaddr += overlap; + dma->size -= overlap; + vfio_insert_dma(iommu, dma); + } + *size = overlap; return 0; } /* Overlap high address of existing range */ - if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - size_t overlap; + if (start + *size >= dma->iova + dma->size) { + offset = start - dma->iova; + overlap = dma->size - offset; - overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; - npage_hi = overlap >> PAGE_SHIFT; + ret = vfio_unmap_unpin(iommu, dma, start, &overlap); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, npage_hi, dma->prot); - dma->npage -= npage_hi; + /* + * We may have unmapped the entire vfio_dma if the user is + * trying to unmap a sub-region of what was originally + * mapped. If anything left, we can resize in place since + * iova is unchanged.
+ */ + if (overlap < dma->size) + dma->size -= overlap; + else + vfio_remove_dma(iommu, dma); + + *size = overlap; return 0; } /* Split existing */ - npage_lo = (start - dma->iova) >> PAGE_SHIFT; - npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; + offset = start - dma->iova; - split = kzalloc(sizeof *split, GFP_KERNEL); - if (!split) - return -ENOMEM; + ret = vfio_unmap_unpin(iommu, dma, start, size); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); + WARN_ON(!*size); + tmp = dma->size; - dma->npage = npage_lo; + /* + * Resize the lower vfio_dma in place, insert new for remaining + * upper segment. + */ + dma->size = offset; + + if (offset + *size < tmp) { + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + + split->size = tmp - offset - *size; + split->iova = dma->iova + offset + *size; + split->vaddr = dma->vaddr + offset + *size; + split->prot = dma->prot; + vfio_insert_dma(iommu, split); + } - split->npage = npage_hi; - split->iova = start + size; - split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; - split->prot = dma->prot; - vfio_insert_dma(iommu, split); return 0; } @@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { uint64_t mask; struct vfio_dma *dma; + size_t unmapped = 0, size; int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); mutex_lock(&iommu->lock); - while (!ret && (dma = vfio_find_dma(iommu, - unmap->iova, unmap->size))) - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + size = unmap->size; + ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); + if (ret) + break; + unmapped += size; + } mutex_unlock(&iommu->lock); + + /* + * We may unmap more than requested, update the unmap struct so + * userspace can know. + */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(iommu->domain, iova, PAGE_SIZE); + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma; - dma_addr_t iova = map->iova; - unsigned long locked, lock_limit, vaddr = map->vaddr; + dma_addr_t end, iova; + unsigned long vaddr = map->vaddr; size_t size = map->size; + long npage; int ret = 0, prot = 0; uint64_t mask; - long npage; + + end = map->iova + map->size; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? 
*/ + if (iommu->cache) + prot |= IOMMU_CACHE; + if (vaddr & mask) return -EINVAL; - if (iova & mask) + if (map->iova & mask) return -EINVAL; - if (size & mask) + if (!map->size || map->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); /* Don't allow IOVA wrap */ - if (iova + size && iova + size < iova) + if (end && end < map->iova) return -EINVAL; /* Don't allow virtual address wrap */ - if (vaddr + size && vaddr + size < vaddr) + if (vaddr + map->size && vaddr + map->size < vaddr) return -EINVAL; - npage = size >> PAGE_SHIFT; - if (!npage) - return -EINVAL; - - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) - return -ENOMEM; - mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { - ret = -EBUSY; - goto out_lock; + if (vfio_find_dma(iommu, map->iova, map->size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; } - /* account for locked pages */ - locked = current->mm->locked_vm + npage; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - goto out_lock; - } + for (iova = map->iova; iova < end; iova += size, vaddr += size) { + struct vfio_dma *dma = NULL; + unsigned long pfn; + long i; - ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); - if (ret) - goto out_lock; + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, + prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; + /* Verify pages are not already mapped */ + for (i = 0; i < npage; i++) { + if (iommu_iova_to_phys(iommu->domain, + iova + (i << PAGE_SHIFT))) { + vfio_unpin_pages(pfn, npage, prot, true); + ret = -EBUSY; + break; + } + } - /* Check if we abut a region below - nothing below 0 */ - if (iova) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - dma->iova = iova = tmp->iova; - dma->vaddr = vaddr = tmp->vaddr; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(iommu, iova, pfn, npage, prot)) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + } + + size = npage << PAGE_SHIFT; + + /* + * Check if we abut a region below - nothing below 0. + * This is the most likely case when mapping chunks of + * physically contiguous regions within a virtual address + * range. Update the abutting entry in place since iova + * doesn't change. 
+ */ + if (likely(iova)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + tmp->size == vaddr) { + tmp->size += size; + + iova = tmp->iova; + size = tmp->size; + vaddr = tmp->vaddr; + dma = tmp; + } + } + + /* Check if we abut a region above - nothing above ~0 + 1 */ + if (likely(iova + size)) { + struct vfio_dma *tmp; + + tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + if (dma) + dma->size += tmp->size; + else + size += tmp->size; + kfree(tmp); + } + } + + if (!dma) { + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + iommu_unmap(iommu->domain, iova, size); + vfio_unpin_pages(pfn, npage, prot, true); + ret = -ENOMEM; + break; + } + + dma->size = size; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + vfio_insert_dma(iommu, dma); } } - /* Check if we abut a region above - nothing above ~0 + 1 */ - if (iova + size) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr == vaddr + size) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + if (ret) { + struct vfio_dma *tmp; + iova = map->iova; + size = map->size; + while ((tmp = vfio_find_dma(iommu, iova, size))) { + if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { + pr_warn("%s: Error rolling back failed map\n", + __func__); + break; + } } } - vfio_insert_dma(iommu, dma); - -out_lock: mutex_unlock(&iommu->lock); - if (ret) - kfree(dma); return ret; } @@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data) while ((node = rb_first(&iommu->dma_list))) { struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - vfio_remove_dma(iommu, dma); - kfree(dma); + size_t size = dma->size; + vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); } iommu_domain_free(iommu->domain); @@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; + long ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (unmap.argsz < minsz || unmap.flags) return -EINVAL; - return vfio_dma_do_unmap(iommu, &unmap); + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..513600612995 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) /** - * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap) + * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_dma_unmap) * * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. - * Caller sets argsz. + * Caller sets argsz. The actual unmapped size is returned in the size + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. 
 */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; From 5c6c2b21ecc9adfc47cfaf93404937b6ecd8395d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:38:11 -0600 Subject: [PATCH 3/7] vfio: Provide module option to disable vfio_iommu_type1 hugepage support Add a module option to vfio_iommu_type1 to disable IOMMU hugepage support. This causes iommu_map to only be called with single page mappings, disabling the IOMMU driver's ability to use hugepages. This option can be enabled by loading vfio_iommu_type1 with disable_hugepages=1 or dynamically through sysfs. If enabled dynamically, only new mappings are restricted. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6654a7eb42d3..8a2be4e40f22 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -48,6 +48,12 @@ module_param_named(allow_unsafe_interrupts, MODULE_PARM_DESC(allow_unsafe_interrupts, "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); +static bool disable_hugepages; +module_param_named(disable_hugepages, + disable_hugepages, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_hugepages, + "Disable VFIO IOMMU support for IOMMU hugepages."); + struct vfio_iommu { struct iommu_domain *domain; struct mutex lock; @@ -270,6 +276,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, return -ENOMEM; } + if (unlikely(disable_hugepages)) { + vfio_lock_acct(1); + return 1; + } + /* Lock all the consecutive pages from pfn_base */ for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; From b0e59b85abd42f8a525ff032d5063457e1378808 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 21 Jun 2013 09:43:21 -0600 Subject: [PATCH 4/7] vfio: fix documentation Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- Documentation/vfio.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda3635a17d..e71782a4f2a4 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -172,12 +172,12 @@ group and can access them as follows: struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; /* Create a new container */ - container = open("/dev/vfio/vfio, O_RDWR); + container = open("/dev/vfio/vfio", O_RDWR); if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) /* Unknown API version */ - if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_X86_IOMMU)) + if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) /* Doesn't support the IOMMU driver we want. */ /* Open the group */ @@ -193,7 +193,7 @@ group and can access them as follows: ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); /* Enable the IOMMU model we want */ - ioctl(container, VFIO_SET_IOMMU, VFIO_X86_IOMMU) + ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) /* Get addition IOMMU info */ ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); From f5bfdbf252ad54aa4b6f765cbe89750087fcc155 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Jun 2013 16:01:44 -0600 Subject: [PATCH 5/7] vfio/type1: Fix missed frees and zero sized removes With hugepage support we can only unmap properly aligned and sized ranges. We only guarantee that we can unmap the same ranges mapped and not arbitrary sub-ranges. This means we might not free anything or might free more than requested.
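Since unmaps are only guaranteed at the granularity of the original mapping, userspace must check the size the kernel returns rather than assume the request was honored exactly. A minimal sketch of the calling side (illustrative only; the container fd and the iova and len variables are assumed):

	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = iova,
		.size  = len,
	};

	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap) == 0 &&
	    unmap.size != len) {
		/*
		 * The kernel unmapped more or less than requested,
		 * possibly nothing; reconcile local bookkeeping
		 * against unmap.size.
		 */
	}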
The vfio unmap interface started storing the unmapped size to return to userspace to handle this. This patch fixes a few places where we don't properly handle those cases, moves a memory allocation to a place where failure is an option and checks our loops to make sure we don't get into an infinite loop trying to remove an overlap. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8a2be4e40f22..98231d10890c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -370,6 +370,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, struct vfio_dma *split; int ret; + if (!*size) + return 0; + /* * Existing dma region is completely covered, unmap all. This is * the likely case since userspace tends to map and unmap buffers @@ -411,7 +414,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, dma->vaddr += overlap; dma->size -= overlap; vfio_insert_dma(iommu, dma); - } + } else + kfree(dma); + *size = overlap; return 0; } @@ -425,48 +430,41 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, if (ret) return ret; - /* - * We may have unmapped the entire vfio_dma if the user is - * trying to unmap a sub-region of what was originally - * mapped. If anything left, we can resize in place since - * iova is unchanged. - */ - if (overlap < dma->size) - dma->size -= overlap; - else - vfio_remove_dma(iommu, dma); - + dma->size -= overlap; *size = overlap; return 0; } /* Split existing */ + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + offset = start - dma->iova; ret = vfio_unmap_unpin(iommu, dma, start, size); if (ret) return ret; - WARN_ON(!*size); + if (!*size) { + kfree(split); + return -EINVAL; + } + tmp = dma->size; - /* - * Resize the lower vfio_dma in place, insert new for remaining - * upper segment. - */ + /* Resize the lower vfio_dma in place, before the below insert */ dma->size = offset; - if (offset + *size < tmp) { - split = kzalloc(sizeof(*split), GFP_KERNEL); - if (!split) - return -ENOMEM; - + /* Insert new for remainder, assuming it didn't all get unmapped */ + if (likely(offset + *size < tmp)) { split->size = tmp - offset - *size; split->iova = dma->iova + offset + *size; split->vaddr = dma->vaddr + offset + *size; split->prot = dma->prot; vfio_insert_dma(iommu, split); - } + } else + kfree(split); return 0; } @@ -483,7 +481,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->iova & mask) return -EINVAL; - if (unmap->size & mask) + if (!unmap->size || unmap->size & mask) return -EINVAL; WARN_ON(mask & PAGE_MASK); @@ -493,7 +491,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { size = unmap->size; ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); - if (ret) + if (ret || !size) break; unmapped += size; } @@ -635,7 +633,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (tmp && tmp->prot == prot && tmp->vaddr + tmp->size == vaddr) { tmp->size += size; - iova = tmp->iova; size = tmp->size; vaddr = tmp->vaddr; @@ -643,19 +640,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, } } - /* Check if we abut a region above - nothing above ~0 + 1 */ + /* + * Check if we abut a region above - nothing above ~0 + 1. 
+ * If we abut above and below, remove and free. If only + * abut above, remove, modify, reinsert. + */ if (likely(iova + size)) { struct vfio_dma *tmp; - tmp = vfio_find_dma(iommu, iova + size, 1); if (tmp && tmp->prot == prot && tmp->vaddr == vaddr + size) { vfio_remove_dma(iommu, tmp); - if (dma) + if (dma) { dma->size += tmp->size; - else + kfree(tmp); + } else { size += tmp->size; - kfree(tmp); + tmp->size = size; + tmp->iova = iova; + tmp->vaddr = vaddr; + vfio_insert_dma(iommu, tmp); + dma = tmp; + } } } @@ -681,11 +687,10 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, iova = map->iova; size = map->size; while ((tmp = vfio_find_dma(iommu, iova, size))) { - if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { - pr_warn("%s: Error rolling back failed map\n", - __func__); + int r = vfio_remove_dma_overlap(iommu, iova, + &size, tmp); + if (WARN_ON(r || !size)) break; - } } } @@ -813,6 +818,8 @@ static void vfio_iommu_type1_release(void *iommu_data) struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); size_t size = dma->size; vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); + if (WARN_ON(!size)) + break; } iommu_domain_free(iommu->domain); From 6d6768c61b39a2184bc11bb0342e4f7f35642bcc Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Jun 2013 16:06:54 -0600 Subject: [PATCH 6/7] vfio: Limit group opens vfio_group_fops_open attempts to limit concurrent sessions by disallowing opens once group->container is set. This really doesn't do what we want and allows for inconsistent behavior: for instance, a group can be opened twice, then a container set, giving the user two file descriptors to the group. But then it won't allow more to be opened. There's not much reason to have the group opened multiple times since most access is through devices or the container, so complete what the original code intended and only allow a single instance. Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6d78736563de..d30f44da8762 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -76,6 +76,7 @@ struct vfio_group { struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; + atomic_t opened; }; struct vfio_device { @@ -206,6 +207,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); atomic_set(&group->container_users, 0); + atomic_set(&group->opened, 0); group->iommu_group = iommu_group; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -1236,12 +1238,22 @@ static long vfio_group_fops_compat_ioctl(struct file *filep, static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group; + int opened; group = vfio_group_get_from_minor(iminor(inode)); if (!group) return -ENODEV; + /* Do we need multiple instances of the group open? Seems not. */ + opened = atomic_cmpxchg(&group->opened, 0, 1); + if (opened) { + vfio_group_put(group); + return -EBUSY; + } + + /* Is something still in use from a previous open?
 */ if (group->container) { + atomic_dec(&group->opened); vfio_group_put(group); return -EBUSY; } @@ -1259,6 +1271,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_try_dissolve_container(group); + atomic_dec(&group->opened); + vfio_group_put(group); return 0; From 8d38ef1948bd415a5cb653a5c0ec16f3402aaca1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 1 Jul 2013 08:28:58 -0600 Subject: [PATCH 7/7] vfio/type1: Fix leak on error path The split path in vfio_remove_dma_overlap() allocates its new tracking structure before calling vfio_unmap_unpin(), but leaked it if the unmap failed. We also don't handle unpinning zero pages as an error on other exits, so we can fix that inconsistency by rolling in the next conditional return. Reported-by: Dan Carpenter Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 98231d10890c..a9807dea3887 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -436,6 +436,12 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, } /* Split existing */ + + /* + * Allocate our tracking structure early even though it may not + * be used. An allocation failure later loses track of pages and + * is more difficult to unwind. + */ split = kzalloc(sizeof(*split), GFP_KERNEL); if (!split) return -ENOMEM; @@ -443,12 +449,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, offset = start - dma->iova; ret = vfio_unmap_unpin(iommu, dma, start, size); - if (ret) - return ret; - - if (!*size) { + if (ret || !*size) { kfree(split); - return -EINVAL; + return ret; } tmp = dma->size;