iommu/amd: Implement notifiers for IOMMUv2

Since pages are no longer pinned, we need notifications
when the VMM changes the page tables. Use mmu_notifiers
for that.
Also use the task_exit notifier from the profiling subsystem
to shut down all contexts related to the exiting task.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Joerg Roedel 2011-11-24 16:21:52 +01:00
parent 028eeacc41
commit 8736b2c331
2 changed files with 178 additions and 11 deletions
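
For readers coming to this patch cold: the core of the change is the
mmu_notifier pattern. A driver embeds a struct mmu_notifier in its
per-address-space state, registers it against the task's mm_struct, and the
core VM then calls the driver back around every CPU page-table change so the
device's cached translations can be flushed. The sketch below shows that
pattern in isolation; the my_* names are hypothetical illustrations, not part
of this patch, and only the invalidate_page hook is shown (the patch itself
also wires up clear_flush_young, change_pte and the invalidate_range pair):

	#include <linux/mmu_notifier.h>
	#include <linux/mm_types.h>

	struct my_state {
		struct mmu_notifier mn;	/* embedded, recovered via container_of() */
	};

	/* Called by the core VM after the mapping for 'address' was torn down */
	static void my_invalidate_page(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
	{
		struct my_state *s = container_of(mn, struct my_state, mn);

		/* flush the device's cached translation for 'address' using 's' */
		(void)s;
	}

	static struct mmu_notifier_ops my_mn_ops = {
		.invalidate_page = my_invalidate_page,
	};

	static int my_bind(struct my_state *s, struct mm_struct *mm)
	{
		s->mn.ops = &my_mn_ops;
		return mmu_notifier_register(&s->mn, mm);	/* may sleep */
	}

amd_iommu_bind_pasid() in the diff below follows this shape, with
pasid_state playing the role of my_state.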

drivers/iommu/Kconfig

@@ -62,7 +62,8 @@ config AMD_IOMMU_STATS
 
 config AMD_IOMMU_V2
 	tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)"
-	depends on AMD_IOMMU && EXPERIMENTAL
+	depends on AMD_IOMMU && PROFILING && EXPERIMENTAL
+	select MMU_NOTIFIER
 	---help---
 	  This option enables support for the AMD IOMMUv2 features of the IOMMU
 	  hardware. Select this option if you want to use devices that support

drivers/iommu/amd_iommu_v2.c

@@ -16,8 +16,10 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/mmu_notifier.h>
 #include <linux/amd-iommu.h>
 #include <linux/mm_types.h>
+#include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/iommu.h>
@@ -45,6 +47,7 @@ struct pasid_state {
 	atomic_t count;				/* Reference count */
 	struct task_struct *task;		/* Task bound to this PASID */
 	struct mm_struct *mm;			/* mm_struct for the faults */
+	struct mmu_notifier mn;			/* mmu_notifier handle */
 	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
 	struct device_state *device_state;	/* Link to our device_state */
 	int pasid;				/* PASID index */
@@ -85,8 +88,16 @@ static DEFINE_SPINLOCK(ps_lock);
 
 static struct workqueue_struct *iommu_wq;
 
+/*
+ * Empty page table - Used between
+ * mmu_notifier_invalidate_range_start and
+ * mmu_notifier_invalidate_range_end
+ */
+static u64 *empty_page_table;
+
 static void free_pasid_states(struct device_state *dev_state);
 static void unbind_pasid(struct device_state *dev_state, int pasid);
+static int task_exit(struct notifier_block *nb, unsigned long e, void *data);
 
 static u16 device_id(struct pci_dev *pdev)
 {
@@ -144,6 +155,11 @@ static void put_device_state_wait(struct device_state *dev_state)
 	free_device_state(dev_state);
 }
 
+static struct notifier_block profile_nb = {
+	.notifier_call = task_exit,
+};
+
 static void link_pasid_state(struct pasid_state *pasid_state)
 {
 	spin_lock(&ps_lock);
@@ -294,6 +310,23 @@ static void put_pasid_state_wait(struct pasid_state *pasid_state)
 	free_pasid_state(pasid_state);
 }
 
+static void __unbind_pasid(struct pasid_state *pasid_state)
+{
+	struct iommu_domain *domain;
+
+	domain = pasid_state->device_state->domain;
+	amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
+	clear_pasid_state(pasid_state->device_state, pasid_state->pasid);
+
+	/* Make sure no more pending faults are in the queue */
+	flush_workqueue(iommu_wq);
+
+	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
+	put_pasid_state(pasid_state); /* Reference taken in bind() function */
+}
+
 static void unbind_pasid(struct device_state *dev_state, int pasid)
 {
 	struct pasid_state *pasid_state;
@@ -303,12 +336,8 @@ static void unbind_pasid(struct device_state *dev_state, int pasid)
 		return;
 
 	unlink_pasid_state(pasid_state);
-	amd_iommu_domain_clear_gcr3(dev_state->domain, pasid);
-	clear_pasid_state(dev_state, pasid);
+	__unbind_pasid(pasid_state);
 
-	put_pasid_state(pasid_state); /* Reference taken in this function */
-	put_pasid_state_wait(pasid_state); /* Reference from bind() function */
+	put_pasid_state_wait(pasid_state); /* Reference taken in this function */
 }
 
 static void free_pasid_states_level1(struct pasid_state **tbl)
@@ -361,6 +390,83 @@ static void free_pasid_states(struct device_state *dev_state)
 	free_page((unsigned long)dev_state->states);
 }
 
+static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
+{
+	return container_of(mn, struct pasid_state, mn);
+}
+
+static void __mn_flush_page(struct mmu_notifier *mn,
+			    unsigned long address)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+
+	pasid_state = mn_to_state(mn);
+	dev_state = pasid_state->device_state;
+
+	amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
+}
+
+static int mn_clear_flush_young(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address)
+{
+	__mn_flush_page(mn, address);
+
+	return 0;
+}
+
+static void mn_change_pte(struct mmu_notifier *mn,
+			  struct mm_struct *mm,
+			  unsigned long address,
+			  pte_t pte)
+{
+	__mn_flush_page(mn, address);
+}
+
+static void mn_invalidate_page(struct mmu_notifier *mn,
+			       struct mm_struct *mm,
+			       unsigned long address)
+{
+	__mn_flush_page(mn, address);
+}
+
+static void mn_invalidate_range_start(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long start, unsigned long end)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+
+	pasid_state = mn_to_state(mn);
+	dev_state = pasid_state->device_state;
+
+	amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
+				  __pa(empty_page_table));
+}
+
+static void mn_invalidate_range_end(struct mmu_notifier *mn,
+				    struct mm_struct *mm,
+				    unsigned long start, unsigned long end)
+{
+	struct pasid_state *pasid_state;
+	struct device_state *dev_state;
+
+	pasid_state = mn_to_state(mn);
+	dev_state = pasid_state->device_state;
+
+	amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
+				  __pa(pasid_state->mm->pgd));
+}
+
+static struct mmu_notifier_ops iommu_mn = {
+	.clear_flush_young = mn_clear_flush_young,
+	.change_pte = mn_change_pte,
+	.invalidate_page = mn_invalidate_page,
+	.invalidate_range_start = mn_invalidate_range_start,
+	.invalidate_range_end = mn_invalidate_range_end,
+};
+
 static void set_pri_tag_status(struct pasid_state *pasid_state,
 			       u16 tag, int status)
 {
@@ -475,6 +581,50 @@ static struct notifier_block ppr_nb = {
 	.notifier_call = ppr_notifier,
 };
 
+static int task_exit(struct notifier_block *nb, unsigned long e, void *data)
+{
+	struct pasid_state *pasid_state;
+	struct task_struct *task;
+
+	task = data;
+
+	/*
+	 * Using this notifier is a hack - but there is no other choice
+	 * at the moment. What I really want is a sleeping notifier that
+	 * is called when an MM goes down. But such a notifier doesn't
+	 * exist yet. The notifier needs to sleep because it has to make
+	 * sure that the device does not use the PASID and the address
+	 * space anymore before it is destroyed. This includes waiting
+	 * for pending PRI requests to pass the workqueue. The
+	 * MMU-Notifiers would be a good fit, but they use RCU and so
+	 * they are not allowed to sleep. Let's see how we can solve this
+	 * in a more intelligent way in the future.
+	 */
+again:
+	spin_lock(&ps_lock);
+	list_for_each_entry(pasid_state, &pasid_state_list, list) {
+		struct device_state *dev_state;
+		int pasid;
+
+		if (pasid_state->task != task)
+			continue;
+
+		/* Drop lock and unbind */
+		spin_unlock(&ps_lock);
+		dev_state = pasid_state->device_state;
+		pasid = pasid_state->pasid;
+		unbind_pasid(dev_state, pasid);
+
+		/* Task may be in the list multiple times */
+		goto again;
+	}
+	spin_unlock(&ps_lock);
+
+	return NOTIFY_OK;
+}
+
 int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
 			 struct task_struct *task)
 {
@@ -509,13 +659,16 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
 	pasid_state->mm = get_task_mm(task);
 	pasid_state->device_state = dev_state;
 	pasid_state->pasid = pasid;
+	pasid_state->mn.ops = &iommu_mn;
 
 	if (pasid_state->mm == NULL)
 		goto out_free;
 
+	mmu_notifier_register(&pasid_state->mn, pasid_state->mm);
+
 	ret = set_pasid_state(dev_state, pasid_state, pasid);
 	if (ret)
-		goto out_free;
+		goto out_unregister;
 
 	ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
 					__pa(pasid_state->mm->pgd));
@@ -529,6 +682,9 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
 out_clear_state:
 	clear_pasid_state(dev_state, pasid);
 
+out_unregister:
+	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
+
 out_free:
 	free_pasid_state(pasid_state);
@@ -689,15 +845,22 @@ static int __init amd_iommu_v2_init(void)
 	ret = -ENOMEM;
 	iommu_wq = create_workqueue("amd_iommu_v2");
-	if (iommu_wq == NULL) {
-		ret = -ENOMEM;
+	if (iommu_wq == NULL)
 		goto out_free;
-	}
+
+	ret = -ENOMEM;
+	empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL);
+	if (empty_page_table == NULL)
+		goto out_destroy_wq;
 
 	amd_iommu_register_ppr_notifier(&ppr_nb);
+	profile_event_register(PROFILE_TASK_EXIT, &profile_nb);
 
 	return 0;
 
+out_destroy_wq:
+	destroy_workqueue(iommu_wq);
+
 out_free:
 	free_pages((unsigned long)state_table, get_order(state_table_size));
@@ -710,6 +873,7 @@ static void __exit amd_iommu_v2_exit(void)
 	size_t state_table_size;
 	int i;
 
+	profile_event_unregister(PROFILE_TASK_EXIT, &profile_nb);
 	amd_iommu_unregister_ppr_notifier(&ppr_nb);
 
 	flush_workqueue(iommu_wq);
@@ -734,6 +898,8 @@ static void __exit amd_iommu_v2_exit(void)
 	state_table_size = MAX_DEVICES * sizeof(struct device_state *);
 	free_pages((unsigned long)state_table, get_order(state_table_size));
+
+	free_page((unsigned long)empty_page_table);
 }
 
 module_init(amd_iommu_v2_init);
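
A note on the empty_page_table trick above, as a sketch rather than a
definitive statement: the core VM brackets every bulk page-table update with
the invalidate_range pair, roughly as below (update_range is a hypothetical
stand-in for the real call sites spread across mm/):

	#include <linux/mmu_notifier.h>

	/* Sketch of the caller side inside the core VM (simplified) */
	static void update_range(struct mm_struct *mm,
				 unsigned long start, unsigned long end)
	{
		mmu_notifier_invalidate_range_start(mm, start, end);
		/* ... CPU page-table entries for [start, end) change here ... */
		mmu_notifier_invalidate_range_end(mm, start, end);
	}

Inside that window the process page tables are in flux, so
mn_invalidate_range_start() points the device's GCR3 at the zeroed
empty_page_table; device accesses then fault instead of walking half-updated
tables, and mn_invalidate_range_end() restores the real pgd. Similarly, the
PROFILE_TASK_EXIT notifier is used because, per the comment in task_exit(),
its callback may sleep, which the teardown path needs for flush_workqueue()
and mmu_notifier_unregister(); mmu_notifier callbacks use RCU and are not
allowed to sleep.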