IB/mlx5: Page faults handling infrastructure

* Refactor MR registration and cleanup, and fix reg_pages accounting.
* Create a work queue to handle page fault events in a kthread context.
* Register a fault handler to get events from the core for each QP.

The registered fault handler is empty in this patch, and only a later
patch implements it.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
This commit is contained in:
Haggai Eran 2014-12-11 17:04:23 +02:00 committed by Roland Dreier
parent 832a6b06ab
commit 6aec21f6a8
6 changed files with 295 additions and 23 deletions

View File

@ -864,7 +864,7 @@ static ssize_t show_reg_pages(struct device *device,
struct mlx5_ib_dev *dev =
container_of(device, struct mlx5_ib_dev, ib_dev.dev);
return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
static ssize_t show_hca(struct device *device, struct device_attribute *attr,
@ -1389,16 +1389,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
goto err_eqs;
mutex_init(&dev->cap_mask_mutex);
spin_lock_init(&dev->mr_lock);
err = create_dev_resources(&dev->devr);
if (err)
goto err_eqs;
err = ib_register_device(&dev->ib_dev, NULL);
err = mlx5_ib_odp_init_one(dev);
if (err)
goto err_rsrc;
err = ib_register_device(&dev->ib_dev, NULL);
if (err)
goto err_odp;
err = create_umr_res(dev);
if (err)
goto err_dev;
@ -1420,6 +1423,9 @@ err_umrc:
err_dev:
ib_unregister_device(&dev->ib_dev);
err_odp:
mlx5_ib_odp_remove_one(dev);
err_rsrc:
destroy_dev_resources(&dev->devr);
@ -1435,8 +1441,10 @@ err_dealloc:
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
free_comp_eqs(dev);
ib_dealloc_device(&dev->ib_dev);
@ -1450,15 +1458,30 @@ static struct mlx5_interface mlx5_ib_interface = {
static int __init mlx5_ib_init(void)
{
int err;
if (deprecated_prof_sel != 2)
pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
return mlx5_register_interface(&mlx5_ib_interface);
err = mlx5_ib_odp_init();
if (err)
return err;
err = mlx5_register_interface(&mlx5_ib_interface);
if (err)
goto clean_odp;
return err;
clean_odp:
mlx5_ib_odp_cleanup();
return err;
}
/*
 * Module exit: unregister mlx5_ib_interface first, then run the module-level
 * ODP cleanup. This is the reverse of the order used by mlx5_ib_init().
 */
static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
mlx5_ib_odp_cleanup();
}
module_init(mlx5_ib_init);

View File

@ -149,6 +149,29 @@ enum {
MLX5_QP_EMPTY
};
/*
 * Connect-IB can trigger up to four concurrent pagefaults
 * per-QP, one for each context listed here.
 *
 * The numeric values are written out because
 * mlx5_ib_get_pagefault_context() produces a context straight from the
 * masked page fault flags, and the result is used as an array index.
 * NOTE(review): this presumes MLX5_PFAULT_REQUESTOR is bit 0 and
 * MLX5_PFAULT_WRITE is bit 1 - confirm against their definitions.
 */
enum mlx5_ib_pagefault_context {
	MLX5_IB_PAGEFAULT_RESPONDER_READ  = 0,
	MLX5_IB_PAGEFAULT_REQUESTOR_READ  = 1,
	MLX5_IB_PAGEFAULT_RESPONDER_WRITE = 2,
	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE = 3,
	/* Number of contexts; used to size per-QP arrays. */
	MLX5_IB_PAGEFAULT_CONTEXTS        = 4
};
/*
 * Map a page fault to its per-QP context by keeping only the requestor and
 * write bits of the fault's flags.
 */
static inline enum mlx5_ib_pagefault_context
mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
{
	const int context_bits = MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE;

	return pagefault->flags & context_bits;
}
/*
 * Per-context page fault record: pairs the fault data with the work item
 * used to queue its handling on the page fault workqueue.
 */
struct mlx5_ib_pfault {
/* Work item; the work function recovers this struct via container_of(). */
struct work_struct work;
/* Copy of the fault event most recently reported for this context. */
struct mlx5_pagefault mpfault;
};
struct mlx5_ib_qp {
struct ib_qp ibqp;
struct mlx5_core_qp mqp;
@ -194,6 +217,21 @@ struct mlx5_ib_qp {
/* Store signature errors */
bool signature_en;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
* A flag that is true for QPs that are in a state that doesn't
* allow page faults, and shouldn't schedule any more faults.
*/
int disable_page_faults;
/*
* The disable_page_faults_lock protects a QP's disable_page_faults
* field, allowing for a thread to atomically check whether the QP
* allows page faults, and if so schedule a page fault.
*/
spinlock_t disable_page_faults_lock;
struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
#endif
};
struct mlx5_ib_cq_buf {
@ -392,13 +430,17 @@ struct mlx5_ib_dev {
struct umr_common umrc;
/* sync used page count stats
*/
spinlock_t mr_lock;
struct mlx5_ib_resources devr;
struct mlx5_mr_cache cache;
struct timer_list delay_timer;
int fill_delay;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps;
/*
* Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler.
*/
struct srcu_struct mr_srcu;
#endif
};
@ -575,12 +617,33 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
extern struct workqueue_struct *mlx5_ib_page_fault_wq;
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
#else
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault);
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
return 0;
}
static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void init_query_mad(struct ib_smp *mad)

View File

@ -52,6 +52,8 @@ static __be64 mlx5_ib_update_mtt_emergency_buffer[
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif
static int clean_mr(struct mlx5_ib_mr *mr);
static int order2idx(struct mlx5_ib_dev *dev, int order)
{
struct mlx5_mr_cache *cache = &dev->cache;
@ -1049,6 +1051,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "cache empty for order %d", order);
mr = NULL;
}
} else if (access_flags & IB_ACCESS_ON_DEMAND) {
err = -EINVAL;
pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
goto error;
}
if (!mr)
@ -1064,9 +1070,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->umem = umem;
mr->npages = npages;
spin_lock(&dev->mr_lock);
dev->mdev->priv.reg_pages += npages;
spin_unlock(&dev->mr_lock);
atomic_add(npages, &dev->mdev->priv.reg_pages);
mr->ibmr.lkey = mr->mmr.key;
mr->ibmr.rkey = mr->mmr.key;
@ -1110,12 +1114,9 @@ error:
return err;
}
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
static int clean_mr(struct mlx5_ib_mr *mr)
{
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct ib_umem *umem = mr->umem;
int npages = mr->npages;
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
int umred = mr->umred;
int err;
@ -1135,19 +1136,35 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
free_cached_mr(dev, mr);
}
if (umem) {
ib_umem_release(umem);
spin_lock(&dev->mr_lock);
dev->mdev->priv.reg_pages -= npages;
spin_unlock(&dev->mr_lock);
}
if (!umred)
kfree(mr);
return 0;
}
/*
 * Deregister a memory region.
 *
 * For an MR backed by a umem, first waits for an SRCU grace period on
 * dev->mr_srcu (ODP builds only), then tears the MR down with clean_mr(),
 * releases the umem and subtracts the MR's page count from the device's
 * reg_pages counter.
 *
 * Always returns 0.
 * NOTE(review): the return value of clean_mr() is discarded - confirm that a
 * failure there does not need to be propagated to the caller.
 */
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
struct mlx5_ib_mr *mr = to_mmr(ibmr);
/*
 * Cache npages and umem up front: clean_mr() may kfree() the MR, so mr must
 * not be dereferenced after that call.
 */
int npages = mr->npages;
struct ib_umem *umem = mr->umem;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (umem)
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
#endif
clean_mr(mr);
/* Only umem-backed MRs were added to the reg_pages accounting. */
if (umem) {
ib_umem_release(umem);
atomic_sub(npages, &dev->mdev->priv.reg_pages);
}
return 0;
}
struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
struct ib_mr_init_attr *mr_init_attr)
{

View File

@ -32,6 +32,8 @@
#include "mlx5_ib.h"
struct workqueue_struct *mlx5_ib_page_fault_wq;
#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \
if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \
ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \
@ -58,3 +60,146 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
out:
return err;
}
/*
 * Look up the MR that owns @key.
 *
 * The lookup is done on the base mkey derived from @key; the entry found is
 * accepted only when its full key equals @key.
 *
 * Returns the enclosing mlx5_ib_mr, or NULL when nothing is registered under
 * the base key or the full key does not match.
 */
static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
						   u32 key)
{
	struct mlx5_core_mr *core_mr;

	core_mr = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
	if (core_mr && core_mr->key == key)
		return container_of(core_mr, struct mlx5_ib_mr, mmr);

	return NULL;
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault,
int error) {
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
pfault->mpfault.flags,
error);
if (ret)
pr_err("Failed to resolve the page fault on QP 0x%x\n",
qp->mqp.qpn);
}
/*
 * Dispatch a queued page fault according to its event subtype.
 *
 * No subtype is handled here: every fault is logged as invalid and resumed
 * with the error argument set to 1.
 */
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
			       struct mlx5_ib_pfault *pfault)
{
	const u8 subtype = pfault->mpfault.event_subtype;

	switch (subtype) {
	default:
		pr_warn("Invalid page fault event subtype: 0x%x\n",
			subtype);
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		break;
	}
}
static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
struct mlx5_ib_pfault *pfault = container_of(work,
struct mlx5_ib_pfault,
work);
enum mlx5_ib_pagefault_context context =
mlx5_ib_get_pagefault_context(&pfault->mpfault);
struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
pagefaults[context]);
mlx5_ib_mr_pfault_handler(qp, pfault);
}
/*
 * Stop page fault handling for @qp.
 *
 * Sets disable_page_faults under disable_page_faults_lock so that
 * mlx5_ib_pfault_handler() queues no further work for this QP, then flushes
 * the page fault workqueue so that already queued work has finished before
 * returning.
 */
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
unsigned long flags;
spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
qp->disable_page_faults = 1;
spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
/*
 * Note that at this point, we are guaranteed that no more
 * work queue elements will be posted to the work queue with
 * the QP we are closing.
 */
flush_workqueue(mlx5_ib_page_fault_wq);
}
/*
 * Allow page fault handling for @qp again.
 *
 * Clears disable_page_faults under disable_page_faults_lock, so faults
 * reported afterwards are queued by mlx5_ib_pfault_handler().
 */
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
unsigned long flags;
spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
qp->disable_page_faults = 0;
spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}
/*
 * Page fault event callback installed on each QP as mqp.pfault_handler.
 *
 * Copies the event into the QP's slot for the fault's context and, unless
 * page faults are disabled on the QP, queues that slot's work item on the
 * page fault workqueue.
 */
static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
struct mlx5_pagefault *pfault)
{
/*
* Note that we will only get one fault event per QP per context
* (responder/initiator, read/write), until we resolve the page fault
* with the mlx5_ib_page_fault_resume command. Since this function is
* called from within the work element, there is no risk of missing
* events.
*/
struct mlx5_ib_qp *mibqp = to_mibqp(qp);
enum mlx5_ib_pagefault_context context =
mlx5_ib_get_pagefault_context(pfault);
struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
/* Store the event where the work function will read it from. */
qp_pfault->mpfault = *pfault;
/* No need to stop interrupts here since we are in an interrupt */
spin_lock(&mibqp->disable_page_faults_lock);
/* Check and queue under the lock so a concurrent disable cannot be missed. */
if (!mibqp->disable_page_faults)
queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
spin_unlock(&mibqp->disable_page_faults_lock);
}
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
int i;
qp->disable_page_faults = 1;
spin_lock_init(&qp->disable_page_faults_lock);
qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}
/*
 * Per-device ODP setup: initialise the SRCU structure that protects MRs
 * against destruction while a page fault handler is using them.
 *
 * Returns 0 on success or the error reported by init_srcu_struct().
 */
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
	return init_srcu_struct(&ibdev->mr_srcu);
}
/*
 * Per-device ODP teardown: releases the SRCU structure set up by
 * mlx5_ib_odp_init_one().
 */
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
cleanup_srcu_struct(&ibdev->mr_srcu);
}
/*
 * Module-level ODP setup: create the single-threaded workqueue on which
 * page fault work items are run.
 *
 * Returns 0 on success, -ENOMEM if the workqueue could not be created.
 */
int __init mlx5_ib_odp_init(void)
{
	mlx5_ib_page_fault_wq =
		create_singlethread_workqueue("mlx5_ib_page_faults");

	return mlx5_ib_page_fault_wq ? 0 : -ENOMEM;
}
/*
 * Module-level ODP teardown: destroys the page fault workqueue created by
 * mlx5_ib_odp_init().
 */
void mlx5_ib_odp_cleanup(void)
{
destroy_workqueue(mlx5_ib_page_fault_wq);
}

View File

@ -876,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
int inlen = sizeof(*in);
int err;
mlx5_ib_odp_create_qp(qp);
gen = &dev->mdev->caps.gen;
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
@ -1160,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
in = kzalloc(sizeof(*in), GFP_KERNEL);
if (!in)
return;
if (qp->state != IB_QPS_RESET)
if (qp->state != IB_QPS_RESET) {
mlx5_ib_qp_disable_pagefaults(qp);
if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
qp->mqp.qpn);
}
get_cqs(qp, &send_cq, &recv_cq);
@ -1712,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (mlx5_st < 0)
goto out;
/* If moving to a reset or error state, we must disable page faults on
* this QP and flush all current page faults. Otherwise a stale page
* fault may attempt to work on this QP after it is reset and moved
* again to RTS, and may cause the driver and the device to get out of
* sync. */
if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
(new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
mlx5_ib_qp_disable_pagefaults(qp);
optpar = ib_mask_to_mlx5_opt(attr_mask);
optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
in->optparam = cpu_to_be32(optpar);
@ -1721,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (err)
goto out;
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
mlx5_ib_qp_enable_pagefaults(qp);
qp->state = new_state;
if (attr_mask & IB_QP_ACCESS_FLAGS)
@ -3026,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
int mlx5_state;
int err = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
* Wait for any outstanding page faults, in case the user frees memory
* based upon this query's result.
*/
flush_workqueue(mlx5_ib_page_fault_wq);
#endif
mutex_lock(&qp->mutex);
outb = kzalloc(sizeof(*outb), GFP_KERNEL);
if (!outb) {

View File

@ -474,7 +474,7 @@ struct mlx5_priv {
struct workqueue_struct *pg_wq;
struct rb_root page_root;
int fw_pages;
int reg_pages;
atomic_t reg_pages;
struct list_head free_list;
struct mlx5_core_health health;