hw/rdma: Add support for MAD packets
MAD (Management Datagram) packets are widely used by various modules both in kernel and in user space for example the rdma_* API which is used to create and maintain "connection" layer on top of RDMA uses several types of MAD packets. For more information please refer to chapter 13.4 in Volume 1 Architecture Specification, Release 1.1 available here: https://www.infinibandta.org/ibta-specifications-download/ To support MAD packets the device uses an external utility (contrib/rdmacm-mux) to relay packets from and to the guest driver. Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> Reviewed-by: Marcel Apfelbaum<marcel.apfelbaum@gmail.com> Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
This commit is contained in:
parent
305bdd7a57
commit
605ec1663b
@ -16,8 +16,13 @@
|
||||
#include "qemu/osdep.h"
|
||||
#include "qemu/error-report.h"
|
||||
#include "qapi/error.h"
|
||||
#include "qapi/qmp/qlist.h"
|
||||
#include "qapi/qmp/qnum.h"
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
#include <infiniband/umad_types.h>
|
||||
#include <infiniband/umad.h>
|
||||
#include <rdma/rdma_user_cm.h>
|
||||
|
||||
#include "trace.h"
|
||||
#include "rdma_utils.h"
|
||||
@ -33,16 +38,25 @@
|
||||
#define VENDOR_ERR_MAD_SEND 0x206
|
||||
#define VENDOR_ERR_INVLKEY 0x207
|
||||
#define VENDOR_ERR_MR_SMALL 0x208
|
||||
#define VENDOR_ERR_INV_MAD_BUFF 0x209
|
||||
#define VENDOR_ERR_INV_NUM_SGE 0x210
|
||||
|
||||
#define THR_NAME_LEN 16
|
||||
#define THR_POLL_TO 5000
|
||||
|
||||
#define MAD_HDR_SIZE sizeof(struct ibv_grh)
|
||||
|
||||
typedef struct BackendCtx {
|
||||
uint64_t req_id;
|
||||
void *up_ctx;
|
||||
bool is_tx_req;
|
||||
struct ibv_sge sge; /* Used to save MAD recv buffer */
|
||||
} BackendCtx;
|
||||
|
||||
struct backend_umad {
|
||||
struct ib_user_mad hdr;
|
||||
char mad[RDMA_MAX_PRIVATE_DATA];
|
||||
};
|
||||
|
||||
static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
|
||||
|
||||
static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
|
||||
@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
|
||||
uint32_t num_sge)
|
||||
{
|
||||
struct backend_umad umad = {0};
|
||||
char *hdr, *msg;
|
||||
int ret;
|
||||
|
||||
pr_dbg("num_sge=%d\n", num_sge);
|
||||
|
||||
if (num_sge != 2) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
umad.hdr.length = sge[0].length + sge[1].length;
|
||||
pr_dbg("msg_len=%d\n", umad.hdr.length);
|
||||
|
||||
if (umad.hdr.length > sizeof(umad.mad)) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
umad.hdr.addr.qpn = htobe32(1);
|
||||
umad.hdr.addr.grh_present = 1;
|
||||
umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
|
||||
memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
|
||||
umad.hdr.addr.hop_limit = 0xFF;
|
||||
|
||||
hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
|
||||
if (!hdr) {
|
||||
pr_dbg("Fail to map to sge[0]\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
|
||||
if (!msg) {
|
||||
pr_dbg("Fail to map to sge[1]\n");
|
||||
rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
pr_dbg_buf("mad_hdr", hdr, sge[0].length);
|
||||
pr_dbg_buf("mad_data", data, sge[1].length);
|
||||
|
||||
memcpy(&umad.mad[0], hdr, sge[0].length);
|
||||
memcpy(&umad.mad[sge[0].length], msg, sge[1].length);
|
||||
|
||||
rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
|
||||
rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
|
||||
|
||||
ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
|
||||
sizeof(umad));
|
||||
|
||||
pr_dbg("qemu_chr_fe_write=%d\n", ret);
|
||||
|
||||
return (ret != sizeof(umad));
|
||||
}
|
||||
|
||||
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
|
||||
RdmaBackendQP *qp, uint8_t qp_type,
|
||||
struct ibv_sge *sge, uint32_t num_sge,
|
||||
@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
|
||||
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
|
||||
} else if (qp_type == IBV_QPT_GSI) {
|
||||
pr_dbg("QP1\n");
|
||||
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
|
||||
rc = mad_send(backend_dev, sge, num_sge);
|
||||
if (rc) {
|
||||
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
|
||||
} else {
|
||||
comp_handler(IBV_WC_SUCCESS, 0, ctx);
|
||||
}
|
||||
}
|
||||
pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -370,6 +443,48 @@ out_free_bctx:
|
||||
g_free(bctx);
|
||||
}
|
||||
|
||||
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
|
||||
struct ibv_sge *sge, uint32_t num_sge,
|
||||
void *ctx)
|
||||
{
|
||||
BackendCtx *bctx;
|
||||
int rc;
|
||||
uint32_t bctx_id;
|
||||
|
||||
if (num_sge != 1) {
|
||||
pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
|
||||
return VENDOR_ERR_INV_NUM_SGE;
|
||||
}
|
||||
|
||||
if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
|
||||
pr_dbg("Too small buffer for MAD\n");
|
||||
return VENDOR_ERR_INV_MAD_BUFF;
|
||||
}
|
||||
|
||||
pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr);
|
||||
pr_dbg("length=%d\n", sge[0].length);
|
||||
pr_dbg("lkey=%d\n", sge[0].lkey);
|
||||
|
||||
bctx = g_malloc0(sizeof(*bctx));
|
||||
|
||||
rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
|
||||
if (unlikely(rc)) {
|
||||
g_free(bctx);
|
||||
pr_dbg("Fail to allocate cqe_ctx\n");
|
||||
return VENDOR_ERR_NOMEM;
|
||||
}
|
||||
|
||||
pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx);
|
||||
bctx->up_ctx = ctx;
|
||||
bctx->sge = *sge;
|
||||
|
||||
qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
|
||||
qlist_append_int(backend_dev->recv_mads_list.list, bctx_id);
|
||||
qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
|
||||
RdmaDeviceResources *rdma_dev_res,
|
||||
RdmaBackendQP *qp, uint8_t qp_type,
|
||||
@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
|
||||
}
|
||||
if (qp_type == IBV_QPT_GSI) {
|
||||
pr_dbg("QP1\n");
|
||||
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
|
||||
rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
|
||||
if (rc) {
|
||||
comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
|
||||
|
||||
switch (qp_type) {
|
||||
case IBV_QPT_GSI:
|
||||
pr_dbg("QP1 unsupported\n");
|
||||
return 0;
|
||||
|
||||
case IBV_QPT_RC:
|
||||
@ -748,11 +865,122 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
|
||||
union ibv_gid *my_gid, int paylen)
|
||||
{
|
||||
grh->paylen = htons(paylen);
|
||||
grh->sgid = *sgid;
|
||||
grh->dgid = *my_gid;
|
||||
|
||||
pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
|
||||
pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id);
|
||||
pr_dbg("gid=0x%llx\n", sgid->global.interface_id);
|
||||
}
|
||||
|
||||
static inline int mad_can_receieve(void *opaque)
|
||||
{
|
||||
return sizeof(struct backend_umad);
|
||||
}
|
||||
|
||||
static void mad_read(void *opaque, const uint8_t *buf, int size)
|
||||
{
|
||||
RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
|
||||
QObject *o_ctx_id;
|
||||
unsigned long cqe_ctx_id;
|
||||
BackendCtx *bctx;
|
||||
char *mad;
|
||||
struct backend_umad *umad;
|
||||
|
||||
assert(size != sizeof(umad));
|
||||
umad = (struct backend_umad *)buf;
|
||||
|
||||
pr_dbg("Got %d bytes\n", size);
|
||||
pr_dbg("umad->hdr.length=%d\n", umad->hdr.length);
|
||||
|
||||
#ifdef PVRDMA_DEBUG
|
||||
struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad;
|
||||
pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
|
||||
hdr->base_version, hdr->mgmt_class, hdr->class_version,
|
||||
hdr->method, hdr->status, be64toh(hdr->tid),
|
||||
hdr->attr_id, hdr->attr_mod);
|
||||
#endif
|
||||
|
||||
qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
|
||||
o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
|
||||
qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
|
||||
if (!o_ctx_id) {
|
||||
pr_dbg("No more free MADs buffers, waiting for a while\n");
|
||||
sleep(THR_POLL_TO);
|
||||
return;
|
||||
}
|
||||
|
||||
cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
|
||||
bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
|
||||
if (unlikely(!bctx)) {
|
||||
pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
|
||||
return;
|
||||
}
|
||||
|
||||
pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);
|
||||
|
||||
mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
|
||||
bctx->sge.length);
|
||||
if (!mad || bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) {
|
||||
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
|
||||
bctx->up_ctx);
|
||||
} else {
|
||||
memset(mad, 0, bctx->sge.length);
|
||||
build_mad_hdr((struct ibv_grh *)mad,
|
||||
(union ibv_gid *)&umad->hdr.addr.gid,
|
||||
&backend_dev->gid, umad->hdr.length);
|
||||
memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length);
|
||||
rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
|
||||
|
||||
comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx);
|
||||
}
|
||||
|
||||
g_free(bctx);
|
||||
rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
|
||||
}
|
||||
|
||||
static int mad_init(RdmaBackendDev *backend_dev)
|
||||
{
|
||||
struct backend_umad umad = {0};
|
||||
int ret;
|
||||
|
||||
if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) {
|
||||
pr_dbg("Missing chardev for MAD multiplexer\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve,
|
||||
mad_read, NULL, NULL, backend_dev, NULL, true);
|
||||
|
||||
/* Register ourself */
|
||||
memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
|
||||
ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
|
||||
sizeof(umad.hdr));
|
||||
if (ret != sizeof(umad.hdr)) {
|
||||
pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret);
|
||||
}
|
||||
|
||||
qemu_mutex_init(&backend_dev->recv_mads_list.lock);
|
||||
backend_dev->recv_mads_list.list = qlist_new();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void mad_fini(RdmaBackendDev *backend_dev)
|
||||
{
|
||||
qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
|
||||
qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
|
||||
}
|
||||
|
||||
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
|
||||
RdmaDeviceResources *rdma_dev_res,
|
||||
const char *backend_device_name, uint8_t port_num,
|
||||
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
|
||||
Error **errp)
|
||||
CharBackend *mad_chr_be, Error **errp)
|
||||
{
|
||||
int i;
|
||||
int ret = 0;
|
||||
@ -763,7 +991,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
|
||||
memset(backend_dev, 0, sizeof(*backend_dev));
|
||||
|
||||
backend_dev->dev = pdev;
|
||||
|
||||
backend_dev->mad_chr_be = mad_chr_be;
|
||||
backend_dev->backend_gid_idx = backend_gid_idx;
|
||||
backend_dev->port_num = port_num;
|
||||
backend_dev->rdma_dev_res = rdma_dev_res;
|
||||
@ -854,6 +1082,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
|
||||
pr_dbg("interface_id=0x%" PRIx64 "\n",
|
||||
be64_to_cpu(backend_dev->gid.global.interface_id));
|
||||
|
||||
ret = mad_init(backend_dev);
|
||||
if (ret) {
|
||||
error_setg(errp, "Fail to initialize mad");
|
||||
ret = -EIO;
|
||||
goto out_destroy_comm_channel;
|
||||
}
|
||||
|
||||
backend_dev->comp_thread.run = false;
|
||||
backend_dev->comp_thread.is_running = false;
|
||||
|
||||
@ -890,6 +1125,7 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev)
|
||||
void rdma_backend_fini(RdmaBackendDev *backend_dev)
|
||||
{
|
||||
rdma_backend_stop(backend_dev);
|
||||
mad_fini(backend_dev);
|
||||
g_hash_table_destroy(ah_hash);
|
||||
ibv_destroy_comp_channel(backend_dev->channel);
|
||||
ibv_close_device(backend_dev->context);
|
||||
|
@ -17,6 +17,8 @@
|
||||
#define RDMA_BACKEND_H
|
||||
|
||||
#include "qapi/error.h"
|
||||
#include "chardev/char-fe.h"
|
||||
|
||||
#include "rdma_rm_defs.h"
|
||||
#include "rdma_backend_defs.h"
|
||||
|
||||
@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
|
||||
RdmaDeviceResources *rdma_dev_res,
|
||||
const char *backend_device_name, uint8_t port_num,
|
||||
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
|
||||
Error **errp);
|
||||
CharBackend *mad_chr_be, Error **errp);
|
||||
void rdma_backend_fini(RdmaBackendDev *backend_dev);
|
||||
void rdma_backend_start(RdmaBackendDev *backend_dev);
|
||||
void rdma_backend_stop(RdmaBackendDev *backend_dev);
|
||||
|
@ -16,8 +16,9 @@
|
||||
#ifndef RDMA_BACKEND_DEFS_H
|
||||
#define RDMA_BACKEND_DEFS_H
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
#include "qemu/thread.h"
|
||||
#include "chardev/char-fe.h"
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
typedef struct RdmaDeviceResources RdmaDeviceResources;
|
||||
|
||||
@ -28,6 +29,11 @@ typedef struct RdmaBackendThread {
|
||||
bool is_running; /* Set by the thread to report its status */
|
||||
} RdmaBackendThread;
|
||||
|
||||
typedef struct RecvMadList {
|
||||
QemuMutex lock;
|
||||
QList *list;
|
||||
} RecvMadList;
|
||||
|
||||
typedef struct RdmaBackendDev {
|
||||
struct ibv_device_attr dev_attr;
|
||||
RdmaBackendThread comp_thread;
|
||||
@ -39,6 +45,8 @@ typedef struct RdmaBackendDev {
|
||||
struct ibv_comp_channel *channel;
|
||||
uint8_t port_num;
|
||||
uint8_t backend_gid_idx;
|
||||
RecvMadList recv_mads_list;
|
||||
CharBackend *mad_chr_be;
|
||||
} RdmaBackendDev;
|
||||
|
||||
typedef struct RdmaBackendPD {
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "qemu/units.h"
|
||||
#include "hw/pci/pci.h"
|
||||
#include "hw/pci/msix.h"
|
||||
#include "chardev/char-fe.h"
|
||||
|
||||
#include "../rdma_backend_defs.h"
|
||||
#include "../rdma_rm_defs.h"
|
||||
@ -83,6 +84,7 @@ typedef struct PVRDMADev {
|
||||
uint8_t backend_port_num;
|
||||
RdmaBackendDev backend_dev;
|
||||
RdmaDeviceResources rdma_dev_res;
|
||||
CharBackend mad_chr;
|
||||
} PVRDMADev;
|
||||
#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
|
||||
|
||||
|
@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = {
|
||||
DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
|
||||
dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
|
||||
DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
|
||||
DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr),
|
||||
DEFINE_PROP_END_OF_LIST(),
|
||||
};
|
||||
|
||||
@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
|
||||
|
||||
rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
|
||||
dev->backend_device_name, dev->backend_port_num,
|
||||
dev->backend_gid_idx, &dev->dev_attr, errp);
|
||||
dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr,
|
||||
errp);
|
||||
if (rc) {
|
||||
goto out;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user