2007-09-10 19:50:12 +02:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
|
|
|
|
*
|
|
|
|
* This software is available to you under a choice of one of two
|
|
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
|
|
* General Public License (GPL) Version 2, available from the file
|
|
|
|
* COPYING in the main directory of this source tree, or the BSD-type
|
|
|
|
* license below:
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials provided
|
|
|
|
* with the distribution.
|
|
|
|
*
|
|
|
|
* Neither the name of the Network Appliance, Inc. nor the names of
|
|
|
|
* its contributors may be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written
|
|
|
|
* permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* transport.c
|
|
|
|
*
|
|
|
|
* This file contains the top-level implementation of an RPC RDMA
|
|
|
|
* transport.
|
|
|
|
*
|
|
|
|
* Naming convention: functions beginning with xprt_ are part of the
|
|
|
|
* transport switch. All others are RPC RDMA internal.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/init.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surroundings. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 09:04:11 +01:00
|
|
|
#include <linux/slab.h>
|
2007-09-10 19:50:12 +02:00
|
|
|
#include <linux/seq_file.h>
|
2013-02-04 18:50:00 +01:00
|
|
|
#include <linux/sunrpc/addr.h>
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
#include "xprt_rdma.h"
|
|
|
|
|
2014-11-17 22:58:04 +01:00
|
|
|
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
|
2007-09-10 19:50:12 +02:00
|
|
|
# define RPCDBG_FACILITY RPCDBG_TRANS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
MODULE_LICENSE("Dual BSD/GPL");
|
|
|
|
|
|
|
|
MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
|
|
|
|
MODULE_AUTHOR("Network Appliance, Inc.");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tunables
|
|
|
|
*/
|
|
|
|
|
|
|
|
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
|
|
|
|
static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
|
|
|
|
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
|
|
|
|
static unsigned int xprt_rdma_inline_write_padding;
|
2008-10-09 21:00:20 +02:00
|
|
|
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
|
2014-11-09 02:14:53 +01:00
|
|
|
int xprt_rdma_pad_optimize = 1;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2014-11-17 22:58:04 +01:00
|
|
|
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
|
|
|
|
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
|
|
|
|
static unsigned int zero;
|
|
|
|
static unsigned int max_padding = PAGE_SIZE;
|
|
|
|
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
|
|
|
|
static unsigned int max_memreg = RPCRDMA_LAST - 1;
|
|
|
|
|
|
|
|
static struct ctl_table_header *sunrpc_table_header;
|
|
|
|
|
2013-06-12 08:04:25 +02:00
|
|
|
static struct ctl_table xr_tunables_table[] = {
|
2007-09-10 19:50:12 +02:00
|
|
|
{
|
|
|
|
.procname = "rdma_slot_table_entries",
|
|
|
|
.data = &xprt_rdma_slot_table_entries,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
2007-09-10 19:50:12 +02:00
|
|
|
.extra1 = &min_slot_table_size,
|
|
|
|
.extra2 = &max_slot_table_size
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rdma_max_inline_read",
|
|
|
|
.data = &xprt_rdma_max_inline_read,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec,
|
2007-09-10 19:50:12 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rdma_max_inline_write",
|
|
|
|
.data = &xprt_rdma_max_inline_write,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec,
|
2007-09-10 19:50:12 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rdma_inline_write_padding",
|
|
|
|
.data = &xprt_rdma_inline_write_padding,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
2007-09-10 19:50:12 +02:00
|
|
|
.extra1 = &zero,
|
|
|
|
.extra2 = &max_padding,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rdma_memreg_strategy",
|
|
|
|
.data = &xprt_rdma_memreg_strategy,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
2007-09-10 19:50:12 +02:00
|
|
|
.extra1 = &min_memreg,
|
|
|
|
.extra2 = &max_memreg,
|
|
|
|
},
|
2008-10-09 21:01:11 +02:00
|
|
|
{
|
|
|
|
.procname = "rdma_pad_optimize",
|
|
|
|
.data = &xprt_rdma_pad_optimize,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
2009-11-16 12:11:48 +01:00
|
|
|
.proc_handler = proc_dointvec,
|
2008-10-09 21:01:11 +02:00
|
|
|
},
|
2009-11-05 22:32:03 +01:00
|
|
|
{ },
|
2007-09-10 19:50:12 +02:00
|
|
|
};
|
|
|
|
|
2013-06-12 08:04:25 +02:00
|
|
|
static struct ctl_table sunrpc_table[] = {
|
2007-09-10 19:50:12 +02:00
|
|
|
{
|
|
|
|
.procname = "sunrpc",
|
|
|
|
.mode = 0555,
|
|
|
|
.child = xr_tunables_table
|
|
|
|
},
|
2009-11-05 22:32:03 +01:00
|
|
|
{ },
|
2007-09-10 19:50:12 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2014-05-28 16:34:32 +02:00
|
|
|
/*
 * Transport timeouts, expressed in jiffies.
 *
 * BIND, INIT_REEST and IDLE_DISC seed the corresponding rpc_xprt
 * timeout fields in xprt_setup_rdma(); INIT_REEST and MAX_REEST are
 * the lower and upper clamps applied to the reconnect back-off in
 * xprt_rdma_connect().
 */
#define RPCRDMA_BIND_TO		(60U * HZ)
#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)
|
|
|
|
|
2007-09-10 19:50:12 +02:00
|
|
|
/* Transport switch operations; declared here so xprt_setup_rdma() can reference them, initialised near the end of this file */
static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
|
|
|
|
|
|
|
|
static void
|
|
|
|
xprt_rdma_format_addresses(struct rpc_xprt *xprt)
|
|
|
|
{
|
2009-08-09 21:09:36 +02:00
|
|
|
struct sockaddr *sap = (struct sockaddr *)
|
2007-09-10 19:50:12 +02:00
|
|
|
&rpcx_to_rdmad(xprt).addr;
|
2009-08-09 21:09:36 +02:00
|
|
|
struct sockaddr_in *sin = (struct sockaddr_in *)sap;
|
|
|
|
char buf[64];
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2009-08-09 21:09:36 +02:00
|
|
|
(void)rpc_ntop(sap, buf, sizeof(buf));
|
|
|
|
xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2010-03-08 21:15:59 +01:00
|
|
|
snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
|
2009-08-09 21:09:36 +02:00
|
|
|
xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
|
|
|
|
|
2010-03-08 21:15:59 +01:00
|
|
|
snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
|
2009-08-09 21:09:36 +02:00
|
|
|
xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
|
|
|
|
|
2010-03-08 21:15:59 +01:00
|
|
|
snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
|
2009-08-09 21:09:36 +02:00
|
|
|
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
/* netid */
|
|
|
|
xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
|
|
|
|
{
|
2008-01-14 18:32:20 +01:00
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
for (i = 0; i < RPC_DISPLAY_MAX; i++)
|
|
|
|
switch (i) {
|
|
|
|
case RPC_DISPLAY_PROTO:
|
|
|
|
case RPC_DISPLAY_NETID:
|
|
|
|
continue;
|
|
|
|
default:
|
|
|
|
kfree(xprt->address_strings[i]);
|
|
|
|
}
|
2007-09-10 19:50:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
xprt_rdma_connect_worker(struct work_struct *work)
|
|
|
|
{
|
2015-01-21 17:02:37 +01:00
|
|
|
struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
|
|
|
|
rx_connect_worker.work);
|
|
|
|
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
|
2007-09-10 19:50:12 +02:00
|
|
|
int rc = 0;
|
|
|
|
|
2012-09-11 23:21:25 +02:00
|
|
|
xprt_clear_connected(xprt);
|
|
|
|
|
|
|
|
dprintk("RPC: %s: %sconnect\n", __func__,
|
|
|
|
r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
|
|
|
|
rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
|
|
|
if (rc)
|
|
|
|
xprt_wake_pending_tasks(xprt, rc);
|
|
|
|
|
2007-09-10 19:50:12 +02:00
|
|
|
dprintk("RPC: %s: exit\n", __func__);
|
|
|
|
xprt_clear_connecting(xprt);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* xprt_rdma_destroy
|
|
|
|
*
|
|
|
|
* Destroy the xprt.
|
|
|
|
* Free all memory associated with the object, including its own.
|
|
|
|
* NOTE: none of the *destroy methods free memory for their top-level
|
|
|
|
* objects, even though they may have allocated it (they do free
|
|
|
|
* private memory). It's up to the caller to handle it. In this
|
|
|
|
* case (RDMA transport), all structure memory is inlined with the
|
|
|
|
* struct rpcrdma_xprt.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xprt_rdma_destroy(struct rpc_xprt *xprt)
|
|
|
|
{
|
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
|
|
|
|
dprintk("RPC: %s: called\n", __func__);
|
|
|
|
|
2015-01-21 17:02:37 +01:00
|
|
|
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
xprt_clear_connected(xprt);
|
|
|
|
|
|
|
|
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
|
2014-05-28 16:33:16 +02:00
|
|
|
rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
2007-09-10 19:50:12 +02:00
|
|
|
rpcrdma_ia_close(&r_xprt->rx_ia);
|
|
|
|
|
|
|
|
xprt_rdma_free_addresses(xprt);
|
|
|
|
|
2010-09-29 14:03:13 +02:00
|
|
|
xprt_free(xprt);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
dprintk("RPC: %s: returning\n", __func__);
|
|
|
|
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
2007-12-20 22:03:54 +01:00
|
|
|
static const struct rpc_timeout xprt_rdma_default_timeout = {
|
|
|
|
.to_initval = 60 * HZ,
|
|
|
|
.to_maxval = 60 * HZ,
|
|
|
|
};
|
|
|
|
|
2007-09-10 19:50:12 +02:00
|
|
|
/**
|
|
|
|
* xprt_setup_rdma - Set up transport to use RDMA
|
|
|
|
*
|
|
|
|
* @args: rpc transport arguments
|
|
|
|
*/
|
|
|
|
static struct rpc_xprt *
|
|
|
|
xprt_setup_rdma(struct xprt_create *args)
|
|
|
|
{
|
|
|
|
struct rpcrdma_create_data_internal cdata;
|
|
|
|
struct rpc_xprt *xprt;
|
|
|
|
struct rpcrdma_xprt *new_xprt;
|
|
|
|
struct rpcrdma_ep *new_ep;
|
|
|
|
struct sockaddr_in *sin;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
if (args->addrlen > sizeof(xprt->addr)) {
|
|
|
|
dprintk("RPC: %s: address too large\n", __func__);
|
|
|
|
return ERR_PTR(-EBADF);
|
|
|
|
}
|
|
|
|
|
2010-09-29 14:05:43 +02:00
|
|
|
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
|
2011-07-18 00:11:30 +02:00
|
|
|
xprt_rdma_slot_table_entries,
|
2010-09-29 14:02:43 +02:00
|
|
|
xprt_rdma_slot_table_entries);
|
2007-09-10 19:50:12 +02:00
|
|
|
if (xprt == NULL) {
|
|
|
|
dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
|
|
|
|
__func__);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 60 second timeout, no retries */
|
2007-12-20 22:03:55 +01:00
|
|
|
xprt->timeout = &xprt_rdma_default_timeout;
|
2014-05-28 16:34:32 +02:00
|
|
|
xprt->bind_timeout = RPCRDMA_BIND_TO;
|
|
|
|
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
|
|
|
|
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
xprt->resvport = 0; /* privileged port not needed */
|
|
|
|
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
|
|
|
|
xprt->ops = &xprt_rdma_procs;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set up RDMA-specific connect data.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Put server RDMA address in local cdata */
|
|
|
|
memcpy(&cdata.addr, args->dstaddr, args->addrlen);
|
|
|
|
|
|
|
|
/* Ensure xprt->addr holds valid server TCP (not RDMA)
|
|
|
|
* address, for any side protocols which peek at it */
|
|
|
|
xprt->prot = IPPROTO_TCP;
|
|
|
|
xprt->addrlen = args->addrlen;
|
|
|
|
memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
|
|
|
|
|
|
|
|
sin = (struct sockaddr_in *)&cdata.addr;
|
|
|
|
if (ntohs(sin->sin_port) != 0)
|
|
|
|
xprt_set_bound(xprt);
|
|
|
|
|
2008-10-31 08:54:56 +01:00
|
|
|
dprintk("RPC: %s: %pI4:%u\n",
|
|
|
|
__func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
/* Set max requests */
|
|
|
|
cdata.max_requests = xprt->max_reqs;
|
|
|
|
|
|
|
|
/* Set some length limits */
|
|
|
|
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
|
|
|
|
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
|
|
|
|
|
|
|
|
cdata.inline_wsize = xprt_rdma_max_inline_write;
|
|
|
|
if (cdata.inline_wsize > cdata.wsize)
|
|
|
|
cdata.inline_wsize = cdata.wsize;
|
|
|
|
|
|
|
|
cdata.inline_rsize = xprt_rdma_max_inline_read;
|
|
|
|
if (cdata.inline_rsize > cdata.rsize)
|
|
|
|
cdata.inline_rsize = cdata.rsize;
|
|
|
|
|
|
|
|
cdata.padding = xprt_rdma_inline_write_padding;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create new transport instance, which includes initialized
|
|
|
|
* o ia
|
|
|
|
* o endpoint
|
|
|
|
* o buffers
|
|
|
|
*/
|
|
|
|
|
|
|
|
new_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
|
|
|
|
rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
|
|
|
|
xprt_rdma_memreg_strategy);
|
|
|
|
if (rc)
|
|
|
|
goto out1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize and create ep
|
|
|
|
*/
|
|
|
|
new_xprt->rx_data = cdata;
|
|
|
|
new_ep = &new_xprt->rx_ep;
|
|
|
|
new_ep->rep_remote_addr = cdata.addr;
|
|
|
|
|
|
|
|
rc = rpcrdma_ep_create(&new_xprt->rx_ep,
|
|
|
|
&new_xprt->rx_ia, &new_xprt->rx_data);
|
|
|
|
if (rc)
|
|
|
|
goto out2;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate pre-registered send and receive buffers for headers and
|
|
|
|
* any inline data. Also specify any padding which will be provided
|
|
|
|
* from a preregistered zero buffer.
|
|
|
|
*/
|
2015-01-21 17:03:44 +01:00
|
|
|
rc = rpcrdma_buffer_create(new_xprt);
|
2007-09-10 19:50:12 +02:00
|
|
|
if (rc)
|
|
|
|
goto out3;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a callback for connection events. This is necessary because
|
|
|
|
* connection loss notification is async. We also catch connection loss
|
|
|
|
* when reaping receives.
|
|
|
|
*/
|
2015-01-21 17:02:37 +01:00
|
|
|
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
|
|
|
|
xprt_rdma_connect_worker);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
xprt_rdma_format_addresses(xprt);
|
2014-07-29 23:23:34 +02:00
|
|
|
xprt->max_payload = rpcrdma_max_payload(new_xprt);
|
|
|
|
dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
|
|
|
|
__func__, xprt->max_payload);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
if (!try_module_get(THIS_MODULE))
|
|
|
|
goto out4;
|
|
|
|
|
|
|
|
return xprt;
|
|
|
|
|
|
|
|
out4:
|
|
|
|
xprt_rdma_free_addresses(xprt);
|
|
|
|
rc = -EINVAL;
|
|
|
|
out3:
|
2014-05-28 16:33:16 +02:00
|
|
|
rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
|
2007-09-10 19:50:12 +02:00
|
|
|
out2:
|
|
|
|
rpcrdma_ia_close(&new_xprt->rx_ia);
|
|
|
|
out1:
|
2010-09-29 14:03:13 +02:00
|
|
|
xprt_free(xprt);
|
2007-09-10 19:50:12 +02:00
|
|
|
return ERR_PTR(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Close a connection, during shutdown or timeout/reconnect
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xprt_rdma_close(struct rpc_xprt *xprt)
|
|
|
|
{
|
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
|
|
|
|
dprintk("RPC: %s: closing\n", __func__);
|
2008-10-10 17:32:34 +02:00
|
|
|
if (r_xprt->rx_ep.rep_connected > 0)
|
|
|
|
xprt->reestablish_timeout = 0;
|
2007-11-07 00:44:20 +01:00
|
|
|
xprt_disconnect_done(xprt);
|
2014-07-29 23:25:55 +02:00
|
|
|
rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
2007-09-10 19:50:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
|
|
|
|
{
|
|
|
|
struct sockaddr_in *sap;
|
|
|
|
|
|
|
|
sap = (struct sockaddr_in *)&xprt->addr;
|
|
|
|
sap->sin_port = htons(port);
|
|
|
|
sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
|
|
|
|
sap->sin_port = htons(port);
|
|
|
|
dprintk("RPC: %s: %u\n", __func__, port);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-01-08 15:26:49 +01:00
|
|
|
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
|
2007-09-10 19:50:12 +02:00
|
|
|
{
|
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
|
2010-04-16 22:41:57 +02:00
|
|
|
if (r_xprt->rx_ep.rep_connected != 0) {
|
|
|
|
/* Reconnect */
|
2015-01-21 17:02:37 +01:00
|
|
|
schedule_delayed_work(&r_xprt->rx_connect_worker,
|
|
|
|
xprt->reestablish_timeout);
|
2010-04-16 22:41:57 +02:00
|
|
|
xprt->reestablish_timeout <<= 1;
|
2014-05-28 16:34:32 +02:00
|
|
|
if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
|
|
|
|
xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
|
|
|
|
else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
|
|
|
|
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
|
2010-04-16 22:41:57 +02:00
|
|
|
} else {
|
2015-01-21 17:02:37 +01:00
|
|
|
schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
|
2010-04-16 22:41:57 +02:00
|
|
|
if (!RPC_IS_ASYNC(task))
|
2015-01-21 17:02:37 +01:00
|
|
|
flush_delayed_work(&r_xprt->rx_connect_worker);
|
2007-09-10 19:50:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The RDMA allocate/free functions need the task structure as a place
|
|
|
|
* to hide the struct rpcrdma_req, which is necessary for the actual send/recv
|
2015-01-21 17:04:08 +01:00
|
|
|
* sequence.
|
|
|
|
*
|
|
|
|
* The RPC layer allocates both send and receive buffers in the same call
|
|
|
|
* (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
|
|
|
|
* We may register rq_rcv_buf when using reply chunks.
|
2007-09-10 19:50:12 +02:00
|
|
|
*/
|
|
|
|
static void *
|
|
|
|
xprt_rdma_allocate(struct rpc_task *task, size_t size)
|
|
|
|
{
|
2013-01-08 15:10:21 +01:00
|
|
|
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
|
2015-01-21 17:04:08 +01:00
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
struct rpcrdma_regbuf *rb;
|
|
|
|
struct rpcrdma_req *req;
|
|
|
|
size_t min_size;
|
|
|
|
gfp_t flags = task->tk_flags & RPC_TASK_SWAPPER ?
|
|
|
|
GFP_ATOMIC : GFP_NOFS;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2015-01-21 17:04:08 +01:00
|
|
|
req = rpcrdma_buffer_get(&r_xprt->rx_buf);
|
2014-05-28 16:35:06 +02:00
|
|
|
if (req == NULL)
|
|
|
|
return NULL;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2015-01-21 17:04:08 +01:00
|
|
|
if (req->rl_sendbuf == NULL)
|
|
|
|
goto out_sendbuf;
|
|
|
|
if (size > req->rl_sendbuf->rg_size)
|
|
|
|
goto out_sendbuf;
|
|
|
|
|
|
|
|
out:
|
2007-09-10 19:50:12 +02:00
|
|
|
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
|
2008-10-09 21:00:40 +02:00
|
|
|
req->rl_connect_cookie = 0; /* our reserved value */
|
2015-01-21 17:04:08 +01:00
|
|
|
return req->rl_sendbuf->rg_base;
|
|
|
|
|
|
|
|
out_sendbuf:
|
|
|
|
/* XDR encoding and RPC/RDMA marshaling of this request has not
|
|
|
|
* yet occurred. Thus a lower bound is needed to prevent buffer
|
|
|
|
* overrun during marshaling.
|
|
|
|
*
|
|
|
|
* RPC/RDMA marshaling may choose to send payload bearing ops
|
|
|
|
* inline, if the result is smaller than the inline threshold.
|
|
|
|
* The value of the "size" argument accounts for header
|
|
|
|
* requirements but not for the payload in these cases.
|
|
|
|
*
|
|
|
|
* Likewise, allocate enough space to receive a reply up to the
|
|
|
|
* size of the inline threshold.
|
|
|
|
*
|
|
|
|
* It's unlikely that both the send header and the received
|
|
|
|
* reply will be large, but slush is provided here to allow
|
|
|
|
* flexibility when marshaling.
|
|
|
|
*/
|
|
|
|
min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
|
|
|
|
min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
|
|
|
|
if (size < min_size)
|
|
|
|
size = min_size;
|
|
|
|
|
|
|
|
rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
|
|
|
|
if (IS_ERR(rb))
|
|
|
|
goto out_fail;
|
|
|
|
rb->rg_owner = req;
|
|
|
|
|
|
|
|
r_xprt->rx_stats.hardway_register_count += size;
|
|
|
|
rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
|
|
|
|
req->rl_sendbuf = rb;
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
out_fail:
|
2007-09-10 19:50:12 +02:00
|
|
|
rpcrdma_buffer_put(req);
|
2015-01-21 17:04:08 +01:00
|
|
|
r_xprt->rx_stats.failed_marshal_count++;
|
2007-09-10 19:50:12 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function returns all RDMA resources to the pool.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xprt_rdma_free(void *buffer)
|
|
|
|
{
|
|
|
|
struct rpcrdma_req *req;
|
|
|
|
struct rpcrdma_xprt *r_xprt;
|
2015-01-21 17:04:08 +01:00
|
|
|
struct rpcrdma_regbuf *rb;
|
2007-09-10 19:50:12 +02:00
|
|
|
int i;
|
|
|
|
|
|
|
|
if (buffer == NULL)
|
|
|
|
return;
|
|
|
|
|
2015-01-21 17:04:08 +01:00
|
|
|
rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
|
|
|
|
req = rb->rg_owner;
|
|
|
|
r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2015-01-21 17:04:08 +01:00
|
|
|
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
for (i = 0; req->rl_nchunks;) {
|
|
|
|
--req->rl_nchunks;
|
|
|
|
i += rpcrdma_deregister_external(
|
2014-05-28 16:33:08 +02:00
|
|
|
&req->rl_segments[i], r_xprt);
|
2007-09-10 19:50:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
rpcrdma_buffer_put(req);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* send_request invokes the meat of RPC RDMA. It must do the following:
|
|
|
|
* 1. Marshal the RPC request into an RPC RDMA request, which means
|
|
|
|
* putting a header in front of data, and creating IOVs for RDMA
|
|
|
|
* from those in the request.
|
|
|
|
* 2. In marshaling, detect opportunities for RDMA, and use them.
|
|
|
|
* 3. Post a recv message to set up asynch completion, then send
|
|
|
|
* the request (rpcrdma_ep_post).
|
|
|
|
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
xprt_rdma_send_request(struct rpc_task *task)
|
|
|
|
{
|
|
|
|
struct rpc_rqst *rqst = task->tk_rqstp;
|
2013-01-08 15:10:21 +01:00
|
|
|
struct rpc_xprt *xprt = rqst->rq_xprt;
|
2007-09-10 19:50:12 +02:00
|
|
|
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
|
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
2014-07-29 23:23:43 +02:00
|
|
|
int rc = 0;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2014-07-29 23:23:43 +02:00
|
|
|
if (req->rl_niovs == 0)
|
2014-05-28 16:35:14 +02:00
|
|
|
rc = rpcrdma_marshal_req(rqst);
|
2014-11-09 02:14:29 +01:00
|
|
|
else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
|
2014-07-29 23:23:43 +02:00
|
|
|
rc = rpcrdma_marshal_chunks(rqst, 0);
|
|
|
|
if (rc < 0)
|
|
|
|
goto failed_marshal;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
|
|
|
if (req->rl_reply == NULL) /* e.g. reconnection */
|
|
|
|
rpcrdma_recv_buffer_get(req);
|
|
|
|
|
|
|
|
if (req->rl_reply) {
|
|
|
|
req->rl_reply->rr_func = rpcrdma_reply_handler;
|
|
|
|
/* this need only be done once, but... */
|
|
|
|
req->rl_reply->rr_xprt = xprt;
|
|
|
|
}
|
|
|
|
|
2008-10-09 21:00:40 +02:00
|
|
|
/* Must suppress retransmit to maintain credits */
|
|
|
|
if (req->rl_connect_cookie == xprt->connect_cookie)
|
|
|
|
goto drop_connection;
|
|
|
|
req->rl_connect_cookie = xprt->connect_cookie;
|
|
|
|
|
|
|
|
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
|
|
|
|
goto drop_connection;
|
2007-09-10 19:50:12 +02:00
|
|
|
|
2010-05-13 18:51:49 +02:00
|
|
|
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
|
2007-09-10 19:50:12 +02:00
|
|
|
rqst->rq_bytes_sent = 0;
|
|
|
|
return 0;
|
2008-10-09 21:00:40 +02:00
|
|
|
|
2014-05-28 16:35:14 +02:00
|
|
|
failed_marshal:
|
|
|
|
r_xprt->rx_stats.failed_marshal_count++;
|
|
|
|
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
|
|
|
|
__func__, rc);
|
|
|
|
if (rc == -EIO)
|
|
|
|
return -EIO;
|
2008-10-09 21:00:40 +02:00
|
|
|
drop_connection:
|
|
|
|
xprt_disconnect_done(xprt);
|
|
|
|
return -ENOTCONN; /* implies disconnect */
|
2007-09-10 19:50:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
|
|
|
long idle_time = 0;
|
|
|
|
|
|
|
|
if (xprt_connected(xprt))
|
|
|
|
idle_time = (long)(jiffies - xprt->last_used) / HZ;
|
|
|
|
|
|
|
|
seq_printf(seq,
|
|
|
|
"\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
|
|
|
|
"%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
|
|
|
|
|
|
|
|
0, /* need a local port? */
|
|
|
|
xprt->stat.bind_count,
|
|
|
|
xprt->stat.connect_count,
|
|
|
|
xprt->stat.connect_time,
|
|
|
|
idle_time,
|
|
|
|
xprt->stat.sends,
|
|
|
|
xprt->stat.recvs,
|
|
|
|
xprt->stat.bad_xids,
|
|
|
|
xprt->stat.req_u,
|
|
|
|
xprt->stat.bklog_u,
|
|
|
|
|
|
|
|
r_xprt->rx_stats.read_chunk_count,
|
|
|
|
r_xprt->rx_stats.write_chunk_count,
|
|
|
|
r_xprt->rx_stats.reply_chunk_count,
|
|
|
|
r_xprt->rx_stats.total_rdma_request,
|
|
|
|
r_xprt->rx_stats.total_rdma_reply,
|
|
|
|
r_xprt->rx_stats.pullup_copy_count,
|
|
|
|
r_xprt->rx_stats.fixup_copy_count,
|
|
|
|
r_xprt->rx_stats.hardway_register_count,
|
|
|
|
r_xprt->rx_stats.failed_marshal_count,
|
|
|
|
r_xprt->rx_stats.bad_reply_count);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Plumbing for rpc transport switch and kernel module
|
|
|
|
*/
|
|
|
|
|
|
|
|
static struct rpc_xprt_ops xprt_rdma_procs = {
|
2014-05-28 16:34:57 +02:00
|
|
|
.reserve_xprt = xprt_reserve_xprt_cong,
|
2007-09-10 19:50:12 +02:00
|
|
|
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
|
2012-09-07 17:08:50 +02:00
|
|
|
.alloc_slot = xprt_alloc_slot,
|
2007-09-10 19:50:12 +02:00
|
|
|
.release_request = xprt_release_rqst_cong, /* ditto */
|
|
|
|
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
|
|
|
|
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
|
|
|
|
.set_port = xprt_rdma_set_port,
|
|
|
|
.connect = xprt_rdma_connect,
|
|
|
|
.buf_alloc = xprt_rdma_allocate,
|
|
|
|
.buf_free = xprt_rdma_free,
|
|
|
|
.send_request = xprt_rdma_send_request,
|
|
|
|
.close = xprt_rdma_close,
|
|
|
|
.destroy = xprt_rdma_destroy,
|
|
|
|
.print_stats = xprt_rdma_print_stats
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct xprt_class xprt_rdma = {
|
|
|
|
.list = LIST_HEAD_INIT(xprt_rdma.list),
|
|
|
|
.name = "rdma",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.ident = XPRT_TRANSPORT_RDMA,
|
|
|
|
.setup = xprt_setup_rdma,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * xprt_rdma_cleanup - module exit
 *
 * Removes the sysctl table (debug builds only, and only if it was
 * registered), then unregisters the transport class. A failure to
 * unregister is merely logged.
 */
static void __exit xprt_rdma_cleanup(void)
{
	int status;

	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (sunrpc_table_header != NULL) {
		unregister_sysctl_table(sunrpc_table_header);
		sunrpc_table_header = NULL;
	}
#endif
	status = xprt_unregister_transport(&xprt_rdma);
	if (status != 0)
		dprintk("RPC: %s: xprt_unregister returned %i\n",
			__func__, status);
}
|
|
|
|
|
|
|
|
/*
 * xprt_rdma_init - module entry point
 *
 * Registers the "rdma" transport class, logs the default tunables and,
 * on debug builds, registers the sysctl table unless that has already
 * been done.
 *
 * Returns 0 on success or the error from xprt_register_transport().
 * The result of register_sysctl_table() is stored but not checked.
 */
static int __init xprt_rdma_init(void)
{
	int status;

	status = xprt_register_transport(&xprt_rdma);
	if (status != 0)
		return status;

	dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");

	dprintk("Defaults:\n");
	dprintk("\tSlots %d\n"
		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
		xprt_rdma_slot_table_entries,
		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
	dprintk("\tPadding %d\n\tMemreg %d\n",
		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (sunrpc_table_header == NULL)
		sunrpc_table_header = register_sysctl_table(sunrpc_table);
#endif
	return 0;
}
|
|
|
|
|
|
|
|
/* Register xprt_rdma_init() as the module's initialisation routine */
module_init(xprt_rdma_init);
|
|
|
|
/* Register xprt_rdma_cleanup() as the module's exit routine */
module_exit(xprt_rdma_cleanup);
|