smc: receive data from RMBE

Move RMBE data into the user space buffer and update the cursors that manage it.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Ursula Braun 2017-01-09 16:55:24 +01:00 committed by David S. Miller
parent e6727f3900
commit 952310ccf2
9 changed files with 304 additions and 3 deletions

View File

@ -1,3 +1,3 @@
obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC) += smc.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc-y += smc_cdc.o smc_tx.o smc_rx.o

View File

@ -38,6 +38,7 @@
#include "smc_ib.h" #include "smc_ib.h"
#include "smc_pnet.h" #include "smc_pnet.h"
#include "smc_tx.h" #include "smc_tx.h"
#include "smc_rx.h"
static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
* creation * creation
@ -412,6 +413,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
mutex_unlock(&smc_create_lgr_pending); mutex_unlock(&smc_create_lgr_pending);
smc_tx_init(smc); smc_tx_init(smc);
smc_rx_init(smc);
out_connected: out_connected:
smc_copy_sock_settings_to_clc(smc); smc_copy_sock_settings_to_clc(smc);
@ -755,6 +757,7 @@ static void smc_listen_work(struct work_struct *work)
} }
smc_tx_init(new_smc); smc_tx_init(new_smc);
smc_rx_init(new_smc);
out_connected: out_connected:
sk_refcnt_debug_inc(newsmcsk); sk_refcnt_debug_inc(newsmcsk);
@ -950,7 +953,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (smc->use_fallback) if (smc->use_fallback)
rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
else else
rc = sock_no_recvmsg(sock, msg, len, flags); rc = smc_rx_recvmsg(smc, msg, len, flags);
out: out:
release_sock(sk); release_sock(sk);
return rc; return rc;
@ -1016,6 +1019,8 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
} }
if (atomic_read(&smc->conn.bytes_to_rcv))
mask |= POLLIN | POLLRDNORM;
/* for now - to be enhanced in follow-on patch */ /* for now - to be enhanced in follow-on patch */
} }

View File

@ -115,6 +115,10 @@ struct smc_connection {
struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
int rmbe_size; /* RMBE size <== sock rmem */ int rmbe_size; /* RMBE size <== sock rmem */
int rmbe_size_short;/* compressed notation */ int rmbe_size_short;/* compressed notation */
int rmbe_update_limit;
/* lower limit for consumer
* cursor update
*/
struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
* buffer for CDC msg send * buffer for CDC msg send

View File

@ -15,6 +15,7 @@
#include "smc_wr.h" #include "smc_wr.h"
#include "smc_cdc.h" #include "smc_cdc.h"
#include "smc_tx.h" #include "smc_tx.h"
#include "smc_rx.h"
/********************************** send *************************************/ /********************************** send *************************************/
@ -197,6 +198,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
atomic_add(diff_prod, &conn->bytes_to_rcv); atomic_add(diff_prod, &conn->bytes_to_rcv);
/* guarantee 0 <= bytes_to_rcv <= rmbe_size */ /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
smp_mb__after_atomic(); smp_mb__after_atomic();
smc->sk.sk_data_ready(&smc->sk);
} }
if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
@ -216,7 +218,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
return; return;
/* data available */ /* data available */
/* subsequent patch: send delayed ack, wake receivers */ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
(conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
smc_tx_consumer_update(conn);
} }
/* called under tasklet context */ /* called under tasklet context */

View File

@ -489,6 +489,15 @@ struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
return NULL; return NULL;
} }
/* one of the conditions for announcing a receiver's current window size is
* that it "results in a minimum increase in the window size of 10% of the
* receive buffer space" [RFC7609]
*/
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
/* create the tx buffer for an SMC socket */ /* create the tx buffer for an SMC socket */
int smc_sndbuf_create(struct smc_sock *smc) int smc_sndbuf_create(struct smc_sock *smc)
{ {
@ -620,6 +629,7 @@ int smc_rmb_create(struct smc_sock *smc)
conn->rmbe_size_short = tmp_bufsize_short; conn->rmbe_size_short = tmp_bufsize_short;
smc->sk.sk_rcvbuf = tmp_bufsize * 2; smc->sk.sk_rcvbuf = tmp_bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0); atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
return 0; return 0;
} else { } else {
return -ENOMEM; return -ENOMEM;

217
net/smc/smc_rx.c Normal file
View File

@ -0,0 +1,217 @@
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Manage RMBE
* copy new RMBE data into user space
*
* Copyright IBM Corp. 2016
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
#include <linux/net.h>
#include <linux/rcupdate.h>
#include <net/sock.h>
#include "smc.h"
#include "smc_core.h"
#include "smc_cdc.h"
#include "smc_tx.h" /* smc_tx_consumer_update() */
#include "smc_rx.h"
/* sk.sk_data_ready() callback for SMC sockets (installed by smc_rx_init()).
 * Wakes up rcvbuf consumers sleeping in smc_rx_wait_data(); it is invoked
 * indirectly from smc_cdc_msg_recv_action().
 * The implementation is derived from sock_def_readable().
 */
static void smc_rx_data_ready(struct sock *sk)
{
	struct socket_wq *wq;
	int band;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);

	/* async waiters see a hang-up once the socket is fully shut down or
	 * closed, plain "input available" otherwise
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == SMC_CLOSED)
		band = POLL_HUP;
	else
		band = POLL_IN;
	sk_wake_async(sk, SOCK_WAKE_WAITD, band);
	rcu_read_unlock();
}
/* blocks rcvbuf consumer until data is available in the rcvbuf, the socket
 * reports an error/shutdown/close condition, the timeout expires, or the
 * wait is interrupted
 * @smc smc socket
 * @timeo pointer to the maximum time to wait; handed to sk_wait_event()
 *        NOTE(review): the unit is presumably jiffies (the caller obtains
 *        the value from sock_rcvtimeo()), not seconds - confirm.
 *        NOTE(review): sk_wait_event() presumably writes the remaining
 *        time back through this pointer - confirm.
 * Returns:
 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
 */
static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
struct sock *sk = &smc->sk;
int rc;
/* fast path: data is already there, no need to touch the wait queue */
if (atomic_read(&conn->bytes_to_rcv))
return 1;
/* flag the socket as waiting for data while we sleep */
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
add_wait_queue(sk_sleep(sk), &wait);
/* wait condition: socket error, receive side shut down, SOCK_DONE set,
 * data arrived, or smc_cdc_rxed_any_close_or_senddone() reports true
 */
rc = sk_wait_event(sk, timeo,
sk->sk_err ||
sk->sk_shutdown & RCV_SHUTDOWN ||
sock_flag(sk, SOCK_DONE) ||
atomic_read(&conn->bytes_to_rcv) ||
smc_cdc_rxed_any_close_or_senddone(conn),
&wait);
remove_wait_queue(sk_sleep(sk), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
return rc;
}
/* rcvbuf consumer: main API called by socket layer.
 * called under sk lock.
 *
 * Copies data from the connection's RMBE (a ring buffer) into @msg and,
 * unless MSG_PEEK is set, advances the local consumer cursor and lets
 * smc_tx_consumer_update() decide whether the peer must be told.
 *
 * @smc   smc socket
 * @msg   destination message (user space buffer)
 * @len   maximum number of bytes to receive
 * @flags MSG_* receive flags; MSG_ERRQUEUE and MSG_OOB are rejected
 *
 * Returns the number of bytes received (0 on shutdown/close without data),
 * or a negative errno:
 *   -EINVAL   unsupported flag
 *   -ENOTCONN socket is listening or was never connected
 *   -EAGAIN   no data and the call must not (or no longer) block
 *   -EFAULT   copy to @msg failed before any byte was delivered
 *   other     pending socket error / interrupted wait
 */
int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
		   int flags)
{
	size_t copylen, read_done = 0, read_remaining = len;
	size_t chunk_len, chunk_off, chunk_len_sum;
	struct smc_connection *conn = &smc->conn;
	union smc_host_cursor cons;
	int readable, chunk;
	char *rcvbuf_base;
	struct sock *sk;
	long timeo;
	int target;		/* Read at least these many bytes */
	int rc;

	if (unlikely(flags & MSG_ERRQUEUE))
		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
	if (flags & MSG_OOB)
		return -EINVAL; /* future work */

	sk = &smc->sk;
	if (sk->sk_state == SMC_LISTEN)
		return -ENOTCONN;
	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	msg->msg_namelen = 0;
	/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
	rcvbuf_base = conn->rmb_desc->cpu_addr;

	do { /* while (read_remaining) */
		if (read_done >= target)
			break;

		if (atomic_read(&conn->bytes_to_rcv))
			goto copy;

		if (read_done) {
			/* partial result available: return it on any
			 * condition that prevents waiting for more
			 */
			if (sk->sk_err ||
			    sk->sk_state == SMC_CLOSED ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current) ||
			    smc_cdc_rxed_any_close_or_senddone(conn) ||
			    conn->local_tx_ctrl.conn_state_flags.
			    peer_conn_abort)
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				read_done = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN ||
			    smc_cdc_rxed_any_close_or_senddone(conn) ||
			    conn->local_tx_ctrl.conn_state_flags.
			    peer_conn_abort)
				break;
			if (sk->sk_state == SMC_CLOSED) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					read_done = -ENOTCONN;
					break;
				}
				break;
			}
			if (signal_pending(current)) {
				read_done = sock_intr_errno(timeo);
				break;
			}
			if (!timeo) {
				/* non-blocking call, or the receive timeout
				 * has expired: waiting again would return
				 * immediately and busy-loop here
				 */
				read_done = -EAGAIN;
				break;
			}
		}

		if (!atomic_read(&conn->bytes_to_rcv)) {
			smc_rx_wait_data(smc, &timeo);
			continue;
		}

copy:
		/* initialize variables for 1st iteration of subsequent loop */
		/* could be just 1 byte, even after smc_rx_wait_data above */
		readable = atomic_read(&conn->bytes_to_rcv);
		/* not more than what user space asked for */
		copylen = min_t(size_t, read_remaining, readable);
		smc_curs_write(&cons,
			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
			       conn);
		/* determine chunks where to read from rcvbuf */
		/* either unwrapped case, or 1st chunk of wrapped case */
		chunk_len = min_t(size_t,
				  copylen, conn->rmbe_size - cons.count);
		chunk_len_sum = chunk_len;
		chunk_off = cons.count;
		for (chunk = 0; chunk < 2; chunk++) {
			if (!(flags & MSG_TRUNC)) {
				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
						   chunk_len);
				if (rc) {
					if (!read_done)
						read_done = -EFAULT;
					goto out;
				}
			}
			read_remaining -= chunk_len;
			read_done += chunk_len;

			if (chunk_len_sum == copylen)
				break; /* either on 1st or 2nd iteration */
			/* prepare next (== 2nd) iteration */
			chunk_len = copylen - chunk_len; /* remainder */
			chunk_len_sum += chunk_len;
			chunk_off = 0; /* modulo offset in recv ring buffer */
		}

		if (flags & MSG_PEEK) {
			/* the consumer cursor is left untouched, so another
			 * pass would copy the very same bytes again; return
			 * what has been peeked so far
			 */
			break;
		}

		/* update cursors */
		smc_curs_add(conn->rmbe_size, &cons, copylen);
		/* increased in recv tasklet smc_cdc_msg_rcv() */
		smp_mb__before_atomic();
		atomic_sub(copylen, &conn->bytes_to_rcv);
		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
		smp_mb__after_atomic();
		smc_curs_write(&conn->local_tx_ctrl.cons,
			       smc_curs_read(&cons, conn),
			       conn);
		/* send consumer cursor update if required */
		/* similar to advertising new TCP rcv_wnd if required */
		smc_tx_consumer_update(conn);
	} while (read_remaining);
out:
	return read_done;
}
/* Initialize receive properties on connection establishment. NB: not __init! */
void smc_rx_init(struct smc_sock *smc)
{
smc->sk.sk_data_ready = smc_rx_data_ready;
}

23
net/smc/smc_rx.h Normal file
View File

@ -0,0 +1,23 @@
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Manage RMBE
*
* Copyright IBM Corp. 2016
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
#ifndef SMC_RX_H
#define SMC_RX_H
#include <linux/socket.h>
#include <linux/types.h>
#include "smc.h"
/* Initialize receive properties on connection establishment: installs the
 * SMC sk_data_ready callback on the socket embedded in @smc.
 */
void smc_rx_init(struct smc_sock *smc);
/* Receive up to @len bytes from the connection's RMBE into @msg.
 * Called under the sk lock. Returns the number of bytes received or a
 * negative errno.
 */
int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
int flags);
#endif /* SMC_RX_H */

View File

@ -427,6 +427,43 @@ static void smc_tx_work(struct work_struct *work)
release_sock(&smc->sk); release_sock(&smc->sk);
} }
void smc_tx_consumer_update(struct smc_connection *conn)
{
union smc_host_cursor cfed, cons;
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
int to_confirm, rc;
smc_curs_write(&cons,
smc_curs_read(&conn->local_tx_ctrl.cons, conn),
conn);
smc_curs_write(&cfed,
smc_curs_read(&conn->rx_curs_confirmed, conn),
conn);
to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
((to_confirm > conn->rmbe_update_limit) &&
((to_confirm > (conn->rmbe_size / 2)) ||
conn->local_rx_ctrl.prod_flags.write_blocked))) {
rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
&wr_buf, &pend);
if (!rc)
rc = smc_cdc_msg_send(conn, wr_buf, pend);
if (rc < 0) {
schedule_work(&conn->tx_work);
return;
}
smc_curs_write(&conn->rx_curs_confirmed,
smc_curs_read(&conn->local_tx_ctrl.cons, conn),
conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
}
if (conn->local_rx_ctrl.prod_flags.write_blocked &&
!atomic_read(&conn->bytes_to_rcv))
conn->local_rx_ctrl.prod_flags.write_blocked = 0;
}
/***************************** send initialize *******************************/ /***************************** send initialize *******************************/
/* Initialize send properties on connection establishment. NB: not __init! */ /* Initialize send properties on connection establishment. NB: not __init! */

View File

@ -30,5 +30,6 @@ void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn); int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn);
#endif /* SMC_TX_H */ #endif /* SMC_TX_H */