libceph: fix messenger retry
In ancient times, the messenger could both initiate and accept connections. An artifact if that was data structures to store/process an incoming ceph_msg_connect request and send an outgoing ceph_msg_connect_reply. Sadly, the negotiation code was referencing those structures and ignoring important information (like the peer's connect_seq) from the correct ones. Among other things, this fixes tight reconnect loops where the server sends RETRY_SESSION and we (the client) retries with the same connect_seq as last time. This bug pretty easily triggered by injecting socket failures on the MDS and running some fs workload like workunits/direct_io/test_sync_io. Signed-off-by: Sage Weil <sage@inktank.com>
This commit is contained in:
parent
cd43045c2d
commit
a16cb1f707
@ -172,16 +172,8 @@ struct ceph_connection {
|
||||
|
||||
/* connection negotiation temps */
|
||||
char in_banner[CEPH_BANNER_MAX_LEN];
|
||||
union {
|
||||
struct { /* outgoing connection */
|
||||
struct ceph_msg_connect out_connect;
|
||||
struct ceph_msg_connect_reply in_reply;
|
||||
};
|
||||
struct { /* incoming */
|
||||
struct ceph_msg_connect in_connect;
|
||||
struct ceph_msg_connect_reply out_reply;
|
||||
};
|
||||
};
|
||||
struct ceph_msg_connect out_connect;
|
||||
struct ceph_msg_connect_reply in_reply;
|
||||
struct ceph_entity_addr actual_peer_addr;
|
||||
|
||||
/* message out temps */
|
||||
|
@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con)
|
||||
* dropped messages.
|
||||
*/
|
||||
dout("process_connect got RESET peer seq %u\n",
|
||||
le32_to_cpu(con->in_connect.connect_seq));
|
||||
le32_to_cpu(con->in_reply.connect_seq));
|
||||
pr_err("%s%lld %s connection reset\n",
|
||||
ENTITY_NAME(con->peer_name),
|
||||
ceph_pr_addr(&con->peer_addr.in_addr));
|
||||
@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con)
|
||||
* If we sent a smaller connect_seq than the peer has, try
|
||||
* again with a larger value.
|
||||
*/
|
||||
dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
|
||||
dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
|
||||
le32_to_cpu(con->out_connect.connect_seq),
|
||||
le32_to_cpu(con->in_connect.connect_seq));
|
||||
con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
|
||||
le32_to_cpu(con->in_reply.connect_seq));
|
||||
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
|
||||
ret = prepare_write_connect(con);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con)
|
||||
*/
|
||||
dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
|
||||
con->peer_global_seq,
|
||||
le32_to_cpu(con->in_connect.global_seq));
|
||||
le32_to_cpu(con->in_reply.global_seq));
|
||||
get_global_seq(con->msgr,
|
||||
le32_to_cpu(con->in_connect.global_seq));
|
||||
le32_to_cpu(con->in_reply.global_seq));
|
||||
ret = prepare_write_connect(con);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
Loading…
Reference in New Issue
Block a user