ocfs2/cluster: Cluster up now includes network connections too

The cluster up check only verifies whether the local node is heartbeating.
If it is, the check continues on the assumption that the node is connected to
all the other nodes. When that is not the case, the cluster join aborts with a
stack of errors that are not easy to comprehend.

This patch adds the network connect check upfront and prints the nodes that
the node is not yet connected to, before aborting.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
This commit is contained in:
Sunil Mushran 2011-07-24 10:33:54 -07:00
parent 3ba169ccec
commit 6b27f62fc7
2 changed files with 61 additions and 13 deletions

View File

@ -2138,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
goto leave; goto leave;
} }
if (!o2hb_check_local_node_heartbeating()) {
mlog(ML_ERROR, "the local node has not been configured, or is "
"not heartbeating\n");
ret = -EPROTO;
goto leave;
}
mlog(0, "register called for domain \"%s\"\n", domain); mlog(0, "register called for domain \"%s\"\n", domain);
retry: retry:

View File

@ -28,6 +28,7 @@
#include "cluster/masklog.h" #include "cluster/masklog.h"
#include "cluster/nodemanager.h" #include "cluster/nodemanager.h"
#include "cluster/heartbeat.h" #include "cluster/heartbeat.h"
#include "cluster/tcp.h"
#include "stackglue.h" #include "stackglue.h"
@ -255,6 +256,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
dlm_print_one_lock(lksb->lksb_o2dlm.lockid); dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
} }
/*
 * Check if this node is heartbeating and is connected to all other
 * heartbeating nodes.
 *
 * Returns 0 as soon as the heartbeat map and the network map (with the
 * local node forced on) are identical, -EINVAL if this node has not been
 * configured or is not present in the heartbeat map, and -ENOTCONN if the
 * two maps still differ after O2CB_MAP_STABILIZE_COUNT attempts. In the
 * last case the heartbeating nodes missing from the network map are
 * printed.
 */
static int o2cb_cluster_check(void)
{
	u8 node_num;
	int i;
	unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];

	node_num = o2nm_this_node();
	if (node_num == O2NM_MAX_NODES) {
		printk(KERN_ERR "o2cb: This node has not been configured.\n");
		return -EINVAL;
	}

	/*
	 * o2dlm expects o2net sockets to be created. If not, then
	 * dlm_join_domain() fails with a stack of errors which are both cryptic
	 * and incomplete. The idea here is to detect upfront whether we have
	 * managed to connect to all nodes or not. If not, then list the nodes
	 * to allow the user to check the configuration (incorrect IP, firewall,
	 * etc.) Yes, this is racy. But it's not the end of the world.
	 */
#define	O2CB_MAP_STABILIZE_COUNT	60
	for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
		o2hb_fill_node_map(hbmap, sizeof(hbmap));
		if (!test_bit(node_num, hbmap)) {
			printk(KERN_ERR "o2cb: %s heartbeat has not been "
			       "started.\n", (o2hb_global_heartbeat_active() ?
					      "Global" : "Local"));
			return -EINVAL;
		}
		o2net_fill_node_map(netmap, sizeof(netmap));
		/* Force set the current node to allow easy compare */
		set_bit(node_num, netmap);
		if (!memcmp(hbmap, netmap, sizeof(hbmap)))
			return 0;
		/*
		 * Sleep only between attempts. After the final attempt there
		 * is nothing left to wait for, so fall through to the error
		 * report without an extra one-second delay.
		 */
		if (i < O2CB_MAP_STABILIZE_COUNT - 1)
			msleep(1000);
	}

	/* List every heartbeating node we have no network connection to. */
	printk(KERN_ERR "o2cb: This node could not connect to nodes:");
	i = -1;
	while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
				  i + 1)) < O2NM_MAX_NODES) {
		if (!test_bit(i, netmap))
			printk(" %u", i);
	}
	printk(".\n");

	return -ENOTCONN;
}
/* /*
* Called from the dlm when it's about to evict a node. This is how the * Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death. * classic stack signals node death.
@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL); BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL); BUG_ON(conn->cc_proto == NULL);
/* for now we only have one cluster/node, make sure we see it /* Ensure cluster stack is up and all nodes are connected */
* in the heartbeat universe */ rc = o2cb_cluster_check();
if (!o2hb_check_local_node_heartbeating()) { if (rc) {
if (o2hb_global_heartbeat_active()) printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
mlog(ML_ERROR, "Global heartbeat not started\n"); "before retrying.\n");
rc = -EINVAL;
goto out; goto out;
} }