From 9bde178d052418af0b8e0f12932cf02ab4764c9d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 09:47:37 -0800
Subject: [PATCH 1/9] Revert "ceph: keep reference to parent inode on
 ceph_dentry"

This reverts commit 97d79b403ef03f729883246208ef5d8a2ebc4d68.

This fails to account for d_parent changes due to rename or disconnected
dentries due to submounts or NFS reexports.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 5 +----
 fs/ceph/super.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f0aef787a102..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,7 +60,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -1034,7 +1033,7 @@ static void ceph_dentry_release(struct dentry *dentry)
 	u64 snapid = CEPH_NOSNAP;
 
 	if (!IS_ROOT(dentry)) {
-		parent_inode = di->parent_inode;
+		parent_inode = dentry->d_parent->d_inode;
 		if (parent_inode)
 			snapid = ceph_snap(parent_inode);
 	}
@@ -1059,8 +1058,6 @@ static void ceph_dentry_release(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		dentry->d_fsdata = NULL;
 	}
-	if (parent_inode)
-		iput(parent_inode);
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 88fcaa21b801..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,7 +207,6 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
-	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {

From b545cc1505eb49247071ce9f4092665de788ca00 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:46:46 -0800
Subject: [PATCH 2/9] ceph: do not set I_COMPLETE

Do not set the I_COMPLETE flag on directories until we resolve races with
dcache pruning.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 2 +-
 fs/ceph/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..9b4f9d9947b3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -409,7 +409,7 @@ more:
 	spin_lock(&inode->i_lock);
 	if (ci->i_release_count == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
-		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 		ci->i_max_offset = filp->f_pos;
 	}
 	spin_unlock(&inode->i_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..193bfa5e9cbd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode,
 		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
-			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 			ci->i_max_offset = 2;
 		}
 		break;

From 16a8b70a5a757db513f036bbcc73309f6c507d81 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:49:15 -0800
Subject: [PATCH 3/9] ceph: do not clear I_COMPLETE from d_release

First, this was racy anyway: d_release isn't called until well after the
dentry is unhashed.  Second, this runs afoul of the recent dcache change
that clears d_parent prior to calling d_release (949854d0), causing a NULL
pointer dereference.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9b4f9d9947b3..196fd4c62db7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1029,28 +1029,8 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = NULL;
-	u64 snapid = CEPH_NOSNAP;
 
-	if (!IS_ROOT(dentry)) {
-		parent_inode = dentry->d_parent->d_inode;
-		if (parent_inode)
-			snapid = ceph_snap(parent_inode);
-	}
-	dout("dentry_release %p parent %p\n", dentry, parent_inode);
-	if (parent_inode && snapid != CEPH_SNAPDIR) {
-		struct ceph_inode_info *ci = ceph_inode(parent_inode);
-
-		spin_lock(&parent_inode->i_lock);
-		if (ci->i_shared_gen == di->lease_shared_gen ||
-		    snapid <= CEPH_MAXSNAP) {
-			dout(" clearing %p complete (d_release)\n",
-			     parent_inode);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
-		spin_unlock(&parent_inode->i_lock);
-	}
+	dout("dentry_release %p\n", dentry);
 	if (di) {
 		ceph_dentry_lru_del(dentry);
 		if (di->lease_session)

From 38815b780285a4957852c5c9dbe94991c0b26c56 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 2 Mar 2011 16:55:21 -0800
Subject: [PATCH 4/9] libceph: fix handling of short returns from
 get_user_pages

get_user_pages() can return fewer pages than we ask for.  We were returning
a bogus pointer/error code in that case.  Instead, loop until we get all
the pages we want or get an error we can return to the caller.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/pagevec.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 1a040e64c69f..cd9c21df87d1 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data,
 					  int num_pages, bool write_page)
 {
 	struct page **pages;
-	int rc;
+	int got = 0;
+	int rc = 0;
 
 	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 
 	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, write_page, 0, pages, NULL);
+	while (got < num_pages) {
+		rc = get_user_pages(current, current->mm,
+		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
+		    num_pages - got, write_page, 0, pages + got, NULL);
+		if (rc < 0)
+			break;
+		BUG_ON(rc == 0);
+		got += rc;
+	}
 	up_read(&current->mm->mmap_sem);
-	if (rc < num_pages)
+	if (rc < 0)
 		goto fail;
 	return pages;
 
 fail:
-	ceph_put_page_vector(pages, rc > 0 ? rc : 0, false);
+	ceph_put_page_vector(pages, got, false);
 	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL(ceph_get_direct_page_vector);

From 692d20f576fb26f62c83f80dbf3ea899998391b7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 12:14:53 -0800
Subject: [PATCH 5/9] libceph: retry after authorization failure

If we mark the connection CLOSED we will give up trying to reconnect to
this server instance.  That is appropriate for things like a protocol
version mismatch that won't change until the server is restarted, at which
point we'll get a new addr and reconnect.  An authorization failure like
this is probably due to the server not properly rotating it's secret keys,
however, and should be treated as transient so that the normal backoff and
retry behavior kicks in.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 35b36b86d762..6bd5025f6220 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1248,8 +1248,6 @@ static int process_connect(struct ceph_connection *con)
 		     con->auth_retry);
 		if (con->auth_retry == 2) {
 			con->error_msg = "connect authorization failure";
-			reset_connection(con);
-			set_bit(CLOSED, &con->state);
 			return -1;
 		}
 		con->auth_retry = 1;

From 60bf8bf8815e6adea4c1d0423578c3b8000e2ec8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Mar 2011 12:24:28 -0800
Subject: [PATCH 6/9] libceph: fix msgr backoff

With commit f363e45f we replaced a bunch of hacky workqueue mutual
exclusion logic with the WQ_NON_REENTRANT flag.  One pieces of fallout is
that the exponential backoff breaks in certain cases:

 * con_work attempts to connect.
 * we get an immediate failure, and the socket state change handler queues
   immediate work.
 * con_work calls con_fault, we decide to back off, but can't queue delayed
   work.

In this case, we add a BACKOFF bit to make con_work reschedule delayed work
next time it runs (which should be immediately).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 include/linux/ceph/messenger.h |  1 +
 net/ceph/messenger.c           | 30 ++++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index c3011beac30d..eb31e108a64d 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -123,6 +123,7 @@ struct ceph_msg_pos {
 #define SOCK_CLOSED	11 /* socket state changed to closed */
 #define OPENING         13 /* open connection w/ (possibly new) peer */
 #define DEAD            14 /* dead, about to kfree */
+#define BACKOFF         15
 
 /*
  * A single connection with another host.
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6bd5025f6220..46fbc422ba74 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work)
 						   work.work);
 
 	mutex_lock(&con->mutex);
+	if (test_and_clear_bit(BACKOFF, &con->state)) {
+		dout("con_work %p backing off\n", con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay))) {
+			dout("con_work %p backoff %lu\n", con, con->delay);
+			mutex_unlock(&con->mutex);
+			return;
+		} else {
+			con->ops->put(con);
+			dout("con_work %p FAILED to back off %lu\n", con,
+			     con->delay);
+		}
+	}
 
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
@@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
 			con->delay *= 2;
-		dout("fault queueing %p delay %lu\n", con, con->delay);
 		con->ops->get(con);
 		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay)) == 0)
+				       round_jiffies_relative(con->delay))) {
+			dout("fault queued %p delay %lu\n", con, con->delay);
+		} else {
 			con->ops->put(con);
+			dout("fault failed to queue %p delay %lu, backoff\n",
+			     con, con->delay);
+			/*
+			 * In many cases we see a socket state change
+			 * while con_work is running and end up
+			 * queuing (non-delayed) work, such that we
+			 * can't backoff with a delay.  Set a flag so
+			 * that when con_work restarts we schedule the
+			 * delay then.
+			 */
+			set_bit(BACKOFF, &con->state);
+		}
 	}
 
 out_unlock:

From e76661d0a59e53e5cc4dccbe4b755d1dc8a968ec Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 10:10:15 -0800
Subject: [PATCH 7/9] libceph: fix msgr keepalive flag

There was some broken keepalive code using a dead variable.  Shift to using
the proper bit flag.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 include/linux/ceph/messenger.h | 1 -
 net/ceph/messenger.c           | 9 ++++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index eb31e108a64d..31d91a64838b 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -161,7 +161,6 @@ struct ceph_connection {
 	struct list_head out_queue;
 	struct list_head out_sent;   /* sending or sent but unacked */
 	u64 out_seq;		     /* last message queued for send */
-	bool out_keepalive_pending;
 
 	u64 in_seq, in_seq_acked;  /* last message received, acked */
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 46fbc422ba74..3252ea974e8f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -336,7 +336,6 @@ static void reset_connection(struct ceph_connection *con)
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	}
-	con->out_keepalive_pending = false;
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 }
@@ -2019,10 +2018,10 @@ static void ceph_fault(struct ceph_connection *con)
 	/* Requeue anything that hasn't been acked */
 	list_splice_init(&con->out_sent, &con->out_queue);
 
-	/* If there are no messages in the queue, place the connection
-	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-		dout("fault setting STANDBY\n");
+	/* If there are no messages queued or keepalive pending, place
+	 * the connection in a STANDBY state */
+	if (list_empty(&con->out_queue) &&
+	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
 		set_bit(STANDBY, &con->state);
 	} else {
 		/* retry after a delay. */

From e00de341fdb76c955703b4438100f9933c452b7f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Mar 2011 12:25:05 -0800
Subject: [PATCH 8/9] libceph: fix msgr standby handling

The standby logic used to be pretty dependent on the work requeueing
behavior that changed when we switched to WQ_NON_REENTRANT.  It was also
very fragile.

Restructure things so that:
 - We clear WRITE_PENDING when we set STANDBY.  This ensures we will
   requeue work when we wake up later.
 - con_work backs off if STANDBY is set.  There is nothing to do if we are
   in standby.
 - clear_standby() helper is called by both con_send() and con_keepalive(),
   the two actions that can wake us up again.  Move the connect_seq++
   logic here.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3252ea974e8f..05f357828a2f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1712,14 +1712,6 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
 		prepare_write_banner(msgr, con);
 		prepare_write_connect(msgr, con, 1);
 		prepare_read_banner(con);
@@ -1962,6 +1954,10 @@ static void con_work(struct work_struct *work)
 		}
 	}
 
+	if (test_bit(STANDBY, &con->state)) {
+		dout("con_work %p STANDBY\n", con);
+		goto done;
+	}
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
 		con_close_socket(con);
@@ -2022,6 +2018,8 @@ static void ceph_fault(struct ceph_connection *con)
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
 	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
+		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+		clear_bit(WRITE_PENDING, &con->state);
 		set_bit(STANDBY, &con->state);
 	} else {
 		/* retry after a delay. */
@@ -2117,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
 }
 EXPORT_SYMBOL(ceph_messenger_destroy);
 
+static void clear_standby(struct ceph_connection *con)
+{
+	/* come back from STANDBY? */
+	if (test_and_clear_bit(STANDBY, &con->state)) {
+		mutex_lock(&con->mutex);
+		dout("clear_standby %p and ++connect_seq\n", con);
+		con->connect_seq++;
+		WARN_ON(test_bit(WRITE_PENDING, &con->state));
+		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+		mutex_unlock(&con->mutex);
+	}
+}
+
 /*
  * Queue up an outgoing message on the given connection.
  */
@@ -2149,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
+	clear_standby(con);
 	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);
 }
@@ -2214,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
  */
 void ceph_con_keepalive(struct ceph_connection *con)
 {
+	dout("con_keepalive %p\n", con);
+	clear_standby(con);
 	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
 	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);

From 455cec0abff563574cca432ced49f734117ca113 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 13:44:35 -0800
Subject: [PATCH 9/9] ceph: no .snap inside of snapped namespace

Otherwise you can do things like

# mkdir .snap/foo
# cd .snap/foo/.snap
# ls
<badness>

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 196fd4c62db7..099a58615b90 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -496,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
+	    ceph_snap(parent) == CEPH_NOSNAP &&
 	    strcmp(dentry->d_name.name,
 		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);