From 4b680afb42b9e1b504444583098401a00e63ed3c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Feb 2016 10:46:51 +1100 Subject: [PATCH 01/66] xfs: lock rt summary inode on allocation RT allocation can fail on a debug kernel with: XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 4039 When modifying the summary inode during allocation. This occurs because the summary inode is never locked, and xfs_bmapi_* operations expect it to be locked. The summary inode is effectively protected byt he lock on the bitmap inode, so this really is only a debug kernel issue. Signed-off-by: Dave Chinner Tested-by: Ross Zwisler Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 45ec9e40150c..07ef29b9b464 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -202,10 +202,12 @@ xfs_bmap_rtalloc( ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; /* - * Lock out other modifications to the RT bitmap inode. + * Lock out modifications to both the RT bitmap and summary inodes */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, From 3218a3ec87f7d0063b48b10cf3d649ce75966223 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:21:24 +1100 Subject: [PATCH 02/66] quota: remove unused cmd argument from quota_quotaon() The cmd argument to quota_quotaon() via Q_QUOTAON quotactl is not used, so remove it. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- fs/quota/quota.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 3746367098fd..ea6667083b3e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type) return 0; } -static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, +static int quota_quotaon(struct super_block *sb, int type, qid_t id, struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) @@ -659,7 +659,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, switch (cmd) { case Q_QUOTAON: - return quota_quotaon(sb, type, cmd, id, path); + return quota_quotaon(sb, type, id, path); case Q_QUOTAOFF: return quota_quotaoff(sb, type); case Q_GETFMT: From 8b37524962b9c54423374717786198f5c0820a28 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:21:50 +1100 Subject: [PATCH 03/66] quota: add new quotactl Q_XGETNEXTQUOTA Q_XGETNEXTQUOTA is exactly like Q_XGETQUOTA, except that it will return quota information for the id equal to or greater than the id requested. In other words, if the requested id has no quota, the command will return quota information for the next higher id which does have a quota set. If no higher id has an active quota, -ESRCH is returned. This allows filesystems to do efficient iteration in kernelspace, much like extN filesystems do in userspace when asked to report all active quotas. The patch adds a d_id field to struct qc_dqblk so that we can pass back the id of the quota which was found, and return it to userspace. Today, filesystems such as XFS require getpwent-style iterations, and for systems which have i.e. 
LDAP backends, this can be very slow, or even impossible if iteration is not allowed in the configuration. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- fs/quota/quota.c | 31 +++++++++++++++++++++++++++++++ include/linux/quota.h | 2 ++ include/uapi/linux/dqblk_xfs.h | 1 + 3 files changed, 34 insertions(+) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ea6667083b3e..0a6dd71ea309 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -625,6 +625,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, return ret; } +/* + * Return quota for next active quota >= this id, if any exists, + * otherwise return -ESRCH via ->get_nextdqblk. + */ +static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct qc_dqblk qdq; + struct kqid qid; + qid_t id_out; + int ret; + + if (!sb->s_qcop->get_nextdqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); + if (ret) + return ret; + id_out = from_kqid(current_user_ns(), qid); + copy_to_xfs_dqblk(&fdq, &qdq, type, id_out); + if (copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + static int quota_rmxquota(struct super_block *sb, void __user *addr) { __u32 flags; @@ -690,6 +718,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); + case Q_XGETNEXTQUOTA: + return quota_getnextxquota(sb, type, id, addr); case Q_XQUOTASYNC: if (sb->s_flags & MS_RDONLY) return -EROFS; @@ -712,6 +742,7 @@ static int quotactl_cmd_write(int cmd) case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETQUOTA: + case Q_XGETNEXTQUOTA: case Q_XQUOTASYNC: return 0; } diff --git a/include/linux/quota.h b/include/linux/quota.h index b2505acfd3c0..fba92f5c1a63 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -425,6 +425,8 @@ struct quotactl_ops { int (*quota_sync)(struct super_block *, int); int (*set_info)(struct super_block *, int, struct qc_info *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); + int (*get_nextdqblk)(struct super_block *, struct kqid *, + struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_state)(struct super_block *, struct qc_state *); int (*rm_xquota)(struct super_block *, unsigned int); diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h index dcd75cc26196..11b3b31faf14 100644 --- a/include/uapi/linux/dqblk_xfs.h +++ b/include/uapi/linux/dqblk_xfs.h @@ -39,6 +39,7 @@ #define Q_XQUOTARM XQM_CMD(6) /* free disk space used by dquots */ #define Q_XQUOTASYNC XQM_CMD(7) /* delalloc flush, updates dquots */ #define Q_XGETQSTATV XQM_CMD(8) /* newer version of get quota */ +#define Q_XGETNEXTQUOTA XQM_CMD(9) /* get disk limits and usage >= ID */ /* * fs_disk_quota structure: From 926132c0257a5a8d149a6a395cc3405e55420566 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:22:21 +1100 Subject: [PATCH 04/66] quota: add new quotactl Q_GETNEXTQUOTA Q_GETNEXTQUOTA is exactly like Q_GETQUOTA, except that it will return quota information for the id equal to or greater than the id requested. In other words, if the requested id has no quota, the command will return quota information for the next higher id which does have a quota set. 
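As an aside (illustration only, not part of this patch): once the new command and the struct if_nextdqblk described below are visible in the uapi headers, a userspace consumer could walk every active user quota with a loop along these lines instead of a getpwent() loop:

	#include <sys/types.h>
	#include <sys/quota.h>
	#include <linux/quota.h>	/* Q_GETNEXTQUOTA, struct if_nextdqblk */
	#include <errno.h>
	#include <stdio.h>

	/* report every active user quota on "dev" */
	static int dump_user_quotas(const char *dev)
	{
		struct if_nextdqblk nq;
		unsigned int id = 0;

		for (;;) {
			if (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), dev,
				     id, (caddr_t)&nq) < 0)
				return errno == ESRCH ? 0 : -errno;
			/* dqb_id is the id that was actually found */
			printf("id %u: %llu bytes used\n", nq.dqb_id,
			       (unsigned long long)nq.dqb_curspace);
			id = nq.dqb_id + 1;
		}
	}
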
If no higher id has an active quota, -ESRCH is returned. This allows filesystems to do efficient iteration in kernelspace, much like extN filesystems do in userspace when asked to report all active quotas. This does require a new data structure for userspace, as the current structure does not include an ID for the returned quota information. Today, Ext4 with a hidden quota inode requires getpwent-style iterations, and for systems which have i.e. LDAP backends, this can be very slow, or even impossible if iteration is not allowed in the configuration. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- fs/quota/quota.c | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/quota.h | 14 ++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0a6dd71ea309..0ebc90496525 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, return 0; } +/* + * Return quota for next active quota >= this id, if any exists, + * otherwise return -ESRCH via ->get_nextdqblk + */ +static int quota_getnextquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct kqid qid; + struct qc_dqblk fdq; + struct if_nextdqblk idq; + int ret; + + if (!sb->s_qcop->get_nextdqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); + if (ret) + return ret; + /* struct if_nextdqblk is a superset of struct if_dqblk */ + copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq); + idq.dqb_id = from_kqid(current_user_ns(), qid); + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); @@ -698,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setinfo(sb, type, addr); case Q_GETQUOTA: return quota_getquota(sb, type, id, addr); + case Q_GETNEXTQUOTA: + return quota_getnextquota(sb, type, id, addr); case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: @@ -738,6 +768,7 @@ static int quotactl_cmd_write(int cmd) switch (cmd) { case Q_GETFMT: case Q_GETINFO: + case Q_GETNEXTQUOTA: case Q_SYNC: case Q_XGETQSTAT: case Q_XGETQSTATV: diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 9c95b2c1c88a..38baddb807f5 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -71,6 +71,7 @@ #define Q_SETINFO 0x800006 /* set information about quota files */ #define Q_GETQUOTA 0x800007 /* get user quota structure */ #define Q_SETQUOTA 0x800008 /* set user quota structure */ +#define Q_GETNEXTQUOTA 0x800009 /* get disk limits and usage >= ID */ /* Quota format type IDs */ #define QFMT_VFS_OLD 1 @@ -119,6 +120,19 @@ struct if_dqblk { __u32 dqb_valid; }; +struct if_nextdqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_id; +}; + /* * Structure used for setting quota information about file via quotactl * Following flags are used to specify which fields are valid From a484bcdd1321b86de29a2969399da1fa6e9c7648 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:22:58 +1100 Subject: [PATCH 05/66] xfs: don't overflow quota ID when initializing 
dqblk Quota IDs are unsigned, and so we can pass in values up to 2^32-1. But if we try to initialize a block containing values over MAX_INT, curid will overflow and assert. curid holds a quota ID, so give it the proper xfs_dqid_t type (and remove the now-impossible ASSERT). Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9c44d38dcd1f..c9c7c2d89245 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -232,7 +232,8 @@ xfs_qm_init_dquot_blk( { struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; - int curid, i; + xfs_dqid_t curid; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); @@ -243,7 +244,6 @@ xfs_qm_init_dquot_blk( * ID of the first dquot in the block - id's are zero based. */ curid = id - (id % q->qi_dqperchunk); - ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); From 4d4d9523b4ebbd5d97c8b2557c6ee2a3f90b6b3a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:23:23 +1100 Subject: [PATCH 06/66] xfs: get quota inode from mp & flags rather than dqp Allow us to get the appropriate quota inode from any mp & quota flags, not necessarily associated with a particular dqp. Needed for when we are searching for the next active ID with quotas and we want to examine the quota inode. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot.c | 3 ++- fs/xfs/xfs_qm.h | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c9c7c2d89245..5dbde0d65f37 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -464,12 +464,13 @@ xfs_qm_dqtobp( struct xfs_bmbt_irec map; int nmaps = 1, error; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_inode *quotip; struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); uint lock_mode; + quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; lock_mode = xfs_ilock_data_map_shared(quotip); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 996a04064894..8901a01bc354 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -104,15 +104,15 @@ xfs_dquot_tree( } static inline struct xfs_inode * -xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) { - switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + switch (dq_flags & XFS_DQ_ALLTYPES) { case XFS_DQ_USER: - return dqp->q_mount->m_quotainfo->qi_uquotaip; + return mp->m_quotainfo->qi_uquotaip; case XFS_DQ_GROUP: - return dqp->q_mount->m_quotainfo->qi_gquotaip; + return mp->m_quotainfo->qi_gquotaip; case XFS_DQ_PROJ: - return dqp->q_mount->m_quotainfo->qi_pquotaip; + return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); } From 8aa7d37ebf2f0cfba1411acb34e7b40e73f653e7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:25:16 +1100 Subject: [PATCH 07/66] xfs: Factor xfs_seek_hole_data into helper Factor xfs_seek_hole_data into an unlocked helper which takes an xfs inode rather than a file for internal use. 
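For reference, the quota code added later in this series ends up calling the helper roughly like this (a fragment from that later patch; the caller, not the helper, takes the data map lock and supplies its own end offset rather than i_size):

	lock = xfs_ilock_data_map_shared(quotip);
	offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
				      eof, SEEK_DATA);
	if (offset < 0)
		error = offset;
	xfs_iunlock(quotip, lock);

	/* -ENXIO here simply means there is no more data before "eof" */
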
Also allow specification of "end" - the vfs lseek interface is defined such that any offset past eof/i_size shall return -ENXIO, but we will use this for quota code which does not maintain i_size, and we want to be able to SEEK_DATA past i_size as well. So the lseek path can send in i_size, and the quota code can determine its own ending offset. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 82 ++++++++++++++++++++++++++++++++-------------- fs/xfs/xfs_inode.h | 2 ++ 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52883ac3cf84..70a4b5a2802e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1337,31 +1337,31 @@ out: return found; } -STATIC loff_t -xfs_seek_hole_data( - struct file *file, +/* + * caller must lock inode with xfs_ilock_data_map_shared, + * can we craft an appropriate ASSERT? + * + * end is because the VFS-level lseek interface is defined such that any + * offset past i_size shall return -ENXIO, but we use this for quota code + * which does not maintain i_size, and we want to SEEK_DATA past i_size. + */ +loff_t +__xfs_seek_hole_data( + struct inode *inode, loff_t start, + loff_t end, int whence) { - struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; loff_t uninitialized_var(offset); - xfs_fsize_t isize; xfs_fileoff_t fsbno; - xfs_filblks_t end; - uint lock; + xfs_filblks_t lastbno; int error; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - lock = xfs_ilock_data_map_shared(ip); - - isize = i_size_read(inode); - if (start >= isize) { + if (start >= end) { error = -ENXIO; - goto out_unlock; + goto out_error; } /* @@ -1369,22 +1369,22 @@ xfs_seek_hole_data( * by fsbno to the end block of the file. */ fsbno = XFS_B_TO_FSBT(mp, start); - end = XFS_B_TO_FSB(mp, isize); + lastbno = XFS_B_TO_FSB(mp, end); for (;;) { struct xfs_bmbt_irec map[2]; int nmap = 2; unsigned int i; - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap, XFS_BMAPI_ENTIRE); if (error) - goto out_unlock; + goto out_error; /* No extents at given offset, must be beyond EOF */ if (nmap == 0) { error = -ENXIO; - goto out_unlock; + goto out_error; } for (i = 0; i < nmap; i++) { @@ -1426,7 +1426,7 @@ xfs_seek_hole_data( * hole at the end of any file). */ if (whence == SEEK_HOLE) { - offset = isize; + offset = end; break; } /* @@ -1434,7 +1434,7 @@ xfs_seek_hole_data( */ ASSERT(whence == SEEK_DATA); error = -ENXIO; - goto out_unlock; + goto out_error; } ASSERT(i > 1); @@ -1445,14 +1445,14 @@ xfs_seek_hole_data( */ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; start = XFS_FSB_TO_B(mp, fsbno); - if (start >= isize) { + if (start >= end) { if (whence == SEEK_HOLE) { - offset = isize; + offset = end; break; } ASSERT(whence == SEEK_DATA); error = -ENXIO; - goto out_unlock; + goto out_error; } } @@ -1464,7 +1464,39 @@ out: * situation in particular. 
*/ if (whence == SEEK_HOLE) - offset = min_t(loff_t, offset, isize); + offset = min_t(loff_t, offset, end); + + return offset; + +out_error: + return error; +} + +STATIC loff_t +xfs_seek_hole_data( + struct file *file, + loff_t start, + int whence) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + uint lock; + loff_t offset, end; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lock = xfs_ilock_data_map_shared(ip); + + end = i_size_read(inode); + offset = __xfs_seek_hole_data(inode, start, end, whence); + if (offset < 0) { + error = offset; + goto out_unlock; + } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca9e11989cbd..ed7e9339c7e9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -437,6 +437,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, + loff_t eof, int whence); /* from xfs_iops.c */ From 296c24e26ee3af2dbfecb482e6bc9560bd34c455 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 11:27:38 +1100 Subject: [PATCH 08/66] xfs: wire up Q_XGETNEXTQUOTA / get_nextdqblk Add code to allow the Q_XGETNEXTQUOTA quotactl to quickly find all active quotas by examining the quota inode, and skipping over unallocated or uninitialized regions. Userspace can then use this interface rather than i.e. a getpwent() loop when asked to report all active quotas. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_quota_defs.h | 3 +- fs/xfs/xfs_dquot.c | 96 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_qm.h | 4 +- fs/xfs/xfs_qm_syscalls.c | 12 +++-- fs/xfs/xfs_quotaops.c | 36 ++++++++++++- 5 files changed, 142 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index f51078f1e92a..8eed51275bb3 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ +#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ +#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5dbde0d65f37..2f0502f0f214 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -685,6 +685,56 @@ error0: return error; } +/* + * Advance to the next id in the current chunk, or if at the + * end of the chunk, skip ahead to first id in next allocated chunk + * using the SEEK_DATA interface. 
+ */ +int +xfs_dq_get_next_id( + xfs_mount_t *mp, + uint type, + xfs_dqid_t *id, + loff_t eof) +{ + struct xfs_inode *quotip; + xfs_fsblock_t start; + loff_t offset; + uint lock; + xfs_dqid_t next_id; + int error = 0; + + /* Simple advance */ + next_id = *id + 1; + + /* If new ID is within the current chunk, advancing it sufficed */ + if (next_id % mp->m_quotainfo->qi_dqperchunk) { + *id = next_id; + return 0; + } + + /* Nope, next_id is now past the current chunk, so find the next one */ + start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; + + quotip = xfs_quota_inode(mp, type); + lock = xfs_ilock_data_map_shared(quotip); + + offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start), + eof, SEEK_DATA); + if (offset < 0) + error = offset; + + xfs_iunlock(quotip, lock); + + /* -ENXIO is essentially "no more data" */ + if (error) + return (error == -ENXIO ? -ENOENT: error); + + /* Convert next data offset back to a quota id */ + *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk; + return 0; +} + /* * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a * a locked dquot, doing an allocation (if requested) as needed. @@ -705,6 +755,7 @@ xfs_qm_dqget( struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; + loff_t eof = 0; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -732,6 +783,21 @@ xfs_qm_dqget( } #endif + /* Get the end of the quota file if we need it */ + if (flags & XFS_QMOPT_DQNEXT) { + struct xfs_inode *quotip; + xfs_fileoff_t last; + uint lock_mode; + + quotip = xfs_quota_inode(mp, type); + lock_mode = xfs_ilock_data_map_shared(quotip); + error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK); + xfs_iunlock(quotip, lock_mode); + if (error) + return error; + eof = XFS_FSB_TO_B(mp, last); + } + restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); @@ -745,6 +811,18 @@ restart: goto restart; } + /* uninit / unused quota found in radix tree, keep looking */ + if (flags & XFS_QMOPT_DQNEXT) { + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (error) + return error; + goto restart; + } + } + dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); @@ -771,6 +849,13 @@ restart: if (ip) xfs_ilock(ip, XFS_ILOCK_EXCL); + /* If we are asked to find next active id, keep looking */ + if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (!error) + goto restart; + } + if (error) return error; @@ -821,6 +906,17 @@ restart: qi->qi_dquots++; mutex_unlock(&qi->qi_tree_lock); + /* If we are asked to find next active id, keep looking */ + if (flags & XFS_QMOPT_DQNEXT) { + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_qm_dqput(dqp); + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (error) + return error; + goto restart; + } + } + dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 8901a01bc354..c68a38f528ba 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -164,8 +164,8 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct qc_dqblk *); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *, + uint, struct qc_dqblk *, uint); 
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 3640c6e896af..0a252863f82b 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -635,9 +635,10 @@ out: int xfs_qm_scall_getquota( struct xfs_mount *mp, - xfs_dqid_t id, + xfs_dqid_t *id, uint type, - struct qc_dqblk *dst) + struct qc_dqblk *dst, + uint dqget_flags) { struct xfs_dquot *dqp; int error; @@ -647,7 +648,7 @@ xfs_qm_scall_getquota( * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't * exist, we'll get ENOENT back. */ - error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp); if (error) return error; @@ -660,6 +661,9 @@ xfs_qm_scall_getquota( goto out_put; } + /* Fill in the ID we actually read from disk */ + *id = be32_to_cpu(dqp->q_core.d_id); + memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); @@ -701,7 +705,7 @@ xfs_qm_scall_getquota( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - id != 0) { + *id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7795e0d01382..f82d79a8c694 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -231,14 +231,45 @@ xfs_fs_get_dqblk( struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); + xfs_dqid_t id; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), qdq); + id = from_kqid(&init_user_ns, qid); + return xfs_qm_scall_getquota(mp, &id, + xfs_quota_type(qid.type), qdq, 0); +} + +/* Return quota info for active quota >= this qid */ +STATIC int +xfs_fs_get_nextdqblk( + struct super_block *sb, + struct kqid *qid, + struct qc_dqblk *qdq) +{ + int ret; + struct xfs_mount *mp = XFS_M(sb); + xfs_dqid_t id; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + id = from_kqid(&init_user_ns, *qid); + ret = xfs_qm_scall_getquota(mp, &id, + xfs_quota_type(qid->type), qdq, + XFS_QMOPT_DQNEXT); + if (ret) + return ret; + + /* ID may be different, so convert back what we got */ + *qid = make_kqid(current_user_ns(), qid->type, id); + return 0; + } STATIC int @@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = { .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, + .get_nextdqblk = xfs_fs_get_nextdqblk, .set_dqblk = xfs_fs_set_dqblk, }; From be6079461abf796e29d02b450a16908f4bf58f6c Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 8 Feb 2016 11:27:55 +1100 Subject: [PATCH 09/66] xfs: Split default quota limits by quota type Default quotas are globally set due historical reasons. IRIX only supported user and project quotas, and default quota was only applied to user quotas. In Linux, when a default quota is set, all different quota types inherits the same default value. An user with a quota limit larger than the default quota value, will still be limited to the default value because the group quotas also inherits the default quotas. 
Unless the group which the user belongs to have a custom quota limit set. This patch aims to split the default quota value by quota type. Allowing each quota type having different default values. Default time limits are still set globally. XFS does not set a per-user/group timer, but a single global timer. For changing this behavior, some changes should be made in user-space tools another bugs being fixed. Signed-off-by: Carlos Maiolino Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot.c | 26 ++++++++++--------- fs/xfs/xfs_qm.c | 55 +++++++++++++++++++++++++++++++--------- fs/xfs/xfs_qm.h | 34 ++++++++++++++++++++----- fs/xfs/xfs_qm_syscalls.c | 15 ++++++----- fs/xfs/xfs_trans_dquot.c | 15 ++++++----- 5 files changed, 103 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2f0502f0f214..316b2a1bdba5 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_def_quota *defq; int prealloc = 0; ASSERT(d->d_id); + defq = xfs_get_defquota(dq, q); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) { - d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + if (defq->bsoftlimit && !d->d_blk_softlimit) { + d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); prealloc = 1; } - if (q->qi_bhardlimit && !d->d_blk_hardlimit) { - d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + if (defq->bhardlimit && !d->d_blk_hardlimit) { + d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit); prealloc = 1; } - if (q->qi_isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); - if (q->qi_ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); - if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); - if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + if (defq->isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit); + if (defq->ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit); + if (defq->rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit); + if (defq->rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit); if (prealloc) xfs_dquot_set_prealloc_limits(dq); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 532ab79d38fe..be125e1758c1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -560,6 +560,37 @@ xfs_qm_shrink_count( return list_lru_shrink_count(&qi->qi_lru, sc); } +STATIC void +xfs_qm_set_defquota( + xfs_mount_t *mp, + uint type, + xfs_quotainfo_t *qinf) +{ + xfs_dquot_t *dqp; + struct xfs_def_quota *defq; + int error; + + error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp); + + if (!error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + defq = xfs_get_defquota(dqp, qinf); + + /* + * Timers and warnings have been already set, let's just set the + * default limits for this quota type + */ + defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + xfs_qm_dqdestroy(dqp); + } +} + /* * This initializes all the quota 
information that's kept in the * mount structure @@ -606,19 +637,19 @@ xfs_qm_init_quotainfo( * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. * - * We look at the USR dquot with id == 0 first, but if user quotas - * are not enabled we goto the GRP dquot with id == 0. - * We don't really care to keep separate default limits for user - * and group quotas, at least not at this point. - * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. + * + * Timers and warnings are globally set by the first timer found in + * user/group/proj quota types, otherwise a default value is used. + * This should be split into different fields per quota type. */ error = xfs_qm_dqread(mp, 0, XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : XFS_DQ_PROJ), XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -639,13 +670,6 @@ xfs_qm_init_quotainfo( be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -656,6 +680,13 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } + if (XFS_IS_UQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); + if (XFS_IS_GQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); + if (XFS_IS_PQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index c68a38f528ba..2975a822e9f0 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 +struct xfs_def_quota { + xfs_qcnt_t bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ + xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ +}; + /* * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. 
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo { struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dqs in above chunk */ - xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ - xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct xfs_def_quota qi_usr_default; + struct xfs_def_quota qi_grp_default; + struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; } xfs_quotainfo_t; @@ -171,4 +177,20 @@ extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); +static inline struct xfs_def_quota * +xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi) +{ + struct xfs_def_quota *defq; + + if (XFS_QM_ISUDQ(dqp)) + defq = &qi->qi_usr_default; + else if (XFS_QM_ISGDQ(dqp)) + defq = &qi->qi_grp_default; + else { + ASSERT(XFS_QM_ISPDQ(dqp)); + defq = &qi->qi_prj_default; + } + return defq; +} + #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 0a252863f82b..f4d0e0a8f517 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -404,6 +404,7 @@ xfs_qm_scall_setqlim( struct xfs_disk_dquot *ddq; struct xfs_dquot *dqp; struct xfs_trans *tp; + struct xfs_def_quota *defq; int error; xfs_qcnt_t hard, soft; @@ -431,6 +432,8 @@ xfs_qm_scall_setqlim( ASSERT(error != -ENOENT); goto out_unlock; } + + defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); @@ -458,8 +461,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_softlimit = cpu_to_be64(soft); xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { - q->qi_bhardlimit = hard; - q->qi_bsoftlimit = soft; + defq->bhardlimit = hard; + defq->bsoftlimit = soft; } } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); @@ -474,8 +477,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - q->qi_rtbhardlimit = hard; - q->qi_rtbsoftlimit = soft; + defq->rtbhardlimit = hard; + defq->rtbsoftlimit = soft; } } else { xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); @@ -491,8 +494,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - q->qi_ihardlimit = hard; - q->qi_isoftlimit = soft; + defq->ihardlimit = hard; + defq->isoftlimit = soft; } } else { xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 995170194df0..c3d547211d16 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -609,17 +609,20 @@ xfs_trans_dqresv( xfs_qcnt_t total_count; xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; + struct xfs_def_quota *defq; xfs_dqlock(dqp); + defq = xfs_get_defquota(dqp, q); + if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (!hardlimit) - hardlimit = q->qi_bhardlimit; + hardlimit = defq->bhardlimit; softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); if (!softlimit) - softlimit = q->qi_bsoftlimit; + softlimit = defq->bsoftlimit; timer = 
be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; @@ -628,10 +631,10 @@ xfs_trans_dqresv( ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); if (!hardlimit) - hardlimit = q->qi_rtbhardlimit; + hardlimit = defq->rtbhardlimit; softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); if (!softlimit) - softlimit = q->qi_rtbsoftlimit; + softlimit = defq->rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; @@ -672,10 +675,10 @@ xfs_trans_dqresv( warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) - hardlimit = q->qi_ihardlimit; + hardlimit = defq->ihardlimit; softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); if (!softlimit) - softlimit = q->qi_isoftlimit; + softlimit = defq->isoftlimit; if (hardlimit && total_count > hardlimit) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); From 187372a3b9faff68ed61c291d0135e6739e0dbdf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: [PATCH 10/66] direct-io: always call ->end_io if non-NULL This way we can pass back errors to the file system, and allow for cleanup required for all direct I/O invocations. Also allow the ->end_io handlers to return errors on their own, so that I/O completion errors can be passed on to the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/dax.c | 9 +++++++-- fs/direct-io.c | 9 +++++++-- fs/ext4/inode.c | 9 +++++++-- fs/ocfs2/aops.c | 7 ++++++- fs/xfs/xfs_aops.c | 13 +++++++------ include/linux/fs.h | 2 +- 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 4fd6b0c5c6b5..e38b2c589b54 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -267,8 +267,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_unlock(inode); - if ((retval > 0) && end_io) - end_io(iocb, pos, retval, bh.b_private); + if (end_io) { + int err; + + err = end_io(iocb, pos, retval, bh.b_private); + if (err) + retval = err; + } if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); diff --git a/fs/direct-io.c b/fs/direct-io.c index 1b2f7ffc8b84..9c6f885cc518 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, dio->private); + if (dio->end_io) { + int err; + + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; + } if (!(dio->flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(dio->inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 83bc8bfb3bea..9db04dd9b88a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3161,14 +3161,17 @@ out: } #endif -static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, +static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { ext4_io_end_t *io_end = iocb->private; + if (size <= 0) + return 0; + /* if not async direct IO just return */ if (!io_end) - return; + return 0; ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3179,6 +3182,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, io_end->offset = 
offset; io_end->size = size; ext4_put_io_end(io_end); + + return 0; } /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 794fd1587f34..5dcc5f5a842e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -620,7 +620,7 @@ bail: * particularly interested in the aio/dio case. We use the rw_lock DLM lock * to protect io on one node from truncation on another. */ -static void ocfs2_dio_end_io(struct kiocb *iocb, +static int ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private) @@ -628,6 +628,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, struct inode *inode = file_inode(iocb->ki_filp); int level; + if (bytes <= 0) + return 0; + /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -644,6 +647,8 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); } + + return 0; } static int ocfs2_releasepage(struct page *page, gfp_t wait) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..295aaffea78e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1645,7 +1645,7 @@ out_end_io: * case the completion can be called in interrupt context, whereas if we have an * ioend we will always be called in task context (i.e. from a workqueue). */ -STATIC void +STATIC int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1655,15 +1655,19 @@ xfs_end_io_direct_write( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_ioend *ioend = private; + if (size <= 0) + return 0; + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, ioend ? ioend->io_type : 0, NULL); if (!ioend) { ASSERT(offset + size <= i_size_read(inode)); - return; + return 0; } __xfs_end_io_direct_write(inode, ioend, offset, size); + return 0; } static inline ssize_t @@ -1672,10 +1676,7 @@ xfs_vm_do_dio( struct kiocb *iocb, struct iov_iter *iter, loff_t offset, - void (*endio)(struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private), + dio_iodone_t endio, int flags) { struct block_device *bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..d7f37bfcbdce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -70,7 +70,7 @@ extern int sysctl_protected_hardlinks; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, +typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); From 273dda76f757108bc2b29d30a9595b6dd3bdf3a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: [PATCH 11/66] xfs: don't use ioends for direct write completions We only need to communicate two bits of information to the direct I/O completion handler: (1) do we need to convert any unwritten extents in the range (2) do we need to check if we need to update the inode size based on the range passed to the completion handler We can use the private data passed to the get_block handler and the completion handler as a simple bitmask to communicate this information instead of the current complicated infrastructure reusing the ioends from the buffer I/O path, and thus avoiding a memory allocation and a context switch for any non-trivial direct write. As a nice side effect we also decouple the direct I/O path implementation from that of the buffered I/O path. 
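A condensed sketch of the new scheme (the real definitions and handlers are in the xfs_aops.c hunks below): two flag bits ride in the pointer-sized private cookie instead of an allocated ioend, and the completion handler decodes them again.

	#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)	/* unwritten extents to convert */
	#define XFS_DIO_FLAG_APPEND	(1 << 1)	/* may need to update i_size */

	/* mapping side (xfs_map_direct): tag the buffer_head private pointer */
	uintptr_t *map_flags = (uintptr_t *)&bh_result->b_private;
	if (ISUNWRITTEN(imap))
		*map_flags |= XFS_DIO_FLAG_UNWRITTEN;
	else if (offset + size > i_size_read(inode))
		*map_flags |= XFS_DIO_FLAG_APPEND;

	/* completion side (xfs_end_io_direct_write): decode the cookie */
	uintptr_t end_flags = (uintptr_t)private;
	if (end_flags == 0)
		return 0;	/* overwrite wholly within EOF, nothing to do */
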
Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_aops.c | 220 +++++++++++++++------------------------------ fs/xfs/xfs_trace.h | 9 +- 2 files changed, 77 insertions(+), 152 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 295aaffea78e..f008a4f9dcea 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,6 +36,10 @@ #include #include +/* flags for direct write completions */ +#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) +#define XFS_DIO_FLAG_APPEND (1 << 1) + void xfs_count_page_state( struct page *page, @@ -1238,27 +1242,8 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to attach an ioend that describes the - * type of write IO we are doing. This passes to the completion function the - * operations it needs to perform. If the mapping is for an overwrite wholly - * within the EOF then we don't need an ioend and so we don't allocate one. - * This avoids the unnecessary overhead of allocating and freeing ioends for - * workloads that don't require transactions on IO completion. - * - * If we get multiple mappings in a single IO, we might be mapping different - * types. But because the direct IO can only have a single private pointer, we - * need to ensure that: - * - * a) i) the ioend spans the entire region of unwritten mappings; or - * ii) the ioend spans all the mappings that cross or are beyond EOF; and - * b) if it contains unwritten extents, it is *permanently* marked as such - * - * We could do this by chaining ioends like buffered IO does, but we only - * actually get one IO completion callback from the direct IO, and that spans - * the entire IO regardless of how many mappings and IOs are needed to complete - * the DIO. There is only going to be one reference to the ioend and its life - * cycle is constrained by the DIO completion code. hence we don't need - * reference counting here. + * When we map a DIO buffer, we may need to pass flags to + * xfs_end_io_direct_write to tell it what kind of write IO we are doing. * * Note that for DIO, an IO to the highest supported file block offset (i.e. * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 @@ -1266,68 +1251,26 @@ xfs_vm_releasepage( * extending the file size. We won't know for sure until IO completion is run * and the actual max write offset is communicated to the IO completion * routine. - * - * For DAX page faults, we are preparing to never see unwritten extents here, - * nor should we ever extend the inode size. Hence we will soon have nothing to - * do here for this case, ensuring we don't have to provide an IO completion - * callback to free an ioend that we don't actually need for a fault into the - * page at offset (2^63 - 1FSB) bytes. */ - static void xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool dax_fault) + xfs_off_t offset) { - struct xfs_ioend *ioend; + uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; - int type; - if (ISUNWRITTEN(imap)) - type = XFS_IO_UNWRITTEN; - else - type = XFS_IO_OVERWRITE; + trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, + ISUNWRITTEN(imap) ? 
XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); - trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); - - if (dax_fault) { - ASSERT(type == XFS_IO_OVERWRITE); - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); - return; - } - - if (bh_result->b_private) { - ioend = bh_result->b_private; - ASSERT(ioend->io_size > 0); - ASSERT(offset >= ioend->io_offset); - if (offset + size > ioend->io_offset + ioend->io_size) - ioend->io_size = offset - ioend->io_offset + size; - - if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) - ioend->io_type = XFS_IO_UNWRITTEN; - - trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, - ioend->io_size, ioend->io_type, - imap); - } else if (type == XFS_IO_UNWRITTEN || - offset + size > i_size_read(inode) || - offset + size < 0) { - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_size = size; - - bh_result->b_private = ioend; + if (ISUNWRITTEN(imap)) { + *flags |= XFS_DIO_FLAG_UNWRITTEN; + set_buffer_defer_completion(bh_result); + } else if (offset + size > i_size_read(inode) || offset + size < 0) { + *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); - - trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, - imap); - } else { - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); } } @@ -1498,9 +1441,12 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) - xfs_map_direct(inode, bh_result, &imap, offset, - dax_fault); + if (create && direct) { + if (dax_fault) + ASSERT(!ISUNWRITTEN(&imap)); + else + xfs_map_direct(inode, bh_result, &imap, offset); + } } /* @@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault( return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } -static void -__xfs_end_io_direct_write( - struct inode *inode, - struct xfs_ioend *ioend, +/* + * Complete a direct I/O write request. + * + * xfs_map_direct passes us some flags in the private data to tell us what to + * do. If no flags are set, then the write IO is an overwrite wholly within + * the existing allocated file size and so there is nothing for us to do. + * + * Note that in this case the completion can be called in interrupt context, + * whereas if we have flags set we will always be called in task context + * (i.e. from a workqueue). + */ +STATIC int +xfs_end_io_direct_write( + struct kiocb *iocb, loff_t offset, - ssize_t size) + ssize_t size, + void *private) { - struct xfs_mount *mp = XFS_I(inode)->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + uintptr_t flags = (uintptr_t)private; + int error = 0; - if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) - goto out_end_io; + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (size <= 0) + return size; /* - * dio completion end_io functions are only called on writes if more - * than 0 bytes was written. - */ - ASSERT(size > 0); - - /* - * The ioend only maps whole blocks, while the IO may be sector aligned. - * Hence the ioend offset/size may not match the IO offset/size exactly. - * Because we don't map overwrites within EOF into the ioend, the offset - * may not match, but only if the endio spans EOF. Either way, write - * the IO sizes into the ioend so that completion processing does the - * right thing. 
- */ - ASSERT(offset + size <= ioend->io_offset + ioend->io_size); - ioend->io_size = size; - ioend->io_offset = offset; - - /* - * The ioend tells us whether we are doing unwritten extent conversion + * The flags tell us whether we are doing unwritten extent conversions * or an append transaction that updates the on-disk file size. These * cases are the only cases where we should *potentially* be needing * to update the VFS inode size. - * + */ + if (flags == 0) { + ASSERT(offset + size <= i_size_read(inode)); + return 0; + } + + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We * have no other method of updating EOF for AIO, so always do it here @@ -1616,58 +1570,30 @@ __xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&XFS_I(inode)->i_flags_lock); + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&XFS_I(inode)->i_flags_lock); + spin_unlock(&ip->i_flags_lock); - /* - * If we are doing an append IO that needs to update the EOF on disk, - * do the transaction reserve now so we can use common end io - * processing. Stashing the error (if there is one) in the ioend will - * result in the ioend processing passing on the error if it is - * possible as we can't return it from here. - */ - if (ioend->io_type == XFS_IO_OVERWRITE) - ioend->io_error = xfs_setfilesize_trans_alloc(ioend); + if (flags & XFS_DIO_FLAG_UNWRITTEN) { + trace_xfs_end_io_direct_write_unwritten(ip, offset, size); -out_end_io: - xfs_end_io(&ioend->io_work); - return; -} + error = xfs_iomap_write_unwritten(ip, offset, size); + } else if (flags & XFS_DIO_FLAG_APPEND) { + struct xfs_trans *tp; -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_ioend *ioend = private; + trace_xfs_end_io_direct_write_append(ip, offset, size); - if (size <= 0) - return 0; - - trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, - ioend ? 
ioend->io_type : 0, NULL); - - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp); + return error; + } + error = xfs_setfilesize(ip, tp, offset, size); } - __xfs_end_io_direct_write(inode, ioend, offset, size); - return 0; + return error; } static inline ssize_t diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 391d797cb53f..c8d58426008e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); +DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), From c19b104a67b3bb1ac48275a8a1c9df666e676c25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: [PATCH 12/66] xfs: fold xfs_vm_do_dio into xfs_vm_direct_IO Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f008a4f9dcea..8163910bc49f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1596,26 +1596,6 @@ xfs_end_io_direct_write( return error; } -static inline ssize_t -xfs_vm_do_dio( - struct inode *inode, - struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset, - dio_iodone_t endio, - int flags) -{ - struct block_device *bdev; - - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, offset, - xfs_get_blocks_direct, endio, 0); - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, endio, NULL, flags); -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1623,11 +1603,23 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; + dio_iodone_t *endio = NULL; + int flags = 0; + struct block_device *bdev; - if (iov_iter_rw(iter) == WRITE) - return xfs_vm_do_dio(inode, iocb, iter, offset, - xfs_end_io_direct_write, DIO_ASYNC_EXTEND); - return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); + if (iov_iter_rw(iter) == WRITE) { + endio = xfs_end_io_direct_write; + flags = DIO_ASYNC_EXTEND; + } + + if (IS_DAX(inode)) { + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + } + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); } /* From 196328ec973a74ee52cc282824e72c3824dc1cf5 Mon Sep 17 00:00:00 2001 From: Christoph 
Hellwig Date: Mon, 8 Feb 2016 14:58:07 +1100 Subject: [PATCH 13/66] xfs: handle errors from ->free_blocks in xfs_btree_kill_iroot Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a0eb18ce3ad3..3143577930bd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3209,6 +3209,7 @@ xfs_btree_kill_iroot( int level; int index; int numrecs; + int error; #ifdef DEBUG union xfs_btree_ptr ptr; int i; @@ -3272,8 +3273,6 @@ xfs_btree_kill_iroot( cpp = xfs_btree_ptr_addr(cur, 1, cblock); #ifdef DEBUG for (i = 0; i < numrecs; i++) { - int error; - error = xfs_btree_check_ptr(cur, cpp, i, level - 1); if (error) { XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); @@ -3283,7 +3282,11 @@ xfs_btree_kill_iroot( #endif xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); - cur->bc_ops->free_block(cur, cbp); + error = cur->bc_ops->free_block(cur, cbp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } XFS_BTREE_STATS_INC(cur, free); cur->bc_bufs[level - 1] = NULL; From c46ee8ad7856b58eeb383c30ce847897f85c4103 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:58:07 +1100 Subject: [PATCH 14/66] xfs: factor btree block freeing into a helper Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 3143577930bd..77afb4a899b9 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -294,6 +294,19 @@ xfs_btree_sblock_verify_crc( return true; } +static int +xfs_btree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + int error; + + error = cur->bc_ops->free_block(cur, bp); + if (!error) + XFS_BTREE_STATS_INC(cur, free); + return error; +} + /* * Delete the btree cursor. */ @@ -3282,12 +3295,11 @@ xfs_btree_kill_iroot( #endif xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); - error = cur->bc_ops->free_block(cur, cbp); + error = xfs_btree_free_block(cur, cbp); if (error) { XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } - XFS_BTREE_STATS_INC(cur, free); cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); @@ -3320,14 +3332,12 @@ xfs_btree_kill_root( */ cur->bc_ops->set_root(cur, newroot, -1); - error = cur->bc_ops->free_block(cur, bp); + error = xfs_btree_free_block(cur, bp); if (error) { XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } - XFS_BTREE_STATS_INC(cur, free); - cur->bc_bufs[level] = NULL; cur->bc_ra[level] = 0; cur->bc_nlevels--; @@ -3833,10 +3843,9 @@ xfs_btree_delrec( } /* Free the deleted block. */ - error = cur->bc_ops->free_block(cur, rbp); + error = xfs_btree_free_block(cur, rbp); if (error) goto error0; - XFS_BTREE_STATS_INC(cur, free); /* * If we joined with the left neighbor, set the buffer in the From edfd9dd549212a0923c9b5b142275dc88912abfa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:58:07 +1100 Subject: [PATCH 15/66] xfs: move buffer invalidation to xfs_btree_free_block ... instead of leaving it in the methods. 
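For illustration only, the shape of the consolidated free path that this patch and the two before it arrive at is roughly the following; this is a sketch assembled from the hunks in these three patches (the caller fragment comes from xfs_btree_delrec()), not a verbatim copy of the tree:

static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	/* the per-btree method frees the block and reports failure */
	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		/* buffer invalidation and stats accounting now live here */
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

Callers then propagate the error instead of dropping it:

	error = xfs_btree_free_block(cur, rbp);
	if (error)
		goto error0;
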
Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 -- fs/xfs/libxfs/xfs_bmap_btree.c | 1 - fs/xfs/libxfs/xfs_btree.c | 4 +++- fs/xfs/libxfs/xfs_ialloc_btree.c | 12 ++---------- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 444626ddbd1b..d9b42425291e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -118,8 +118,6 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - - xfs_trans_binval(cur->bc_tp, bp); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1637c37bfbaa..e37508ae589b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -531,7 +531,6 @@ xfs_bmbt_free_block( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, bp); return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 77afb4a899b9..1f88e1ce770f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -302,8 +302,10 @@ xfs_btree_free_block( int error; error = cur->bc_ops->free_block(cur, bp); - if (!error) + if (!error) { + xfs_trans_binval(cur->bc_tp, bp); XFS_BTREE_STATS_INC(cur, free); + } return error; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c679f3c05b63..89c21d771e35 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -125,16 +125,8 @@ xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - xfs_fsblock_t fsbno; - int error; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); - error = xfs_free_extent(cur->bc_tp, fsbno, 1); - if (error) - return error; - - xfs_trans_binval(cur->bc_tp, bp); - return error; + return xfs_free_extent(cur->bc_tp, + XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1); } STATIC int From de0b85a8cf24f8c535b7f719c454e3951298d17a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 8 Feb 2016 14:58:07 +1100 Subject: [PATCH 16/66] xfs: remove unused function definitions Old leftovers. 
Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.h | 1 - fs/xfs/xfs_fsops.h | 1 - fs/xfs/xfs_mount.h | 1 - 3 files changed, 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index b25bb9a343f3..961e6475a309 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, extern void xfs_perag_put(struct xfs_perag *pag); extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); -extern void xfs_sb_calc_crc(struct xfs_buf *bp); extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 1b6a98b66886..f32713f14f9a 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..a4e03ab50342 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -327,7 +327,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); From 18f1df4e00cea2eae41f3e5515b94d1e7127b2b6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Feb 2016 14:59:07 +1100 Subject: [PATCH 17/66] xfs: Make xfsaild freezeable again Hendrik has reported suspend failures due to xfsaild blocking the freezer from settling down. Jan 17 19:59:56 linux-6380 kernel: PM: Syncing filesystems ... done. Jan 17 19:59:56 linux-6380 kernel: PM: Preparing system for sleep (mem) Jan 17 19:59:56 linux-6380 kernel: Freezing user space processes ... (elapsed 0.001 seconds) done. Jan 17 19:59:56 linux-6380 kernel: Freezing remaining freezable tasks ... Jan 17 19:59:56 linux-6380 kernel: Freezing of tasks failed after 20.002 seconds (1 tasks refusing to freeze, wq_busy=0): Jan 17 19:59:56 linux-6380 kernel: xfsaild/dm-5 S 00000000 0 1293 2 0x00000080 Jan 17 19:59:56 linux-6380 kernel: f0ef5f00 00000046 00000200 00000000 ffff9022 c02d3800 00000000 00000032 Jan 17 19:59:56 linux-6380 kernel: ee0b2400 00000032 f71e0d00 f36fabc0 f0ef2d00 f0ef6000 f0ef2d00 f12f90c0 Jan 17 19:59:56 linux-6380 kernel: f0ef5f0c c0844e44 00000000 f0ef5f6c f811e0be 00000000 00000000 f0ef2d00 Jan 17 19:59:56 linux-6380 kernel: Call Trace: Jan 17 19:59:56 linux-6380 kernel: [] schedule+0x34/0x90 Jan 17 19:59:56 linux-6380 kernel: [] xfsaild+0x5de/0x600 [xfs] Jan 17 19:59:56 linux-6380 kernel: [] kthread+0x9b/0xb0 Jan 17 19:59:56 linux-6380 kernel: [] ret_from_kernel_thread+0x21/0x38 The issue has been there for quite some time but it has been made visible only by 24ba16bb3d49 ("xfs: clear PF_NOFREEZE for xfsaild kthread") because the suspend started seeing xfsaild.
The above commit has missed that the !xfs_ail_min branch might call schedule with TASK_INTERRUPTIBLE without calling try_to_freeze, so the pm suspend would wake up the kernel thread over and over again without any progress. What we want here is to use freezable_schedule instead to hide the thread from the suspend. While we are here, also change schedule_timeout to the freezable variant to prevent spurious wakeups by suspend. [dchinner: re-add set_freezable() call so the freezer will account properly for this kthread. ] Reported-by: Hendrik Woltersdorf Signed-off-by: Michal Hocko Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans_ail.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 4f18fd92ca13..d6c9c3e9e02b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,6 +497,7 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; + set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) @@ -519,14 +520,14 @@ xfsaild( if (!xfs_ail_min(ailp) && ailp->xa_target == ailp->xa_target_prev) { spin_unlock(&ailp->xa_lock); - schedule(); + freezable_schedule(); tout = 0; continue; } spin_unlock(&ailp->xa_lock); if (tout) - schedule_timeout(msecs_to_jiffies(tout)); + freezable_schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); From 244efeafb65ad4d98cd0c9463631e3931d813a6e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 8 Feb 2016 15:00:01 +1100 Subject: [PATCH 18/66] xfs: move struct xfs_attr_shortform to xfs_da_format.h Move the shortform attr structure definition to the same place as the other attribute structure definitions for consistency and also so that xfs/122 verifies the structure size. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_sf.h | 16 ---------------- fs/xfs/libxfs/xfs_da_format.h | 16 ++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 1 + 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 919756e3ba53..90928bbe693c 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -24,22 +24,6 @@ * Small attribute lists are packed as tightly as possible so as * to fit into the literal area of the inode.
+ */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; + typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ __be16 size; /* length of free region */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0defbd02f62d..ef22a78fb569 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_attr_sf.h" +#include "xfs_da_format.h" kmem_zone_t *xfs_ifork_zone; From 60630fe66ed28d43379382645ed349f7d3457330 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Feb 2016 15:00:02 +1100 Subject: [PATCH 19/66] xfs: clean up unwritten buffers on write failure The xfs_vm_write_failed() handler is currently responsible for cleaning up any delalloc blocks over the range of a failed write beyond EOF. Failure to do so results in warning messages and other inconsistencies between buffer and extent state. The ->releasepage() handler currently warns in the event of a page being released with either unwritten or delalloc buffers, as neither is ever expected by the time a page is released. As has been reproduced by generic/083 on a -bsize=1k fs, it is currently possible to trigger the ->releasepage() warning for a page with unwritten buffers when a filesystem is near ENOSPC. This is reproduced by the following sequence: $ mkfs.xfs -f -b size=1k -d size=100m $ mount /mnt/ $ $ xfs_io -fc "falloc -k 0 1k" /mnt/file $ dd if=/dev/zero of=/mnt/enospc conv=notrunc oflag=append $ $ xfs_io -c "pwrite 512 1k" /mnt/file $ xfs_io -d -c "pwrite 16k 1k" /mnt/file The first pwrite command attempts a block unaligned write across an unwritten block and a hole. The delalloc for the hole fails with ENOSPC and the subsequent error handling does not clean up the unwritten buffer that was instantiated during the first ->get_block() call. The second pwrite triggers a warning as part of the inode mapping invalidation that occurs prior to direct I/O. The releasepage() handler detects the unwritten buffer at this time, warns and prevents the release of the page. To deal with this problem, update xfs_vm_write_failed() to clean up unwritten as well as delalloc buffers that are beyond EOF and within the range of the failed write. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..4a13f5311a41 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1783,14 +1783,22 @@ xfs_vm_write_failed( if (block_start >= to) break; - if (!buffer_delay(bh)) + /* + * Process delalloc and unwritten buffers beyond EOF. We can + * encounter unwritten buffers in the event that a file has + * post-EOF unwritten extents and an extending write happens to + * fail (e.g., an unaligned write that also involves a delalloc + * to the same page). 
+ */ + if (!buffer_delay(bh) && !buffer_unwritten(bh)) continue; if (!buffer_new(bh) && block_offset < i_size_read(inode)) continue; - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); + if (buffer_delay(bh)) + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); /* * This buffer does not contain data anymore. make sure anyone @@ -1801,6 +1809,7 @@ xfs_vm_write_failed( clear_buffer_mapped(bh); clear_buffer_new(bh); clear_buffer_dirty(bh); + clear_buffer_unwritten(bh); } } From af055e37a91d215d7174d0b84c86795ca81086a7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Feb 2016 15:00:02 +1100 Subject: [PATCH 20/66] xfs: fix xfs_log_ticket leak in xfs_end_io() after fs shutdown If the filesystem has shut down, xfs_end_io() currently sets an error on the ioend and proceeds to ioend destruction. The ioend might contain a truncate transaction if the I/O extended the size of the file. This transaction is only cleaned up in xfs_setfilesize_ioend(), however, which is skipped in this case. This results in an xfs_log_ticket leak message when the associated cache slab is destroyed (e.g., on rmmod). This was originally reproduced by xfs/141 on a distro kernel. The problem is reproducible on an upstream kernel, but not easily detected in current upstream if the xfs_log_ticket cache happens to be merged with another cache. This can be reproduced more deterministically with the 'slab_nomerge' kernel boot option. Update xfs_end_io() to proceed with normal end I/O processing after an error is set on an ioend due to fs shutdown. The I/O type-based processing is already designed to handle an I/O error and ensure that the ioend is cleaned up correctly. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4a13f5311a41..0c8dacea411e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -214,10 +214,12 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + /* + * Set an error if the mount has shut down and proceed with end I/O + * processing so it can perform whatever cleanups are necessary. + */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ioend->io_error = -EIO; - goto done; - } /* * For unwritten extents we need to issue transactions to convert a From f67ca6eca89cddd355c83639a90109e245f9d5a7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:41:31 +1100 Subject: [PATCH 21/66] xfs: RT bitmap and summary buffers are not typed When logging buffers, we attach a type to them that follows the buffer all the way into the log and is used to identify the buffer contents in log recovery. Both the realtime summary buffers and the bitmap buffers do not have types defined or set, so when we try to log them we see an assert failure: XFS: Assertion failed: (bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF), file: fs/xfs/xfs_buf_item.c, line: 294 Fix this by adding buffer log format types for these buffers, and add identification support into log recovery for them. Only build the log recovery support if CONFIG_XFS_RT=y - we can't get into log recovery for real time filesystems if support is not built into the kernel, and this avoids potential build problems.
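As a rough usage sketch of the two halves of this fix (both fragments are simplified from the hunks below, with the surrounding functions elided), the logging side stamps the buffer with a type after reading it, and recovery maps that type back to a choice of buffer ops:

	/* in xfs_rtbuf_get(): tag the RT buffer so its type follows it into the log */
	xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
					     : XFS_BLFT_RTBITMAP_BUF);

	/* in xlog_recover_validate_buf_type(): recognise the new types */
#ifdef CONFIG_XFS_RT
	case XFS_BLFT_RTBITMAP_BUF:
	case XFS_BLFT_RTSUMMARY_BUF:
		/* no verification of RT buffers is done */
		bp->b_ops = NULL;
		break;
#endif /* CONFIG_XFS_RT */
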
Signed-off-by: Dave Chinner Tested-by: Ross Zwisler Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_log_format.h | 2 ++ fs/xfs/libxfs/xfs_rtbitmap.c | 3 +++ fs/xfs/xfs_log_recover.c | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 265314690415..1be26ecdbebf 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -495,6 +495,8 @@ enum xfs_blft { XFS_BLFT_ATTR_LEAF_BUF, XFS_BLFT_ATTR_RMT_BUF, XFS_BLFT_SB_BUF, + XFS_BLFT_RTBITMAP_BUF, + XFS_BLFT_RTSUMMARY_BUF, XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), }; diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 9b59ffa1fc19..377a2e6552f3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -71,6 +71,9 @@ xfs_rtbuf_get( mp->m_bsize, 0, &bp, NULL); if (error) return error; + + xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF + : XFS_BLFT_RTBITMAP_BUF); *bpp = bp; return 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb76f6e..afdd326df923 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2473,6 +2473,13 @@ xlog_recover_validate_buf_type( } bp->b_ops = &xfs_sb_buf_ops; break; +#ifdef CONFIG_XFS_RT + case XFS_BLFT_RTBITMAP_BUF: + case XFS_BLFT_RTSUMMARY_BUF: + /* no verification of RT buffers is done */ + bp->b_ops = NULL; + break; +#endif /* CONFIG_XFS_RT */ default: xfs_warn(mp, "Unknown buffer type %d!", xfs_blft_from_flags(buf_f)); From bf85e0998ae8bafc1e0863d914df3be2b1bc372a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:41:45 +1100 Subject: [PATCH 22/66] xfs: RT bitmap and summary buffers need verifiers Buffers without verifiers issue runtime warnings on XFS. We don't have anything we can actually verify in the RT buffers (no CRCs, no magic numbers, etc), but we still need verifiers to avoid the warnings. Add a set of dummy verifier operations for the realtime buffers and apply them in the appropriate places. Signed-off-by: Dave Chinner Tested-by: Ross Zwisler Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 27 ++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_shared.h | 1 + fs/xfs/xfs_log_recover.c | 4 ++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 377a2e6552f3..33806e0c2bc6 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -41,6 +41,31 @@ * Realtime allocator bitmap functions shared with userspace. */ +/* + * Real time buffers need verifiers to avoid runtime warnings during IO. + * We don't have anything to verify, however, so these are just dummy + * operations. + */ +static void +xfs_rtbuf_verify_read( + struct xfs_buf *bp) +{ + return; +} + +static void +xfs_rtbuf_verify_write( + struct xfs_buf *bp) +{ + return; +} + +const struct xfs_buf_ops xfs_rtbuf_ops = { + .name = "rtbuf", + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, +}; + /* * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked.
@@ -68,7 +93,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, NULL); + mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 15c3ceb845b9..81ac870834da 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -53,6 +53,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; /* * Transaction types. Used to distinguish types of buffers. These never reach diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index afdd326df923..ed8c88917300 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2476,8 +2476,8 @@ xlog_recover_validate_buf_type( #ifdef CONFIG_XFS_RT case XFS_BLFT_RTBITMAP_BUF: case XFS_BLFT_RTSUMMARY_BUF: - /* no verification of RT buffers is done */ - bp->b_ops = NULL; + /* no magic numbers for verification of RT buffers */ + bp->b_ops = &xfs_rtbuf_ops; break; #endif /* CONFIG_XFS_RT */ default: From f8d55aa0523ad0f78979c222ed18b78ea7be793a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 23/66] xfs: introduce inode log format object We currently carry around and log an entire inode core in the struct xfs_inode. A lot of the information in the inode core is duplicated in the VFS inode, but we cannot remove this duplication of information because the inode core is logged directly in xfs_inode_item_format(). Add a new function xfs_inode_item_format_core() that copies the inode core data into a struct xfs_icdinode that is pulled directly from the log vector buffer. This means we no longer directly copy the inode core, but copy the structures one member at a time. This will be slightly less efficient than copying, but will allow us to remove duplicate and unnecessary items from the struct xfs_inode. To enable us to do this, call the new structure a xfs_log_dinode, so that we know it's different to the physical xfs_dinode and the in-core xfs_icdinode.
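A condensed sketch of the new formatting path described above (the field copies are heavily abbreviated here; the full member-by-member copy routine is in the diff below):

static void
xfs_inode_item_format_core(
	struct xfs_inode	*ip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp)
{
	struct xfs_log_dinode	*dic;

	/* the log dinode is laid out directly in the log vector buffer */
	dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);

	/* copy the core one member at a time instead of memcpy()ing it */
	dic->di_magic = ip->i_d.di_magic;
	dic->di_mode = ip->i_d.di_mode;
	/* ... the remaining icdinode fields are copied the same way ... */

	xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
}
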
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 8 +-- fs/xfs/libxfs/xfs_inode_buf.h | 53 +++++++++++++- fs/xfs/libxfs/xfs_log_format.h | 15 ++-- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 128 +++++++++++++++++++++++++++++++-- fs/xfs/xfs_inode_item.h | 2 + fs/xfs/xfs_log_recover.c | 52 +++++++------- 8 files changed, 218 insertions(+), 44 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 1aabfda669b0..63d46bf65540 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -196,8 +196,8 @@ xfs_imap_to_bp( void xfs_dinode_from_disk( - xfs_icdinode_t *to, - xfs_dinode_t *from) + struct xfs_icdinode *to, + struct xfs_dinode *from) { to->di_magic = be16_to_cpu(from->di_magic); to->di_mode = be16_to_cpu(from->di_mode); @@ -243,8 +243,8 @@ xfs_dinode_from_disk( void xfs_dinode_to_disk( - xfs_dinode_t *to, - xfs_icdinode_t *from) + struct xfs_dinode *to, + struct xfs_icdinode *from) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 9308c47f2a52..642f2a297c26 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -20,7 +20,58 @@ struct xfs_inode; struct xfs_dinode; -struct xfs_icdinode; + +/* + * In memory representation of the XFS inode. This is held in the in-core + * struct xfs_inode to represent the on disk values, but no longer needs to be + * identical to the on-disk structure as it is always translated to on-disk + * format specific structures at the appropriate time. + */ +struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +}; /* * Inode location information. Stored in the inode and passed to diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 265314690415..d00ed639e0bc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 { __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; + /* * Flags for xfs_trans_log_inode flags field. */ @@ -360,10 +361,10 @@ typedef struct xfs_ictimestamp { } xfs_ictimestamp_t; /* - * NOTE: This structure must be kept identical to struct xfs_dinode - * except for the endianness annotations. + * Define the format of the inode core that is logged. This structure must be + * kept identical to struct xfs_dinode except for the endianness annotations. */ -typedef struct xfs_icdinode { +struct xfs_log_dinode { __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ @@ -407,13 +408,13 @@ typedef struct xfs_icdinode { uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ -} xfs_icdinode_t; +}; -static inline uint xfs_icdinode_size(int version) +static inline uint xfs_log_dinode_size(int version) { if (version == 3) - return sizeof(struct xfs_icdinode); - return offsetof(struct xfs_icdinode, di_next_unlinked); + return sizeof(struct xfs_log_dinode); + return offsetof(struct xfs_log_dinode, di_next_unlinked); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d7a490f24ead..7c26f8611891 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -79,7 +79,7 @@ xfs_inode_alloc( memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + memset(&ip->i_d, 0, sizeof(ip->i_d)); return ip; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca9e11989cbd..aef5452b1a90 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -63,7 +63,7 @@ typedef struct xfs_inode { unsigned long i_flags; /* see defined flags below */ unsigned int i_delayed_blks; /* count of delay alloc blks */ - xfs_icdinode_t i_d; /* most of ondisk inode */ + struct xfs_icdinode i_d; /* most of ondisk inode */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d14b12b8cfef..3ad997278869 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -135,7 +135,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_d.di_version); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -322,6 +322,127 @@ 
xfs_inode_item_format_attr_fork( } } +static void +xfs_icdinode_to_log_dinode( + struct xfs_icdinode *from, + struct xfs_log_dinode *to) +{ + to->di_magic = from->di_magic; + to->di_mode = from->di_mode; + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = from->di_onlink; + to->di_uid = from->di_uid; + to->di_gid = from->di_gid; + to->di_nlink = from->di_nlink; + to->di_projid_lo = from->di_projid_lo; + to->di_projid_hi = from->di_projid_hi; + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = from->di_atime.t_sec; + to->di_atime.t_nsec = from->di_atime.t_nsec; + to->di_mtime.t_sec = from->di_mtime.t_sec; + to->di_mtime.t_nsec = from->di_mtime.t_nsec; + to->di_ctime.t_sec = from->di_ctime.t_sec; + to->di_ctime.t_nsec = from->di_ctime.t_nsec; + to->di_size = from->di_size; + to->di_nblocks = from->di_nblocks; + to->di_extsize = from->di_extsize; + to->di_nextents = from->di_nextents; + to->di_anextents = from->di_anextents; + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = from->di_dmevmask; + to->di_dmstate = from->di_dmstate; + to->di_flags = from->di_flags; + to->di_gen = from->di_gen; + + if (from->di_version == 3) { + to->di_changecount = from->di_changecount; + to->di_crtime.t_sec = from->di_crtime.t_sec; + to->di_crtime.t_nsec = from->di_crtime.t_nsec; + to->di_flags2 = from->di_flags2; + to->di_ino = from->di_ino; + to->di_lsn = from->di_lsn; + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = from->di_flushiter; + } +} + +/* + * Recovery needs to be able to convert a log dinode back to a real dinode + * for writeback we do that by converting a log dinode to a icdinode, and + * then passing that to the formatting function. + */ +void +xfs_log_dinode_to_icdinode( + struct xfs_log_dinode *from, + struct xfs_icdinode *to) +{ + to->di_magic = from->di_magic; + to->di_mode = from->di_mode; + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = from->di_onlink; + to->di_uid = from->di_uid; + to->di_gid = from->di_gid; + to->di_nlink = from->di_nlink; + to->di_projid_lo = from->di_projid_lo; + to->di_projid_hi = from->di_projid_hi; + memset(to->di_pad, 0, sizeof(to->di_pad)); + to->di_atime.t_sec = from->di_atime.t_sec; + to->di_atime.t_nsec = from->di_atime.t_nsec; + to->di_mtime.t_sec = from->di_mtime.t_sec; + to->di_mtime.t_nsec = from->di_mtime.t_nsec; + to->di_ctime.t_sec = from->di_ctime.t_sec; + to->di_ctime.t_nsec = from->di_ctime.t_nsec; + to->di_size = from->di_size; + to->di_nblocks = from->di_nblocks; + to->di_extsize = from->di_extsize; + to->di_nextents = from->di_nextents; + to->di_anextents = from->di_anextents; + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = from->di_dmevmask; + to->di_dmstate = from->di_dmstate; + to->di_flags = from->di_flags; + to->di_gen = from->di_gen; + + if (from->di_version == 3) { + to->di_changecount = from->di_changecount; + to->di_crtime.t_sec = from->di_crtime.t_sec; + to->di_crtime.t_nsec = from->di_crtime.t_nsec; + to->di_flags2 = from->di_flags2; + to->di_ino = from->di_ino; + to->di_lsn = from->di_lsn; + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = from->di_flushiter; + } +} + +/* + * Format the inode core. 
Current timestamp data is only in the VFS inode + * fields, so we need to grab them from there. Hence rather than just copying + * the XFS inode core structure, format the fields directly into the iovec. + */ +static void +xfs_inode_item_format_core( + struct xfs_inode *ip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_log_dinode *dic; + + dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); + xfs_icdinode_to_log_dinode(&ip->i_d, dic); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); +} + /* * This is called to fill in the vector of log iovecs for the given inode * log item. It fills the first item with an inode log format structure, @@ -351,10 +472,7 @@ xfs_inode_item_format( ilf->ilf_size = 2; /* format + core */ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, - &ip->i_d, - xfs_icdinode_size(ip->i_d.di_version)); - + xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); if (XFS_IFORK_Q(ip)) { xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4c7722e325b3..24261180ba66 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -49,6 +49,8 @@ extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); +extern void xfs_log_dinode_to_icdinode(struct xfs_log_dinode *from, + struct xfs_icdinode *to); extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb76f6e..3120f7bbb180 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2839,7 +2839,8 @@ xlog_recover_inode_pass2( int error; int attr_index; uint fields; - xfs_icdinode_t *dicp; + struct xfs_log_dinode *ldip; + struct xfs_icdinode icic; uint isize; int need_free = 0; @@ -2892,8 +2893,8 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - dicp = item->ri_buf[1].i_addr; - if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + ldip = item->ri_buf[1].i_addr; + if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); @@ -2929,13 +2930,13 @@ xlog_recover_inode_pass2( * to skip replay when the on disk inode is newer than the log one */ if (!xfs_sb_version_hascrc(&mp->m_sb) && - dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { trace_xfs_log_recover_inode_skip(log, in_f); @@ -2945,13 +2946,13 @@ xlog_recover_inode_pass2( } /* Take the opportunity to reset the flush iteration count */ - dicp->di_flushiter = 0; + ldip->di_flushiter = 0; - if (unlikely(S_ISREG(dicp->di_mode))) { - if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && - (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + if (unlikely(S_ISREG(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad regular 
inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", @@ -2959,12 +2960,12 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - } else if (unlikely(S_ISDIR(dicp->di_mode))) { - if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && - (dicp->di_format != XFS_DINODE_FMT_BTREE) && - (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + } else if (unlikely(S_ISDIR(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE) && + (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", @@ -2973,32 +2974,32 @@ xlog_recover_inode_pass2( goto out_release; } } - if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, - dicp->di_nextents + dicp->di_anextents, - dicp->di_nblocks); + ldip->di_nextents + ldip->di_anextents, + ldip->di_nblocks); error = -EFSCORRUPTED; goto out_release; } - if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } - isize = xfs_icdinode_size(dicp->di_version); + isize = xfs_log_dinode_size(ldip->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); @@ -3007,7 +3008,8 @@ xlog_recover_inode_pass2( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, dicp); + xfs_log_dinode_to_icdinode(ldip, &icic); + xfs_dinode_to_disk(dip, &icic); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > isize) { From 3987848c7c2be112e03c82d03821b044f1c0edec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 24/66] xfs: remove timestamps from incore inode The struct xfs_inode has two copies of the current timestamps in it, one in the vfs inode and one in the struct xfs_icdinode. Now that we no longer log the struct xfs_icdinode directly, we don't need to keep the timestamps in this structure. instead we can copy them straight out of the VFS inode when formatting the inode log item or the on-disk inode. This reduces the struct xfs_inode in size by 24 bytes. 
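A short sketch of the timestamp handling after this change (simplified from the xfs_inode_to_disk() hunk below): timestamps now come from the VFS inode at format time rather than from a second copy in ip->i_d.

	struct inode		*inode = VFS_I(ip);

	/* read the authoritative timestamps from the VFS inode ... */
	to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
	to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
	to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
	to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
	to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
	to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
	/* ... and no longer mirror them into ip->i_d anywhere else */
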
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 91 ++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_inode_buf.h | 9 ++-- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/xfs_inode.c | 21 ++++---- fs/xfs/xfs_inode_item.c | 73 +++++----------------------- fs/xfs/xfs_inode_item.h | 2 - fs/xfs/xfs_iops.c | 37 +++----------- fs/xfs/xfs_itable.c | 16 +++--- fs/xfs/xfs_log_recover.c | 8 ++- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_trans_inode.c | 12 +---- 11 files changed, 130 insertions(+), 143 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 63d46bf65540..e832bb2ab4f9 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -195,10 +195,13 @@ xfs_imap_to_bp( } void -xfs_dinode_from_disk( - struct xfs_icdinode *to, +xfs_inode_from_disk( + struct xfs_inode *ip, struct xfs_dinode *from) { + struct xfs_icdinode *to = &ip->i_d; + struct inode *inode = VFS_I(ip); + to->di_magic = be16_to_cpu(from->di_magic); to->di_mode = be16_to_cpu(from->di_mode); to->di_version = from ->di_version; @@ -211,12 +214,20 @@ xfs_dinode_from_disk( to->di_projid_hi = be16_to_cpu(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_flushiter = be16_to_cpu(from->di_flushiter); - to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); - to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); - to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); - to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); - to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); - to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + + /* + * Time is signed, so need to convert to signed 32 bit before + * storing in inode timestamp which may be 64 bit. Otherwise + * a time before epoch is converted to a time long after epoch + * on 64 bit systems. 
+ */ + inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); + inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); + inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); + inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); + inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); + inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); to->di_extsize = be32_to_cpu(from->di_extsize); @@ -242,9 +253,63 @@ xfs_dinode_from_disk( } void -xfs_dinode_to_disk( - struct xfs_dinode *to, - struct xfs_icdinode *from) +xfs_inode_to_disk( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + struct xfs_icdinode *from = &ip->i_d; + struct inode *inode = VFS_I(ip); + + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + + to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); + to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); + to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); + to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); + to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); + to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); @@ -257,12 +322,14 @@ xfs_dinode_to_disk( to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = 
cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); @@ -403,7 +470,7 @@ xfs_iread( * Otherwise, just get the truly permanent information. */ if (dip->di_mode) { - xfs_dinode_from_disk(&ip->i_d, dip); + xfs_inode_from_disk(ip, dip); error = xfs_iformat_fork(ip, dip); if (error) { #ifdef DEBUG diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 642f2a297c26..adcc9bfe02e9 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -40,9 +40,6 @@ struct xfs_icdinode { __uint16_t di_projid_hi; /* higher part of owner's project id */ __uint8_t di_pad[6]; /* unused, zeroed space */ __uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -89,8 +86,10 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); -void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); -void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); +void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to); +void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); +void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, + struct xfs_dinode *to); #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 9b59ffa1fc19..acc71dd36a2b 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -983,7 +983,7 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceba1a83cacc..9ad9e355a1c0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -766,6 +766,7 @@ xfs_ialloc( uint flags; int error; struct timespec tv; + struct inode *inode; /* * Call the space management code to pick @@ -791,6 +792,7 @@ xfs_ialloc( if (error) return error; ASSERT(ip != NULL); + inode = VFS_I(ip); /* * We always convert v1 inodes to v2 now - we only support filesystems @@ -832,10 +834,9 @@ xfs_ialloc( ASSERT(ip->i_d.di_nblocks == 0); tv = current_fs_time(mp->m_super); - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; - ip->i_d.di_atime = ip->i_d.di_mtime; - ip->i_d.di_ctime = ip->i_d.di_mtime; + inode->i_mtime = tv; + inode->i_atime = tv; + inode->i_ctime = tv; /* * di_gen will have been taken care of in xfs_iread. @@ -853,7 +854,8 @@ xfs_ialloc( ip->i_d.di_lsn = 0; ip->i_d.di_flags2 = 0; memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); - ip->i_d.di_crtime = ip->i_d.di_mtime; + ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -3523,12 +3525,11 @@ xfs_iflush_int( ip->i_d.di_flushiter++; /* - * Copy the dirty parts of the inode into the on-disk - * inode. 
We always copy out the core of the inode, - * because if the inode is dirty at all the core must - * be. + * Copy the dirty parts of the inode into the on-disk inode. We always + * copy out the core of the inode, because if the inode is dirty at all + * the core must be. */ - xfs_dinode_to_disk(dip, &ip->i_d); + xfs_inode_to_disk(ip, dip); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 3ad997278869..9dcbf584a336 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -323,10 +323,13 @@ xfs_inode_item_format_attr_fork( } static void -xfs_icdinode_to_log_dinode( - struct xfs_icdinode *from, +xfs_inode_to_log_dinode( + struct xfs_inode *ip, struct xfs_log_dinode *to) { + struct xfs_icdinode *from = &ip->i_d; + struct inode *inode = VFS_I(ip); + to->di_magic = from->di_magic; to->di_mode = from->di_mode; to->di_version = from->di_version; @@ -338,66 +341,14 @@ xfs_icdinode_to_log_dinode( to->di_projid_lo = from->di_projid_lo; to->di_projid_hi = from->di_projid_hi; memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_atime.t_sec = from->di_atime.t_sec; - to->di_atime.t_nsec = from->di_atime.t_nsec; - to->di_mtime.t_sec = from->di_mtime.t_sec; - to->di_mtime.t_nsec = from->di_mtime.t_nsec; - to->di_ctime.t_sec = from->di_ctime.t_sec; - to->di_ctime.t_nsec = from->di_ctime.t_nsec; - to->di_size = from->di_size; - to->di_nblocks = from->di_nblocks; - to->di_extsize = from->di_extsize; - to->di_nextents = from->di_nextents; - to->di_anextents = from->di_anextents; - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = from->di_dmevmask; - to->di_dmstate = from->di_dmstate; - to->di_flags = from->di_flags; - to->di_gen = from->di_gen; - if (from->di_version == 3) { - to->di_changecount = from->di_changecount; - to->di_crtime.t_sec = from->di_crtime.t_sec; - to->di_crtime.t_nsec = from->di_crtime.t_nsec; - to->di_flags2 = from->di_flags2; - to->di_ino = from->di_ino; - to->di_lsn = from->di_lsn; - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = from->di_flushiter; - } -} + to->di_atime.t_sec = inode->i_atime.tv_sec; + to->di_atime.t_nsec = inode->i_atime.tv_nsec; + to->di_mtime.t_sec = inode->i_mtime.tv_sec; + to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; + to->di_ctime.t_sec = inode->i_ctime.tv_sec; + to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; -/* - * Recovery needs to be able to convert a log dinode back to a real dinode - * for writeback we do that by converting a log dinode to a icdinode, and - * then passing that to the formatting function. 
- */ -void -xfs_log_dinode_to_icdinode( - struct xfs_log_dinode *from, - struct xfs_icdinode *to) -{ - to->di_magic = from->di_magic; - to->di_mode = from->di_mode; - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = from->di_onlink; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; - to->di_nlink = from->di_nlink; - to->di_projid_lo = from->di_projid_lo; - to->di_projid_hi = from->di_projid_hi; - memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = from->di_atime.t_sec; - to->di_atime.t_nsec = from->di_atime.t_nsec; - to->di_mtime.t_sec = from->di_mtime.t_sec; - to->di_mtime.t_nsec = from->di_mtime.t_nsec; - to->di_ctime.t_sec = from->di_ctime.t_sec; - to->di_ctime.t_nsec = from->di_ctime.t_nsec; to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; to->di_extsize = from->di_extsize; @@ -439,7 +390,7 @@ xfs_inode_item_format_core( struct xfs_log_dinode *dic; dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); - xfs_icdinode_to_log_dinode(&ip->i_d, dic); + xfs_inode_to_log_dinode(ip, dic); xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 24261180ba66..4c7722e325b3 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -49,8 +49,6 @@ extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); -extern void xfs_log_dinode_to_icdinode(struct xfs_log_dinode *from, - struct xfs_icdinode *to); extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 76b71a1c6c32..cd27c6d56dfc 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -522,21 +522,12 @@ xfs_setattr_time( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (iattr->ia_valid & ATTR_ATIME) { + if (iattr->ia_valid & ATTR_ATIME) inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (iattr->ia_valid & ATTR_CTIME) { + if (iattr->ia_valid & ATTR_CTIME) inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (iattr->ia_valid & ATTR_MTIME) { + if (iattr->ia_valid & ATTR_MTIME) inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } } int @@ -991,21 +982,13 @@ xfs_vn_update_time( } xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) { + if (flags & S_CTIME) inode->i_ctime = *now; - ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; - } - if (flags & S_MTIME) { + if (flags & S_MTIME) inode->i_mtime = *now; - ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; - } - if (flags & S_ATIME) { + if (flags & S_ATIME) inode->i_atime = *now; - ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; - } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); return xfs_trans_commit(tp); @@ -1251,12 +1234,6 @@ xfs_setup_inode( inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = 
ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); ip->d_ops = ip->i_mount->m_nondir_inode_ops; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 930ebd86beba..2acda42319f5 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -57,6 +57,7 @@ xfs_bulkstat_one_int( { struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; struct xfs_bstat *buf; /* return buffer */ int error = 0; /* error value */ @@ -77,6 +78,7 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); + inode = VFS_I(ip); dic = &ip->i_d; @@ -91,12 +93,14 @@ xfs_bulkstat_one_int( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - buf->bs_atime.tv_sec = dic->di_atime.t_sec; - buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3120f7bbb180..4b79cf006589 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2793,7 +2793,7 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ - xfs_dinode_from_disk(&ip->i_d, dip); + xfs_inode_from_disk(ip, dip); ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); @@ -2840,7 +2840,6 @@ xlog_recover_inode_pass2( int attr_index; uint fields; struct xfs_log_dinode *ldip; - struct xfs_icdinode icic; uint isize; int need_free = 0; @@ -3007,9 +3006,8 @@ xlog_recover_inode_pass2( goto out_release; } - /* The core is in in-core format */ - xfs_log_dinode_to_icdinode(ldip, &icic); - xfs_dinode_to_disk(dip, &icic); + /* recover the log dinode inode into the on disk inode */ + xfs_log_dinode_to_disk(ldip, dip); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > isize) { diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index be02a68b2fe2..abf44435d04a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1272,7 +1272,7 @@ xfs_rtpick_extent( ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index b97f1df910ab..3f0d46655ce5 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -75,18 +75,10 @@ xfs_trans_ichgtime( tv = current_fs_time(inode->i_sb); - if ((flags & XFS_ICHGTIME_MOD) && - !timespec_equal(&inode->i_mtime, &tv)) { + if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = tv.tv_sec; - ip->i_d.di_mtime.t_nsec = tv.tv_nsec; - } - if ((flags & XFS_ICHGTIME_CHG) && - !timespec_equal(&inode->i_ctime, &tv)) { + if (flags & 
XFS_ICHGTIME_CHG) inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = tv.tv_sec; - ip->i_d.di_ctime.t_nsec = tv.tv_nsec; - } } /* From 93f958f9c41f0bfd10627a2279457df64004d957 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 25/66] xfs: cull unnecessary icdinode fields Now that the struct xfs_icdinode is not directly related to the on-disk format, we can cull things in it we really don't need to store: - magic number never changes - padding is not necessary - next_unlinked is never used - inode number is redundant - uuid is redundant - lsn is accessed directly from dinode - inode CRC is only accessed directly from dinode Hence we can remove these from the struct xfs_icdinode and redirect the code that uses them to the xfs_dinode appripriately. This reduces the size of the struct icdinode from 152 bytes to 88 bytes, and removes a fair chunk of unnecessary code, too. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 39 ++++++++++++----------------------- fs/xfs/libxfs/xfs_inode_buf.h | 25 ++++++---------------- fs/xfs/xfs_inode.c | 19 +---------------- fs/xfs/xfs_inode_item.c | 19 ++++++++++------- 4 files changed, 31 insertions(+), 71 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index e832bb2ab4f9..c0fe062b567a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -202,7 +202,6 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - to->di_magic = be16_to_cpu(from->di_magic); to->di_mode = be16_to_cpu(from->di_mode); to->di_version = from ->di_version; to->di_format = from->di_format; @@ -212,7 +211,6 @@ xfs_inode_from_disk( to->di_nlink = be32_to_cpu(from->di_nlink); to->di_projid_lo = be16_to_cpu(from->di_projid_lo); to->di_projid_hi = be16_to_cpu(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -245,24 +243,22 @@ xfs_inode_from_disk( to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_ino = be64_to_cpu(from->di_ino); - to->di_lsn = be64_to_cpu(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); } } void xfs_inode_to_disk( struct xfs_inode *ip, - struct xfs_dinode *to) + struct xfs_dinode *to, + xfs_lsn_t lsn) { struct xfs_icdinode *from = &ip->i_d; struct inode *inode = VFS_I(ip); - to->di_magic = cpu_to_be16(from->di_magic); + to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; + to->di_version = from->di_version; to->di_format = from->di_format; to->di_onlink = cpu_to_be16(from->di_onlink); to->di_uid = cpu_to_be32(from->di_uid); @@ -270,8 +266,8 @@ xfs_inode_to_disk( to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); @@ -296,10 +292,11 @@ xfs_inode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); 
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); + + to->di_ino = cpu_to_be64(ip->i_ino); + to->di_lsn = cpu_to_be64(lsn); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); @@ -434,13 +431,10 @@ xfs_iread( !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); - ip->i_d.di_magic = XFS_DINODE_MAGIC; ip->i_d.di_gen = prandom_u32(); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_hascrc(&mp->m_sb)) ip->i_d.di_version = 3; - ip->i_d.di_ino = ip->i_ino; - uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid); - } else + else ip->i_d.di_version = 2; return 0; } @@ -484,16 +478,10 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. */ - ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; ip->i_d.di_gen = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - if (dip->di_version == 3) { - ip->i_d.di_ino = be64_to_cpu(dip->di_ino); - uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); - } - /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -514,7 +502,6 @@ xfs_iread( */ if (ip->i_d.di_version == 1) { ip->i_d.di_version = 2; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); ip->i_d.di_nlink = ip->i_d.di_onlink; ip->i_d.di_onlink = 0; xfs_set_projid(ip, 0); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index adcc9bfe02e9..7b8b1b97918b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -22,24 +22,22 @@ struct xfs_inode; struct xfs_dinode; /* - * In memory representation of the XFS inode. This is held in the in-core - * struct xfs_inode to represent the on disk values, but no longer needs to be - * identical to the on-disk structure as it is always translated to on-disk + * In memory representation of the XFS inode. This is held in the in-core struct + * xfs_inode and represents the current on disk values but the structure is not + * in on-disk format. That is, this structure is always translated to on-disk * format specific structures at the appropriate time. 
*/ struct xfs_icdinode { - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ __int8_t di_format; /* format of di_c data */ __uint16_t di_onlink; /* old number of links to file */ + __uint16_t di_flushiter; /* incremented on flush */ __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ __uint32_t di_nlink; /* number of links to file */ __uint16_t di_projid_lo; /* lower part of owner's project id */ __uint16_t di_projid_hi; /* higher part of owner's project id */ - __uint8_t di_pad[6]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -52,22 +50,10 @@ struct xfs_icdinode { __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint32_t di_gen; /* generation number */ - /* di_next_unlinked is the only non-core field in the old dinode */ - xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ - - /* start of the extended dinode, writable fields */ - __uint32_t di_crc; /* CRC of the inode */ __uint64_t di_changecount; /* number of attribute changes */ - xfs_lsn_t di_lsn; /* flush sequence */ __uint64_t di_flags2; /* more random flags */ - __uint8_t di_pad2[16]; /* more padding for future expansion */ - /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ - xfs_ino_t di_ino; /* inode number */ - uuid_t di_uuid; /* UUID of the filesystem */ - - /* structure must be padded to 64 bit alignment */ }; /* @@ -86,7 +72,8 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); -void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to); +void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, + xfs_lsn_t lsn); void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9ad9e355a1c0..45acdee98983 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -809,7 +809,6 @@ xfs_ialloc( ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; @@ -847,13 +846,8 @@ xfs_ialloc( ip->i_d.di_flags = 0; if (ip->i_d.di_version == 3) { - ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); - ip->i_d.di_crc = 0; ip->i_d.di_changecount = 1; - ip->i_d.di_lsn = 0; ip->i_d.di_flags2 = 0; - memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -3464,13 +3458,6 @@ xfs_iflush_int( __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } - if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, - mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { - xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", - __func__, ip->i_ino, ip, ip->i_d.di_magic); - goto corrupt_out; - } if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( 
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && @@ -3529,7 +3516,7 @@ xfs_iflush_int( * copy out the core of the inode, because if the inode is dirty at all * the core must be. */ - xfs_inode_to_disk(ip, dip); + xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) @@ -3581,10 +3568,6 @@ xfs_iflush_int( */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - /* update the lsn in the on disk inode if required */ - if (ip->i_d.di_version == 3) - dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); - /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9dcbf584a336..588d1b4523cb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -325,12 +325,14 @@ xfs_inode_item_format_attr_fork( static void xfs_inode_to_log_dinode( struct xfs_inode *ip, - struct xfs_log_dinode *to) + struct xfs_log_dinode *to, + xfs_lsn_t lsn) { struct xfs_icdinode *from = &ip->i_d; struct inode *inode = VFS_I(ip); - to->di_magic = from->di_magic; + to->di_magic = XFS_DINODE_MAGIC; + to->di_mode = from->di_mode; to->di_version = from->di_version; to->di_format = from->di_format; @@ -340,8 +342,8 @@ xfs_inode_to_log_dinode( to->di_nlink = from->di_nlink; to->di_projid_lo = from->di_projid_lo; to->di_projid_hi = from->di_projid_hi; - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime.t_sec = inode->i_atime.tv_sec; to->di_atime.t_nsec = inode->i_atime.tv_nsec; to->di_mtime.t_sec = inode->i_mtime.tv_sec; @@ -366,10 +368,11 @@ xfs_inode_to_log_dinode( to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; - to->di_ino = from->di_ino; - to->di_lsn = from->di_lsn; - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); + + to->di_ino = ip->i_ino; + to->di_lsn = lsn; + memset(to->di_pad2, 0, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { to->di_flushiter = from->di_flushiter; @@ -390,7 +393,7 @@ xfs_inode_item_format_core( struct xfs_log_dinode *dic; dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); - xfs_inode_to_log_dinode(ip, dic); + xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); } From faeb4e4715be017e88e630bda84477afc1dff38b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 26/66] xfs: move v1 inode conversion to xfs_inode_from_disk So we don't have to carry an di_onlink variable around anymore, move the inode conversion from v1 inode format to v2 inode format into xfs_inode_from_disk(). This means we can remove the di_onlink fields from the struct xfs_icdinode. 
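For illustration only, the conversion being centralised amounts to the
following (names as in xfs_inode_from_disk(), where "from" is the on-disk
dinode and "to" the in-core icdinode):

	if (to->di_version == 1) {
		/* v1: 16-bit link count in di_onlink, no project IDs */
		to->di_nlink = be16_to_cpu(from->di_onlink);
		to->di_projid_lo = 0;
		to->di_projid_hi = 0;
		to->di_version = 2;
	} else {
		to->di_nlink = be32_to_cpu(from->di_nlink);
		to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
		to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
	}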
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 42 ++++++++++++++++------------------ fs/xfs/libxfs/xfs_inode_buf.h | 1 - fs/xfs/libxfs/xfs_log_format.h | 2 +- fs/xfs/xfs_inode.c | 2 -- fs/xfs/xfs_inode_item.c | 2 +- 5 files changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index c0fe062b567a..fe8f4f349b37 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -204,13 +204,25 @@ xfs_inode_from_disk( to->di_mode = be16_to_cpu(from->di_mode); to->di_version = from ->di_version; + + /* + * Convert v1 inodes immediately to v2 inode format as this is the + * minimum inode version format we support in the rest of the code. + */ + if (to->di_version == 1) { + to->di_nlink = be16_to_cpu(from->di_onlink); + to->di_projid_lo = 0; + to->di_projid_hi = 0; + to->di_version = 2; + } else { + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + } + to->di_format = from->di_format; - to->di_onlink = be16_to_cpu(from->di_onlink); to->di_uid = be32_to_cpu(from->di_uid); to->di_gid = be32_to_cpu(from->di_gid); - to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -256,11 +268,11 @@ xfs_inode_to_disk( struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + to->di_onlink = 0; to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); @@ -310,9 +322,9 @@ xfs_log_dinode_to_disk( { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; + to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_onlink = 0; to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); @@ -492,21 +504,7 @@ xfs_iread( ip->i_d.di_mode = 0; } - /* - * Automatically convert version 1 inode formats in memory to version 2 - * inode format. If the inode is modified, it will get logged and - * rewritten as a version 2 inode. We can do this because we set the - * superblock feature bit for v2 inodes unconditionally during mount - * and it means the reast of the code can assume the inode version is 2 - * or higher. 
- */ - if (ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - xfs_set_projid(ip, 0); - } - + ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7b8b1b97918b..73ba1d8ffac2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -31,7 +31,6 @@ struct xfs_icdinode { __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ __uint16_t di_flushiter; /* incremented on flush */ __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index d00ed639e0bc..03f90b99b8c8 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -369,7 +369,7 @@ struct xfs_log_dinode { __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ + __uint8_t di_pad3[2]; /* unused in v2/3 inodes */ __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ __uint32_t di_nlink; /* number of links to file */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 45acdee98983..7d9c514fd231 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -803,9 +803,7 @@ xfs_ialloc( ip->i_d.di_version = 2; ip->i_d.di_mode = mode; - ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; - ASSERT(ip->i_d.di_nlink == nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588d1b4523cb..1e5ecbc45459 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -336,7 +336,6 @@ xfs_inode_to_log_dinode( to->di_mode = from->di_mode; to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = from->di_onlink; to->di_uid = from->di_uid; to->di_gid = from->di_gid; to->di_nlink = from->di_nlink; @@ -344,6 +343,7 @@ xfs_inode_to_log_dinode( to->di_projid_hi = from->di_projid_hi; memset(to->di_pad, 0, sizeof(to->di_pad)); + memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime.t_sec = inode->i_atime.tv_sec; to->di_atime.t_nsec = inode->i_atime.tv_nsec; to->di_mtime.t_sec = inode->i_mtime.tv_sec; From 50997470ef95fd8ae0ef6acab3b2e1d886ae2445 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 27/66] xfs: reinitialise recycled VFS inode correctly We are going to keep certain on-disk information in the VFS inode rather than in a separate XFS specific stucture, so we have to be careful of the VFS code clearing that information when we re-initialise reclaimable cached inodes during lookup. If we don't do this, then we lose critical information from the inode and that results in corruption being detected. 
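To make the mechanism concrete, here is a condensed, illustrative sketch of
what xfs_reinit_inode() looks like once the following patches in this series
have moved nlink, generation, i_version and i_mode into the VFS inode; the
wrapper added by this patch starts out as a bare inode_init_always() call:

	static int
	xfs_reinit_inode(
		struct xfs_mount	*mp,
		struct inode		*inode)
	{
		int		error;
		/* on-disk state mirrored in the VFS inode that
		 * inode_init_always() would otherwise clear */
		uint32_t	nlink = inode->i_nlink;
		uint32_t	generation = inode->i_generation;
		uint64_t	version = inode->i_version;
		umode_t		mode = inode->i_mode;

		error = inode_init_always(mp->m_super, inode);

		/* rewrite the saved values even if reinitialisation failed */
		set_nlink(inode, nlink);
		inode->i_generation = generation;
		inode->i_version = version;
		inode->i_mode = mode;
		return error;
	}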
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7c26f8611891..9ca2865575ab 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -134,6 +134,26 @@ xfs_inode_free( call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } +/* + * When we recycle a reclaimable inode, we need to re-initialise the VFS inode + * part of the structure. This is made more complex by the fact we store + * information about the on-disk values in the VFS inode and so we can't just + * overwrite it's values unconditionally. Hence we save the parameters we + * need to retain across reinitialisation, and rewrite them into the VFS inode + * after resetting it's state even if resetting fails. + */ +static int +xfs_reinit_inode( + struct xfs_mount *mp, + struct inode *inode) +{ + int error; + + error = inode_init_always(mp->m_super, inode); + + return error; +} + /* * Check the validity of the inode we just found it the cache */ @@ -208,7 +228,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - error = inode_init_always(mp->m_super, inode); + error = xfs_reinit_inode(mp, inode); if (error) { /* * Re-initializing the inode failed, and we are in deep From 54d7b5c1d03e9711cce2d72237d5b3f5c87431f4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 28/66] xfs: use vfs inode nlink field everywhere The VFS tracks the inode nlink just like the xfs_icdinode. We can remove the variable from the icdinode and use the VFS inode variable everywhere, reducing the size of the xfs_icdinode by a further 4 bytes. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 6 +-- fs/xfs/libxfs/xfs_inode_buf.h | 1 - fs/xfs/xfs_icache.c | 2 + fs/xfs/xfs_inode.c | 77 ++++++++++++++++------------------- fs/xfs/xfs_inode.h | 2 - fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_iops.c | 3 +- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 9 files changed, 43 insertions(+), 54 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index fe8f4f349b37..cc509c1af75e 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -210,12 +210,12 @@ xfs_inode_from_disk( * minimum inode version format we support in the rest of the code. 
*/ if (to->di_version == 1) { - to->di_nlink = be16_to_cpu(from->di_onlink); + set_nlink(inode, be16_to_cpu(from->di_onlink)); to->di_projid_lo = 0; to->di_projid_hi = 0; to->di_version = 2; } else { - to->di_nlink = be32_to_cpu(from->di_nlink); + set_nlink(inode, be32_to_cpu(from->di_nlink)); to->di_projid_lo = be16_to_cpu(from->di_projid_lo); to->di_projid_hi = be16_to_cpu(from->di_projid_hi); } @@ -275,7 +275,6 @@ xfs_inode_to_disk( to->di_format = from->di_format; to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); @@ -286,6 +285,7 @@ xfs_inode_to_disk( to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 73ba1d8ffac2..320b72344270 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -34,7 +34,6 @@ struct xfs_icdinode { __uint16_t di_flushiter; /* incremented on flush */ __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ __uint16_t di_projid_lo; /* lower part of owner's project id */ __uint16_t di_projid_hi; /* higher part of owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9ca2865575ab..4c184f70d43c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -148,9 +148,11 @@ xfs_reinit_inode( struct inode *inode) { int error; + uint32_t nlink = inode->i_nlink; error = inode_init_always(mp->m_super, inode); + set_nlink(inode, nlink); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7d9c514fd231..18b3bc0c211a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); - -STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); +STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); +STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); +STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); /* * helper function to extract extent size hint from inode @@ -803,7 +803,7 @@ xfs_ialloc( ip->i_d.di_version = 2; ip->i_d.di_mode = mode; - ip->i_d.di_nlink = nlink; + set_nlink(inode, nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); @@ -1086,35 +1086,24 @@ xfs_dir_ialloc( } /* - * Decrement the link count on an inode & log the change. - * If this causes the link count to go to zero, initiate the - * logging activity required to truncate a file. + * Decrement the link count on an inode & log the change. If this causes the + * link count to go to zero, move the inode to AGI unlinked list so that it can + * be freed when the last active reference goes away via xfs_inactive(). 
*/ int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { - int error; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT (ip->i_d.di_nlink > 0); - ip->i_d.di_nlink--; drop_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = 0; - if (ip->i_d.di_nlink == 0) { - /* - * We're dropping the last link to this file. - * Move the on-disk inode to the AGI unlinked list. - * From xfs_inactive() we will pull the inode from - * the list and free it. - */ - error = xfs_iunlink(tp, ip); - } - return error; + if (VFS_I(ip)->i_nlink) + return 0; + + return xfs_iunlink(tp, ip); } /* @@ -1128,8 +1117,6 @@ xfs_bumplink( xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ASSERT(ip->i_d.di_version > 1); - ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); - ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; @@ -1387,7 +1374,6 @@ xfs_create_tmpfile( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) goto out_trans_cancel; @@ -1486,7 +1472,10 @@ xfs_link( xfs_bmap_init(&free_list, &first_block); - if (sip->i_d.di_nlink == 0) { + /* + * Handle initial link state of O_TMPFILE inode + */ + if (VFS_I(sip)->i_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) goto error_return; @@ -1673,7 +1662,7 @@ xfs_release( } } - if (ip->i_d.di_nlink == 0) + if (VFS_I(ip)->i_nlink == 0) return 0; if (xfs_can_free_eofblocks(ip, false)) { @@ -1889,7 +1878,7 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return; - if (ip->i_d.di_nlink != 0) { + if (VFS_I(ip)->i_nlink != 0) { /* * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with @@ -1946,16 +1935,21 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0. - * We place the on-disk inode on a list in the AGI. It - * will be pulled from this list when the inode is freed. + * This is called when the inode's link count goes to 0 or we are creating a + * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be + * set to true as the link count is dropped to zero by the VFS after we've + * created the file successfully, so we have to add it to the unlinked list + * while the link count is non-zero. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. */ -int +STATIC int xfs_iunlink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp; + xfs_mount_t *mp = tp->t_mountp; xfs_agi_t *agi; xfs_dinode_t *dip; xfs_buf_t *agibp; @@ -1965,11 +1959,8 @@ xfs_iunlink( int offset; int error; - ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_mode != 0); - mp = tp->t_mountp; - /* * Get the agi buffer first. It ensures lock ordering * on the list. @@ -2406,7 +2397,7 @@ xfs_ifree( struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_nlink == 0); + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); @@ -2574,8 +2565,8 @@ xfs_remove( * If we're removing a directory perform some additional validation. */ if (is_dir) { - ASSERT(ip->i_d.di_nlink >= 2); - if (ip->i_d.di_nlink != 2) { + ASSERT(VFS_I(ip)->i_nlink >= 2); + if (VFS_I(ip)->i_nlink != 2) { error = -ENOTEMPTY; goto out_trans_cancel; } @@ -3031,7 +3022,7 @@ xfs_rename( * Make sure target dir is empty. 
*/ if (!(xfs_dir_isempty(target_ip)) || - (target_ip->i_d.di_nlink > 2)) { + (VFS_I(target_ip)->i_nlink > 2)) { error = -EEXIST; goto out_trans_cancel; } @@ -3138,7 +3129,7 @@ xfs_rename( * intermediate state on disk. */ if (wip) { - ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + ASSERT(VFS_I(wip)->i_nlink == 0); error = xfs_bumplink(tp, wip); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index aef5452b1a90..e74d13d3076b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); - void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 1e5ecbc45459..193e0bdec340 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -338,7 +338,6 @@ xfs_inode_to_log_dinode( to->di_format = from->di_format; to->di_uid = from->di_uid; to->di_gid = from->di_gid; - to->di_nlink = from->di_nlink; to->di_projid_lo = from->di_projid_lo; to->di_projid_hi = from->di_projid_hi; @@ -350,6 +349,7 @@ xfs_inode_to_log_dinode( to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; to->di_ctime.t_sec = inode->i_ctime.tv_sec; to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_nlink = inode->i_nlink; to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index cd27c6d56dfc..8982e56e07c4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -460,7 +460,7 @@ xfs_vn_getattr( stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; stat->mode = ip->i_d.di_mode; - stat->nlink = ip->i_d.di_nlink; + stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = ip->i_ino; @@ -1216,7 +1216,6 @@ xfs_setup_inode( hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; - set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 2acda42319f5..cfb65273d361 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -85,7 +85,6 @@ xfs_bulkstat_one_int( /* xfs_iget returns the following without needing * further change. */ - buf->bs_nlink = dic->di_nlink; buf->bs_projid_lo = dic->di_projid_lo; buf->bs_projid_hi = dic->di_projid_hi; buf->bs_ino = ino; @@ -94,6 +93,7 @@ xfs_bulkstat_one_int( buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + buf->bs_nlink = inode->i_nlink; buf->bs_atime.tv_sec = inode->i_atime.tv_sec; buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4b79cf006589..611c25cdb15c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4337,7 +4337,7 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; - ASSERT(ip->i_d.di_nlink == 0); + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_d.di_mode != 0); /* setup for the next pass */ From 9e9a2674e43353f650ecd19a54eba028eafff82e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 29/66] xfs: move inode generation count to VFS inode Pull another 4 bytes out of the xfs_icdinode. 
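As an illustrative example, users of the generation count simply switch from
the icdinode copy to the VFS inode field, as the hunks below do:

	/* NFS handle validation, as in xfs_nfs_get_inode() */
	if (VFS_I(ip)->i_generation != generation) {
		IRELE(ip);
		return ERR_PTR(-ESTALE);
	}

	/* on inode free, bump the generation so reincarnations differ */
	VFS_I(ip)->i_generation++;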
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_buf.h | 1 - fs/xfs/xfs_export.c | 2 +- fs/xfs/xfs_icache.c | 2 ++ fs/xfs/xfs_inode.c | 5 +---- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iops.c | 1 - fs/xfs/xfs_itable.c | 2 +- 9 files changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cc509c1af75e..9113a8d0f148 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -237,6 +237,7 @@ xfs_inode_from_disk( inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_generation = be32_to_cpu(from->di_gen); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -248,7 +249,6 @@ xfs_inode_from_disk( to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - to->di_gen = be32_to_cpu(from->di_gen); if (to->di_version == 3) { to->di_changecount = be64_to_cpu(from->di_changecount); @@ -286,6 +286,7 @@ xfs_inode_to_disk( to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); to->di_nlink = cpu_to_be32(inode->i_nlink); + to->di_gen = cpu_to_be32(inode->i_generation); to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); @@ -297,7 +298,6 @@ xfs_inode_to_disk( to->di_dmevmask = cpu_to_be32(from->di_dmevmask); to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); if (from->di_version == 3) { to->di_changecount = cpu_to_be64(from->di_changecount); @@ -443,7 +443,7 @@ xfs_iread( !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); - ip->i_d.di_gen = prandom_u32(); + VFS_I(ip)->i_generation = prandom_u32(); if (xfs_sb_version_hascrc(&mp->m_sb)) ip->i_d.di_version = 3; else @@ -491,7 +491,7 @@ xfs_iread( * that xfs_ialloc won't overwrite or relies on being correct. */ ip->i_d.di_version = dip->di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 320b72344270..29fd9f1cf54b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -46,7 +46,6 @@ struct xfs_icdinode { __uint32_t di_dmevmask; /* DMIG event mask */ __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ - __uint32_t di_gen; /* generation number */ __uint64_t di_changecount; /* number of attribute changes */ __uint64_t di_flags2; /* more random flags */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 652cd3c5b58c..2816d42507bc 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -152,7 +152,7 @@ xfs_nfs_get_inode( return ERR_PTR(error); } - if (ip->i_d.di_gen != generation) { + if (VFS_I(ip)->i_generation != generation) { IRELE(ip); return ERR_PTR(-ESTALE); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4c184f70d43c..6401e3c07d3a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -149,10 +149,12 @@ xfs_reinit_inode( { int error; uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; error = inode_init_always(mp->m_super, inode); set_nlink(inode, nlink); + inode->i_generation = generation; return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 18b3bc0c211a..dfc5b18c8cc2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -835,9 +835,6 @@ xfs_ialloc( inode->i_atime = tv; inode->i_ctime = tv; - /* - * di_gen will have been taken care of in xfs_iread. - */ ip->i_d.di_extsize = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; @@ -2424,7 +2421,7 @@ xfs_ifree( * Bump the generation count so no one will be confused * by reincarnations of this inode. */ - ip->i_d.di_gen++; + VFS_I(ip)->i_generation++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xic.deleted) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 193e0bdec340..6367780b8abb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -350,6 +350,7 @@ xfs_inode_to_log_dinode( to->di_ctime.t_sec = inode->i_ctime.tv_sec; to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; to->di_nlink = inode->i_nlink; + to->di_gen = inode->i_generation; to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; @@ -361,7 +362,6 @@ xfs_inode_to_log_dinode( to->di_dmevmask = from->di_dmevmask; to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; - to->di_gen = from->di_gen; if (from->di_version == 3) { to->di_changecount = from->di_changecount; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 478d04e07f95..cdd6c3156d53 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -114,7 +114,7 @@ xfs_find_handle( handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; - handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_gen = inode->i_generation; handle.ha_fid.fid_ino = ip->i_ino; hsize = XFS_HSIZE(handle); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8982e56e07c4..a4daa3fd3ae9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1231,7 +1231,6 @@ xfs_setup_inode( break; } - inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cfb65273d361..6162e65f10b6 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -100,11 +100,11 @@ xfs_bulkstat_one_int( buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_gen = inode->i_generation; buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; - buf->bs_gen = dic->di_gen; memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; From 
83e06f21b439b7b308eda06332a4feef35739e94 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 30/66] xfs: move di_changecount to VFS inode We can store the di_changecount in the i_version field of the VFS inode and remove another 8 bytes from the xfs_icdinode. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 4 ++-- fs/xfs/libxfs/xfs_inode_buf.h | 1 - fs/xfs/xfs_icache.c | 6 ++++-- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_trans_inode.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 9113a8d0f148..05fbefa97fbf 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -251,7 +251,7 @@ xfs_inode_from_disk( to->di_flags = be16_to_cpu(from->di_flags); if (to->di_version == 3) { - to->di_changecount = be64_to_cpu(from->di_changecount); + inode->i_version = be64_to_cpu(from->di_changecount); to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); @@ -300,7 +300,7 @@ xfs_inode_to_disk( to->di_flags = cpu_to_be16(from->di_flags); if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_changecount = cpu_to_be64(inode->i_version); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 29fd9f1cf54b..c51bd1213c96 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,7 +47,6 @@ struct xfs_icdinode { __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint64_t di_changecount; /* number of attribute changes */ __uint64_t di_flags2; /* more random flags */ xfs_ictimestamp_t di_crtime; /* time created */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6401e3c07d3a..30eafad7cf19 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -138,9 +138,9 @@ xfs_inode_free( * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store * information about the on-disk values in the VFS inode and so we can't just - * overwrite it's values unconditionally. Hence we save the parameters we + * overwrite the values unconditionally. Hence we save the parameters we * need to retain across reinitialisation, and rewrite them into the VFS inode - * after resetting it's state even if resetting fails. + * after reinitialisation even if it fails. 
*/ static int xfs_reinit_inode( @@ -150,11 +150,13 @@ xfs_reinit_inode( int error; uint32_t nlink = inode->i_nlink; uint32_t generation = inode->i_generation; + uint64_t version = inode->i_version; error = inode_init_always(mp->m_super, inode); set_nlink(inode, nlink); inode->i_generation = generation; + inode->i_version = version; return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dfc5b18c8cc2..8a970056a2c4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -841,7 +841,7 @@ xfs_ialloc( ip->i_d.di_flags = 0; if (ip->i_d.di_version == 3) { - ip->i_d.di_changecount = 1; + inode->i_version = 1; ip->i_d.di_flags2 = 0; ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6367780b8abb..3415c63bc843 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -364,7 +364,7 @@ xfs_inode_to_log_dinode( to->di_flags = from->di_flags; if (from->di_version == 3) { - to->di_changecount = from->di_changecount; + to->di_changecount = inode->i_version; to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 3f0d46655ce5..11a3af08b5c7 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -117,7 +117,7 @@ xfs_trans_log_inode( */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - ip->i_d.di_changecount = ++VFS_I(ip)->i_version; + VFS_I(ip)->i_version++; flags |= XFS_ILOG_CORE; } From c19b3b05ae440de50fffe2ac2a9b27392a7448e9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Feb 2016 16:54:58 +1100 Subject: [PATCH 31/66] xfs: mode di_mode to vfs inode Move the di_mode value from the xfs_icdinode to the VFS inode, reducing the xfs_icdinode byte another 2 bytes and collapsing another 2 byte hole in the structure. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 6 ++--- fs/xfs/libxfs/xfs_dir2.c | 12 ++++----- fs/xfs/libxfs/xfs_inode_buf.c | 8 +++--- fs/xfs/libxfs/xfs_inode_buf.h | 1 - fs/xfs/libxfs/xfs_inode_fork.c | 2 +- fs/xfs/xfs_bmap_util.c | 4 +-- fs/xfs/xfs_dir2_readdir.c | 2 +- fs/xfs/xfs_file.c | 6 ++--- fs/xfs/xfs_filestream.c | 4 +-- fs/xfs/xfs_icache.c | 13 ++++++--- fs/xfs/xfs_inode.c | 48 ++++++++++++++++------------------ fs/xfs/xfs_inode.h | 4 +-- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_ioctl.c | 14 +++++----- fs/xfs/xfs_iops.c | 12 +++------ fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- 18 files changed, 71 insertions(+), 73 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f96..6a051662d8f9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -912,7 +912,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. 
*/ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); @@ -1079,7 +1079,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if (S_ISDIR(ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip)->i_mode)) { memset(&dargs, 0, sizeof(dargs)); dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; @@ -1091,7 +1091,7 @@ xfs_bmap_add_attrfork_local( return xfs_dir2_sf_to_block(&dargs); } - if (S_ISLNK(ip->i_d.di_mode)) + if (S_ISLNK(VFS_I(ip)->i_mode)) return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, XFS_DATA_FORK, xfs_symlink_local_to_remote); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 2fb53a5c0a74..af0f9d171f8a 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -176,7 +176,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -231,7 +231,7 @@ xfs_dir_init( struct xfs_da_args *args; int error; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); if (error) return error; @@ -266,7 +266,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (inum) { rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) @@ -364,7 +364,7 @@ xfs_dir_lookup( int v; /* type-checking value */ int lock_mode; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); /* @@ -443,7 +443,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); @@ -505,7 +505,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 05fbefa97fbf..9d9559eb2835 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -202,13 +202,12 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - to->di_mode = be16_to_cpu(from->di_mode); - to->di_version = from ->di_version; /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. 
*/ + to->di_version = from->di_version; if (to->di_version == 1) { set_nlink(inode, be16_to_cpu(from->di_onlink)); to->di_projid_lo = 0; @@ -238,6 +237,7 @@ xfs_inode_from_disk( inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -270,7 +270,6 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from->di_version; to->di_format = from->di_format; to->di_uid = cpu_to_be32(from->di_uid); @@ -287,6 +286,7 @@ xfs_inode_to_disk( to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); + to->di_mode = cpu_to_be16(inode->i_mode); to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); @@ -501,7 +501,7 @@ xfs_iread( * the inode is already free and not try to mess * with the uninitialized part of it. */ - ip->i_d.di_mode = 0; + VFS_I(ip)->i_mode = 0; } ASSERT(ip->i_d.di_version >= 2); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index c51bd1213c96..7c4dd321b215 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,7 +28,6 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ __int8_t di_format; /* format of di_c data */ __uint16_t di_flushiter; /* incremented on flush */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0defbd02f62d..0bf1c747439d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -120,7 +120,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - switch (ip->i_d.di_mode & S_IFMT) { + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 45ec9e40150c..708775613e55 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -821,7 +821,7 @@ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) { /* prealloc/delalloc exists only on regular files */ - if (!S_ISREG(ip->i_d.di_mode)) + if (!S_ISREG(VFS_I(ip)->i_mode)) return false; /* @@ -1726,7 +1726,7 @@ xfs_swap_extents( xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ - if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 642d55d10075..93b3ab0c5435 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -665,7 +665,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52883ac3cf84..f7333fbba5c2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -156,9 +156,9 @@ xfs_update_prealloc_flags( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if (!(flags & XFS_PREALLOC_INVISIBLE)) { - ip->i_d.di_mode &= ~S_ISUID; - if (ip->i_d.di_mode & S_IXGRP) - 
ip->i_d.di_mode &= ~S_ISGID; + VFS_I(ip)->i_mode &= ~S_ISUID; + if (VFS_I(ip)->i_mode & S_IXGRP) + VFS_I(ip)->i_mode &= ~S_ISGID; xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index c4c130f9bfb6..a51353a1f87f 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -151,7 +151,7 @@ xfs_filestream_pick_ag( xfs_agnumber_t ag, max_ag = NULLAGNUMBER; int err, trylock, nscan; - ASSERT(S_ISDIR(ip->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(ip)->i_mode)); /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; @@ -319,7 +319,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t startag, ag = NULLAGNUMBER; struct xfs_mru_cache_elem *mru; - ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT(S_ISREG(VFS_I(ip)->i_mode)); pip = xfs_filestream_get_parent(ip); if (!pip) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 30eafad7cf19..bf2d60749278 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -63,6 +63,9 @@ xfs_inode_alloc( return NULL; } + /* VFS doesn't initialise i_mode! */ + VFS_I(ip)->i_mode = 0; + XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); @@ -98,7 +101,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - switch (ip->i_d.di_mode & S_IFMT) { + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: @@ -151,12 +154,14 @@ xfs_reinit_inode( uint32_t nlink = inode->i_nlink; uint32_t generation = inode->i_generation; uint64_t version = inode->i_version; + umode_t mode = inode->i_mode; error = inode_init_always(mp->m_super, inode); set_nlink(inode, nlink); inode->i_generation = generation; inode->i_version = version; + inode->i_mode = mode; return error; } @@ -211,7 +216,7 @@ xfs_iget_cache_hit( /* * If lookup is racing with unlink return an error immediately. */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { error = -ENOENT; goto out_error; } @@ -321,7 +326,7 @@ xfs_iget_cache_miss( trace_xfs_iget_miss(ip); - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = -ENOENT; goto out_destroy; } @@ -470,7 +475,7 @@ again: * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8a970056a2c4..26bac4ab79a3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -802,7 +802,7 @@ xfs_ialloc( if (ip->i_d.di_version == 1) ip->i_d.di_version = 2; - ip->i_d.di_mode = mode; + inode->i_mode = mode; set_nlink(inode, nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); @@ -810,9 +810,8 @@ xfs_ialloc( if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { - ip->i_d.di_mode |= S_ISGID; - } + if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) + inode->i_mode |= S_ISGID; } /* @@ -821,10 +820,9 @@ xfs_ialloc( * (and only if the irix_sgid_inherit compatibility variable is set). 
*/ if ((irix_sgid_inherit) && - (ip->i_d.di_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { - ip->i_d.di_mode &= ~S_ISGID; - } + (inode->i_mode & S_ISGID) && + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; ip->i_d.di_nextents = 0; @@ -1421,7 +1419,7 @@ xfs_link( trace_xfs_link(tdp, target_name); - ASSERT(!S_ISDIR(sip->i_d.di_mode)); + ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1628,7 +1626,7 @@ xfs_release( xfs_mount_t *mp = ip->i_mount; int error; - if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1863,7 +1861,7 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0) { + if (VFS_I(ip)->i_mode == 0) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return; @@ -1887,7 +1885,7 @@ xfs_inactive( return; } - if (S_ISREG(ip->i_d.di_mode) && + if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; @@ -1896,7 +1894,7 @@ xfs_inactive( if (error) return; - if (S_ISLNK(ip->i_d.di_mode)) + if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) error = xfs_inactive_truncate(ip); @@ -1956,7 +1954,7 @@ xfs_iunlink( int offset; int error; - ASSERT(ip->i_d.di_mode != 0); + ASSERT(VFS_I(ip)->i_mode != 0); /* * Get the agi buffer first. It ensures lock ordering @@ -2397,7 +2395,7 @@ xfs_ifree( ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2411,7 +2409,7 @@ xfs_ifree( if (error) return error; - ip->i_d.di_mode = 0; /* mark incore inode as free */ + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ @@ -2508,7 +2506,7 @@ xfs_remove( { xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; - int is_dir = S_ISDIR(ip->i_d.di_mode); + int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; @@ -2753,7 +2751,7 @@ xfs_cross_rename( if (dp1 != dp2) { dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - if (S_ISDIR(ip2->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, first_block, free_list, spaceres); @@ -2761,7 +2759,7 @@ xfs_cross_rename( goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ - if (!S_ISDIR(ip1->i_d.di_mode)) { + if (!S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_droplink(tp, dp2); if (error) goto out_trans_abort; @@ -2780,7 +2778,7 @@ xfs_cross_rename( ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } - if (S_ISDIR(ip1->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, first_block, free_list, spaceres); @@ -2788,7 +2786,7 @@ xfs_cross_rename( goto out_trans_abort; /* transfer ip1 ".." 
reference to dp2 */ - if (!S_ISDIR(ip2->i_d.di_mode)) { + if (!S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_droplink(tp, dp1); if (error) goto out_trans_abort; @@ -2885,7 +2883,7 @@ xfs_rename( struct xfs_inode *inodes[__XFS_SORT_INODES]; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); - bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; int error; @@ -3014,7 +3012,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if (S_ISDIR(target_ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(target_ip)->i_mode)) { /* * Make sure target dir is empty. */ @@ -3444,7 +3442,7 @@ xfs_iflush_int( __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } - if (S_ISREG(ip->i_d.di_mode)) { + if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -3454,7 +3452,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if (S_ISDIR(ip->i_d.di_mode)) { + } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e74d13d3076b..d6277494e606 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) */ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) { - if (S_ISREG(ip->i_d.di_mode)) + if (S_ISREG(VFS_I(ip)->i_mode)) return i_size_read(VFS_I(ip)); return ip->i_d.di_size; } @@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) */ #define XFS_INHERIT_GID(pip) \ (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ - ((pip)->i_d.di_mode & S_ISGID)) + (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 3415c63bc843..c48b5b18d771 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -333,7 +333,6 @@ xfs_inode_to_log_dinode( to->di_magic = XFS_DINODE_MAGIC; - to->di_mode = from->di_mode; to->di_version = from->di_version; to->di_format = from->di_format; to->di_uid = from->di_uid; @@ -351,6 +350,7 @@ xfs_inode_to_log_dinode( to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; + to->di_mode = inode->i_mode; to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index cdd6c3156d53..81d6d6218803 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -963,7 +963,7 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (S_ISDIR(ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & FS_XFLAG_NOSYMLINKS) @@ -972,7 +972,7 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_EXTSZINHERIT; if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(ip->i_d.di_mode)) { + } else if (S_ISREG(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & FS_XFLAG_EXTSIZE) @@ -1128,14 +1128,14 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && 
!S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && - !S_ISDIR(ip->i_d.di_mode)) + !S_ISDIR(VFS_I(ip)->i_mode)) return -EINVAL; - if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; @@ -1256,9 +1256,9 @@ xfs_ioctl_setattr( * successful return from chown() */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ if (xfs_get_projid(ip) != fa->fsx_projid) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a4daa3fd3ae9..0d38b1d2c420 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -459,7 +459,7 @@ xfs_vn_getattr( stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; - stat->mode = ip->i_d.di_mode; + stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; stat->gid = inode->i_gid; @@ -506,9 +506,6 @@ xfs_setattr_mode( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; } @@ -652,9 +649,9 @@ xfs_setattr_nonsize( * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + if ((inode->i_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + inode->i_mode &= ~(S_ISUID|S_ISGID); /* * Change the ownerships and register quota modifications @@ -764,7 +761,7 @@ xfs_setattr_size( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); - ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -1215,7 +1212,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ hlist_add_fake(&inode->i_hash); - inode->i_mode = ip->i_d.di_mode; inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 6162e65f10b6..ce73eb34620d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -88,7 +88,6 @@ xfs_bulkstat_one_int( buf->bs_projid_lo = dic->di_projid_lo; buf->bs_projid_hi = dic->di_projid_hi; buf->bs_ino = ino; - buf->bs_mode = dic->di_mode; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; @@ -101,6 +100,7 @@ xfs_bulkstat_one_int( buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; buf->bs_gen = inode->i_generation; + buf->bs_mode = inode->i_mode; buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 611c25cdb15c..bd6f23b952a5 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4338,7 +4338,7 @@ xlog_recover_process_one_iunlink( goto fail_iput; ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_mode != 0); + ASSERT(VFS_I(ip)->i_mode != 0); /* setup for the next pass */ agino = be32_to_cpu(dip->di_next_unlinked); diff --git 
a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb753b359bee..d3061054bad2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -865,7 +865,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { + if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); From b0388bf1088d992b45aa8af14d3151ae57848b17 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:11 +1100 Subject: [PATCH 32/66] xfs: remove XBF_DONE flag wrapper macros They only set/clear/check a flag, no need for obfuscating this with a macro. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_buf.h | 4 ---- fs/xfs/xfs_buf_item.c | 6 +++--- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_trans_buf.c | 4 ++-- 8 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 435c7de42e5f..9a2191b91137 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -650,7 +650,7 @@ xfs_buf_read_map( if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); - if (!XFS_BUF_ISDONE(bp)) { + if (!(bp->b_flags & XBF_DONE)) { XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; _xfs_buf_read(bp, flags); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c75721acd867..03b5d3a61b0e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -321,10 +321,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) -#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) -#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) - #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7e986da34f6c..901e4d721958 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -493,7 +493,7 @@ xfs_buf_item_unpin( xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, -EIO); - XFS_BUF_UNDONE(bp); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioend(bp); } @@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_stale(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks( * sure to return the error to the caller of xfs_bwrite(). */ xfs_buf_stale(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceba1a83cacc..a7be69a79321 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3313,7 +3313,7 @@ cluster_corrupt_out: * mark it as stale and brelse. 
*/ if (bp->b_iodone) { - XFS_BUF_UNDONE(bp); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9c9a1c9bcc7f..aa8c9bfaa35d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3979,7 +3979,7 @@ xfs_log_force_umount( log->l_flags & XLOG_ACTIVE_RECOVERY) { mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) - XFS_BUF_DONE(mp->m_sb_bp); + mp->m_sb_bp->b_flags |= XBF_DONE; return 0; } @@ -4009,7 +4009,7 @@ xfs_log_force_umount( spin_lock(&log->l_icloglock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) - XFS_BUF_DONE(mp->m_sb_bp); + mp->m_sb_bp->b_flags |= XBF_DONE; /* * Mark the log and the iclogs with IO error flags to prevent any diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb76f6e..545ff3c03243 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4926,7 +4926,7 @@ xlog_do_recover( * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); - XFS_BUF_UNDONE(bp); + bp->b_flags &= ~XBF_DONE; ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb753b359bee..2c951899344a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1284,7 +1284,7 @@ xfs_getsb( } xfs_buf_hold(bp); - ASSERT(XFS_BUF_ISDONE(bp)); + ASSERT(bp->b_flags & XBF_DONE); return bp; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 75798412859a..ed6f61fd3539 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -155,7 +155,7 @@ xfs_trans_get_buf_map( ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { xfs_buf_stale(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; } ASSERT(bp->b_transp == tp); @@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; From 1157b32c732cbab75320e429559c0ec9f5d382e4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:11 +1100 Subject: [PATCH 33/66] xfs: remove XBF_ASYNC flag wrapper macros They only set/clear/check a flag, no need for obfuscating this with a macro. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 4 ---- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_log.c | 8 +++----- fs/xfs/xfs_log_recover.c | 3 +-- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 03b5d3a61b0e..2a28d1ccbfb9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -321,10 +321,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) -#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) -#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) - #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 901e4d721958..838df068fad4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks( * errors tend to affect the whole device and a failing log write * will make us give up. 
But we really ought to do better here. */ - if (XFS_BUF_ISASYNC(bp)) { + if (bp->b_flags & XBF_ASYNC) { ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index aa8c9bfaa35d..19db3daad8a9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1212,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp) } /* log I/O is always issued ASYNC */ - ASSERT(XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_flags & XBF_ASYNC); xlog_state_done_syncing(iclog, aborted); /* @@ -1865,8 +1865,7 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_SYNCIO; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { bp->b_flags |= XBF_FUA; @@ -1911,8 +1910,7 @@ xlog_sync( (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_SYNCIO; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 545ff3c03243..f00cce9bf830 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4926,10 +4926,9 @@ xlog_do_recover( * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); - bp->b_flags &= ~XBF_DONE; + bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); - XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; error = xfs_buf_submit_wait(bp); From 0cac682ff683bac968c24a4774c69c3a9ff35013 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:11 +1100 Subject: [PATCH 34/66] xfs: remove XBF_READ flag wrapper macros They only set/clear/check a flag, no need for obfuscating this with a macro. 
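For illustration, every wrapper-macro removal in this part of the series is the same mechanical substitution: the macro expanded to a single bitwise operation on b_flags, and the patch writes that operation out directly at the call site. A minimal user-space sketch of the before/after follows; the flag value and the struct are stand-ins, not the kernel's definitions from fs/xfs/xfs_buf.h.

#include <stdio.h>

#define XBF_READ	(1u << 0)	/* illustrative value, not the kernel's */

struct xfs_buf {
	unsigned int	b_flags;
};

/* old style: the operation is hidden behind a macro */
#define XFS_BUF_READ(bp)	((bp)->b_flags |= XBF_READ)
#define XFS_BUF_ISREAD(bp)	((bp)->b_flags & XBF_READ)

int main(void)
{
	struct xfs_buf	buf = { 0 };

	XFS_BUF_READ(&buf);		/* before the patch */
	buf.b_flags |= XBF_READ;	/* after the patch: same operation, stated directly */

	printf("read flag set: %s\n", XFS_BUF_ISREAD(&buf) ? "yes" : "no");
	return 0;
}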
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 4 ---- fs/xfs/xfs_log_recover.c | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2a28d1ccbfb9..329e612460a6 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -321,10 +321,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) -#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) -#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) - #define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index f00cce9bf830..e747bad5b64e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,7 +190,7 @@ xlog_bread_noalign( ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - XFS_BUF_READ(bp); + bp->b_flags |= XBF_READ; bp->b_io_length = nbblks; bp->b_error = 0; @@ -4928,7 +4928,7 @@ xlog_do_recover( bp = xfs_getsb(log->l_mp, 0); bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); ASSERT(!(XFS_BUF_ISWRITE(bp))); - XFS_BUF_READ(bp); + bp->b_flags |= XBF_READ; bp->b_ops = &xfs_sb_buf_ops; error = xfs_buf_submit_wait(bp); From b68c08219a6726fb68dca2d56e024d2e2c1654f5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:11 +1100 Subject: [PATCH 35/66] xfs: remove XBF_WRITE flag wrapper macros They only set/clear/check a flag, no need for obfuscating this with a macro. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 4 ---- fs/xfs/xfs_log.c | 8 +++----- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 329e612460a6..a1189625cca4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -321,10 +321,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) -#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) -#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) - /* * These macros use the IO block map rather than b_bn. b_bn is now really * just for the buffer cache index for cached buffers. As IO does not use b_bn diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 19db3daad8a9..edf20b204235 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1865,7 +1865,7 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO); + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { bp->b_flags |= XBF_FUA; @@ -1892,12 +1892,11 @@ xlog_sync( /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* * Don't call xfs_bwrite here. We do log-syncs even when the filesystem * is shutting down. 
*/ - XFS_BUF_WRITE(bp); - error = xlog_bdstrat(bp); if (error) { xfs_buf_ioerror_alert(bp, "xlog_sync"); @@ -1910,7 +1909,7 @@ xlog_sync( (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO); + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; @@ -1919,7 +1918,6 @@ xlog_sync( /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - XFS_BUF_WRITE(bp); error = xlog_bdstrat(bp); if (error) { xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e747bad5b64e..50a75ecf279e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4927,7 +4927,7 @@ xlog_do_recover( */ bp = xfs_getsb(log->l_mp, 0); bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); - ASSERT(!(XFS_BUF_ISWRITE(bp))); + ASSERT(!(bp->b_flags & XBF_WRITE)); bp->b_flags |= XBF_READ; bp->b_ops = &xfs_sb_buf_ops; From 5cfd28b6abc2bd53f225c82e5083d898b3a158fe Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:11 +1100 Subject: [PATCH 36/66] xfs: remove XBF_STALE flag wrapper macros They only set/clear/check a flag, no need for obfuscating this with a macro. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 4 +--- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_trans_buf.c | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index a1189625cca4..98b7ee97aeae 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -302,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); +extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); @@ -317,9 +318,6 @@ extern void xfs_buf_terminate(void); XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ XBF_WRITE_FAIL)) -void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) -#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) /* * These macros use the IO block map rather than b_bn. b_bn is now really diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 838df068fad4..99e91a0e554e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -431,7 +431,7 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bp->b_flags & XBF_STALE); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index ed6f61fd3539..8ee29ca132dc 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t *tp, */ if (bip->bli_flags & XFS_BLI_STALE) { bip->bli_flags &= ~XFS_BLI_STALE; - ASSERT(XFS_BUF_ISSTALE(bp)); - XFS_BUF_UNSTALE(bp); + ASSERT(bp->b_flags & XBF_STALE); + bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } @@ -600,7 +600,7 @@ xfs_trans_binval( * If the buffer is already invalidated, then * just return. 
*/ - ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bp->b_flags & XBF_STALE); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); From 12877da58429affc988403817b88d901fe01f8c1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 10 Feb 2016 15:01:30 +1100 Subject: [PATCH 37/66] xfs: remove XFS_BUF_ZEROFLAGS macro The places where we use this macro already clear unnecessary IO flags (e.g. through xfs_bwrite()) or never have unexpected IO flags set on them in the first place (e.g. iclog buffers). Remove the macro from these locations, and where necessary clear only the specific flags that are conditional in the current buffer context. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 6 ------ fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_recover.c | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 98b7ee97aeae..4eb89bd4ee73 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -313,12 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ - XBF_WRITE_FAIL)) - - /* * These macros use the IO block map rather than b_bn. b_bn is now really * just for the buffer cache index for cached buffers. As IO does not use b_bn diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index edf20b204235..40b700d3f426 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1864,7 +1864,7 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - XFS_BUF_ZEROFLAGS(bp); + bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { @@ -1908,7 +1908,7 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - XFS_BUF_ZEROFLAGS(bp); + bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 50a75ecf279e..837e8d09aa5a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -275,7 +275,6 @@ xlog_bwrite( ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); bp->b_io_length = nbblks; From 988ef927926aa3481cbf144f235c0cefd7deb9e4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:20:50 +1100 Subject: [PATCH 38/66] xfs: remove nonblocking mode from xfs_vm_writepage Remove the nonblocking optimisation done for mapping lookups during writeback. It's not clear that leaving a hole in the writeback range just because we couldn't get a lock is really a win, as it makes us do another small random IO later on rather than a large sequential IO now. As this gets in the way of sane error handling later on, just remove for the moment and we can re-introduce an equivalent optimisation in future if we see problems due to extent map lock contention. 
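The change is a switch from a trylock-with-fallback pattern to an unconditional blocking lock. Below is a stand-in sketch using a pthread rwlock in place of the XFS inode lock; the function names are hypothetical and the bodies only mark where the extent lookup would sit.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;

/* before: opportunistic lookup that skips the page under contention */
static int map_blocks_nonblocking(void)
{
	if (pthread_rwlock_tryrdlock(&ilock) != 0)
		return -EAGAIN;		/* caller redirties the page for later */
	/* ... extent lookup ... */
	pthread_rwlock_unlock(&ilock);
	return 0;
}

/* after: always wait for the lock so the writeback range stays contiguous */
static int map_blocks(void)
{
	pthread_rwlock_rdlock(&ilock);
	/* ... extent lookup ... */
	pthread_rwlock_unlock(&ilock);
	return 0;
}

int main(void)
{
	printf("nonblocking: %d, blocking: %d\n",
	       map_blocks_nonblocking(), map_blocks());
	return 0;
}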
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..807744b19233 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -283,8 +283,7 @@ xfs_map_blocks( struct inode *inode, loff_t offset, struct xfs_bmbt_irec *imap, - int type, - int nonblocking) + int type) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -300,12 +299,7 @@ xfs_map_blocks( if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (nonblocking) - return -EAGAIN; - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - + xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); @@ -955,7 +949,6 @@ xfs_vm_writepage( ssize_t len; int err, imap_valid = 0, uptodate = 1; int count = 0; - int nonblocking = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -1055,9 +1048,6 @@ xfs_vm_writepage( offset = page_offset(page); type = XFS_IO_OVERWRITE; - if (wbc->sync_mode == WB_SYNC_NONE) - nonblocking = 1; - do { int new_ioend = 0; @@ -1117,8 +1107,7 @@ xfs_vm_writepage( * time. */ new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type, - nonblocking); + err = xfs_map_blocks(inode, offset, &imap, type); if (err) goto error; imap_valid = xfs_imap_valid(inode, &imap, offset); @@ -1188,9 +1177,6 @@ error: if (iohead) xfs_cancel_ioend(iohead); - if (err == -EAGAIN) - goto redirty; - xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); From 150d5be09ce49a9bed6feb7b7dc4e5ae188778ec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:12 +1100 Subject: [PATCH 39/66] xfs: remove xfs_cancel_ioend We currently have code to cancel ioends being built because we change bufferhead state as we build the ioend. On error, this needs to be unwound and so we have cancelling code that walks the buffers on the ioend chain and undoes these state changes. However, the IO submission path already handles state changes for buffers when a submission error occurs, so we don't really need a separate cancel function to do this - we can simply submit the ioend chain with the specific error and it will be cancelled rather than submitted. Hence we can remove the explicit cancel code and just rely on submission to deal with the error correctly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 95 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 807744b19233..245c448a2448 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -521,38 +521,6 @@ xfs_submit_ioend( } while ((ioend = next) != NULL); } -/* - * Cancel submission of all buffer_heads so far in this endio. - * Toss the endio too. Only ever called for the initial page - * in a writepage request, so only ever one page. - */ -STATIC void -xfs_cancel_ioend( - xfs_ioend_t *ioend) -{ - xfs_ioend_t *next; - struct buffer_head *bh, *next_bh; - - do { - next = ioend->io_list; - bh = ioend->io_buffer_head; - do { - next_bh = bh->b_private; - clear_buffer_async_write(bh); - /* - * The unwritten flag is cleared when added to the - * ioend. We're not submitting for I/O so mark the - * buffer unwritten again for next time around. 
- */ - if (ioend->io_type == XFS_IO_UNWRITTEN) - set_buffer_unwritten(bh); - unlock_buffer(bh); - } while ((bh = next_bh) != NULL); - - mempool_free(ioend, xfs_ioend_pool); - } while ((ioend = next) != NULL); -} - /* * Test to see if we've been building up a completion structure for * earlier buffers -- if so, we try to append to this ioend if we @@ -925,6 +893,28 @@ out_invalidate: return; } +static int +xfs_writepage_submit( + struct xfs_ioend *ioend, + struct xfs_ioend *iohead, + struct writeback_control *wbc, + int status) +{ + struct blk_plug plug; + + /* Reserve log space if we might write beyond the on-disk inode size. */ + if (!status && ioend && ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend)) + status = xfs_setfilesize_trans_alloc(ioend); + + if (iohead) { + blk_start_plug(&plug); + xfs_submit_ioend(wbc, iohead, status); + blk_finish_plug(&plug); + } + return status; +} + /* * Write out a dirty page. * @@ -1136,6 +1126,7 @@ xfs_vm_writepage( return 0; ASSERT(iohead); + ASSERT(err == 0); /* * Any errors from this point onwards need tobe reported through the IO @@ -1161,25 +1152,33 @@ xfs_vm_writepage( wbc, end_index); } - - /* - * Reserve log space if we might write beyond the on-disk inode size. - */ - err = 0; - if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) - err = xfs_setfilesize_trans_alloc(ioend); - - xfs_submit_ioend(wbc, iohead, err); - - return 0; + return xfs_writepage_submit(ioend, iohead, wbc, 0); error: - if (iohead) - xfs_cancel_ioend(iohead); + /* + * On error, we have to fail the iohead here because we buffers locked + * in the ioend chain. If we don't do this, we'll deadlock invalidating + * the page as that tries to lock the buffers on the page. Also, because + * we may have set pages under writeback, we have to run IO completion to + * mark the error state of the IO appropriately, so we can't cancel the + * ioend directly here. That means we have to mark this page as under + * writeback if we included any buffers from it in the ioend chain. + */ + if (count) + xfs_start_page_writeback(page, 0, count); + xfs_writepage_submit(ioend, iohead, wbc, err); - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); + /* + * We can only discard the page we had the IO error on if we haven't + * included it in the ioend above. If it has already been errored out, + * the it is unlocked and we can't touch it here. + */ + if (!count) { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + } + mapping_set_error(page->mapping, err); return err; redirty: From fbcc025613590d7b1d15521555dcc6393a148a6b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:19 +1100 Subject: [PATCH 40/66] xfs: Introduce writeback context for writepages xfs_vm_writepages() calls generic_writepages to writeback a range of a file, but then xfs_vm_writepage() clusters pages itself as it does not have any context it can pass between->writepage calls from __write_cache_pages(). Introduce a writeback context for xfs_vm_writepages() and call __write_cache_pages directly with our own writepage callback so that we can pass that context to each writepage invocation. This encapsulates the current mapping, whether it is valid or not, the current ioend and it's IO type and the ioend chain being built. This requires us to move the ioend submission up to the level where the writepage context is declared. 
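The shape of the change is the usual callback-plus-context pattern that write_cache_pages() enables. A minimal user-space analogue follows; every name and type in it is illustrative only, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct writepage_ctx {
	long	last_block;	/* models wpc->last_block */
	bool	map_valid;	/* models wpc->imap_valid */
};

typedef int (*writepage_fn)(int page_index, struct writepage_ctx *ctx);

/* walk a range of pages, handing the same context to every call */
static int write_pages(int first, int last, writepage_fn fn,
		       struct writepage_ctx *ctx)
{
	for (int i = first; i <= last; i++) {
		int err = fn(i, ctx);

		if (err)
			return err;
	}
	return 0;
}

static int do_writepage(int page_index, struct writepage_ctx *ctx)
{
	/* state carried between calls lets adjacent pages share one mapping */
	printf("page %d, last_block %ld, map_valid %d\n",
	       page_index, ctx->last_block, (int)ctx->map_valid);
	ctx->last_block = page_index;	/* purely illustrative bookkeeping */
	ctx->map_valid = true;
	return 0;
}

int main(void)
{
	struct writepage_ctx ctx = { .last_block = -1, .map_valid = false };

	return write_pages(0, 3, do_writepage, &ctx);
}

In the kernel patch the same idea additionally means moving ioend submission up to the level where the writepage context is declared.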
This does mean we do not submit IO until we packaged the entire writeback range, but with the block plugging in the writepages call this is the way IO is submitted, anyway. It also means that we need to handle discontiguous page ranges. If the pages sent down by write_cache_pages to the writepage callback are discontiguous, we need to detect this and put each discontiguous page range into individual ioends. This is needed to ensure that the ioend accurately represents the range of the file that it covers so that file size updates during IO completion set the size correctly. Failure to take into account the discontiguous ranges results in files being too small when writeback patterns are non-sequential. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 218 ++++++++++++++++++++++++---------------------- fs/xfs/xfs_aops.h | 2 + 2 files changed, 116 insertions(+), 104 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 245c448a2448..e0be34125cf0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,6 +36,18 @@ #include #include +/* + * structure owned by writepages passed to individual writepage calls + */ +struct xfs_writepage_ctx { + struct xfs_bmbt_irec imap; + bool imap_valid; + unsigned int io_type; + struct xfs_ioend *iohead; + struct xfs_ioend *ioend; + sector_t last_block; +}; + void xfs_count_page_state( struct page *page, @@ -335,7 +347,7 @@ xfs_map_blocks( return 0; } -STATIC int +STATIC bool xfs_imap_valid( struct inode *inode, struct xfs_bmbt_irec *imap, @@ -532,29 +544,27 @@ xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, xfs_off_t offset, - unsigned int type, - xfs_ioend_t **result, - int need_ioend) + struct xfs_writepage_ctx *wpc) { - xfs_ioend_t *ioend = *result; + if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + bh->b_blocknr != wpc->last_block + 1) { + struct xfs_ioend *new; - if (!ioend || need_ioend || type != ioend->io_type) { - xfs_ioend_t *previous = *result; - - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_buffer_head = bh; - ioend->io_buffer_tail = bh; - if (previous) - previous->io_list = ioend; - *result = ioend; + new = xfs_alloc_ioend(inode, wpc->io_type); + new->io_offset = offset; + new->io_buffer_head = bh; + new->io_buffer_tail = bh; + if (wpc->ioend) + wpc->ioend->io_list = new; + wpc->ioend = new; } else { - ioend->io_buffer_tail->b_private = bh; - ioend->io_buffer_tail = bh; + wpc->ioend->io_buffer_tail->b_private = bh; + wpc->ioend->io_buffer_tail = bh; } bh->b_private = NULL; - ioend->io_size += bh->b_size; + wpc->ioend->io_size += bh->b_size; + wpc->last_block = bh->b_blocknr; } STATIC void @@ -651,17 +661,15 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc) { struct buffer_head *bh, *head; xfs_off_t end_offset; unsigned long p_offset; - unsigned int type; int len, page_dirty; int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); + xfs_off_t offset = page_offset(page); if (page->index != tindex) goto fail; @@ -671,7 +679,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) + if (!xfs_check_page_type(page, wpc->ioend->io_type, false)) goto fail_unlock_page; /* @@ -707,7 +715,7 @@ xfs_convert_page( * writeback. 
Hence for more optimal IO patterns, we should always * avoid partial page writeback due to multiple mappings on a page here. */ - if (!xfs_imap_valid(inode, imap, end_offset)) + if (!xfs_imap_valid(inode, &wpc->imap, end_offset)) goto fail_unlock_page; len = 1 << inode->i_blkbits; @@ -739,23 +747,22 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh) || buffer_mapped(bh)) { if (buffer_unwritten(bh)) - type = XFS_IO_UNWRITTEN; + wpc->io_type = XFS_IO_UNWRITTEN; else if (buffer_delay(bh)) - type = XFS_IO_DELALLOC; + wpc->io_type = XFS_IO_DELALLOC; else - type = XFS_IO_OVERWRITE; + wpc->io_type = XFS_IO_OVERWRITE; /* * imap should always be valid because of the above * partial page end_offset check on the imap. */ - ASSERT(xfs_imap_valid(inode, imap, offset)); + ASSERT(xfs_imap_valid(inode, &wpc->imap, offset)); lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - ioendp, done); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); page_dirty--; count++; @@ -790,8 +797,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc, pgoff_t tlast) { @@ -807,7 +813,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc); + wpc, wbc); if (done) break; } @@ -895,21 +901,20 @@ out_invalidate: static int xfs_writepage_submit( - struct xfs_ioend *ioend, - struct xfs_ioend *iohead, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc, int status) { struct blk_plug plug; /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && ioend && ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) - status = xfs_setfilesize_trans_alloc(ioend); + if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(wpc->ioend)) + status = xfs_setfilesize_trans_alloc(wpc->ioend); - if (iohead) { + if (wpc->iohead) { blk_start_plug(&plug); - xfs_submit_ioend(wbc, iohead, status); + xfs_submit_ioend(wbc, wpc->iohead, status); blk_finish_plug(&plug); } return status; @@ -924,20 +929,19 @@ xfs_writepage_submit( * For any other dirty buffer heads on the page we should flush them. */ STATIC int -xfs_vm_writepage( +xfs_do_writepage( struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + void *data) { + struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; struct buffer_head *bh, *head; - struct xfs_bmbt_irec imap; - xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; ssize_t len; - int err, imap_valid = 0, uptodate = 1; + int err, uptodate = 1; int count = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -1036,11 +1040,8 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = XFS_IO_OVERWRITE; do { - int new_ioend = 0; - if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -1053,24 +1054,24 @@ xfs_vm_writepage( * buffers covering holes here. 
*/ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - imap_valid = 0; + wpc->imap_valid = false; continue; } if (buffer_unwritten(bh)) { - if (type != XFS_IO_UNWRITTEN) { - type = XFS_IO_UNWRITTEN; - imap_valid = 0; + if (wpc->io_type != XFS_IO_UNWRITTEN) { + wpc->io_type = XFS_IO_UNWRITTEN; + wpc->imap_valid = false; } } else if (buffer_delay(bh)) { - if (type != XFS_IO_DELALLOC) { - type = XFS_IO_DELALLOC; - imap_valid = 0; + if (wpc->io_type != XFS_IO_DELALLOC) { + wpc->io_type = XFS_IO_DELALLOC; + wpc->imap_valid = false; } } else if (buffer_uptodate(bh)) { - if (type != XFS_IO_OVERWRITE) { - type = XFS_IO_OVERWRITE; - imap_valid = 0; + if (wpc->io_type != XFS_IO_OVERWRITE) { + wpc->io_type = XFS_IO_OVERWRITE; + wpc->imap_valid = false; } } else { if (PageUptodate(page)) @@ -1081,38 +1082,29 @@ xfs_vm_writepage( * subsequent writeable buffers into a new * ioend. */ - imap_valid = 0; + wpc->imap_valid = 0; continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - if (!imap_valid) { - /* - * If we didn't have a valid mapping then we need to - * put the new mapping into a separate ioend structure. - * This ensures non-contiguous extents always have - * separate ioends, which is particularly important - * for unwritten extent conversion at I/O completion - * time. - */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type); + if (wpc->imap_valid) + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (!wpc->imap_valid) { + err = xfs_map_blocks(inode, offset, &wpc->imap, + wpc->io_type); if (err) goto error; - imap_valid = xfs_imap_valid(inode, &imap, offset); + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); } - if (imap_valid) { + if (wpc->imap_valid) { lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, &ioend, - new_ioend); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); count++; } - if (!iohead) - iohead = ioend; + if (!wpc->iohead) + wpc->iohead = wpc->ioend; } while (offset += len, ((bh = bh->b_this_page) != head)); @@ -1122,10 +1114,10 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); /* if there is no IO to be submitted for this page, we are done */ - if (!ioend) + if (!count) return 0; - ASSERT(iohead); + ASSERT(wpc->iohead); ASSERT(err == 0); /* @@ -1133,10 +1125,10 @@ xfs_vm_writepage( * completion path as we have marked the initial page as under writeback * and unlocked it. */ - if (imap_valid) { + if (wpc->imap_valid) { xfs_off_t end_index; - end_index = imap.br_startoff + imap.br_blockcount; + end_index = wpc->imap.br_startoff + wpc->imap.br_blockcount; /* to bytes */ end_index <<= inode->i_blkbits; @@ -1148,32 +1140,30 @@ xfs_vm_writepage( if (end_index > last_index) end_index = last_index; - xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, end_index); + xfs_cluster_write(inode, page->index + 1, wpc, wbc, end_index); } - - return xfs_writepage_submit(ioend, iohead, wbc, 0); + return 0; error: /* * On error, we have to fail the iohead here because we buffers locked * in the ioend chain. If we don't do this, we'll deadlock invalidating * the page as that tries to lock the buffers on the page. Also, because - * we may have set pages under writeback, we have to run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. 
That means we have to mark this page as under - * writeback if we included any buffers from it in the ioend chain. + * we may have set pages under writeback, we have to make sure we run + * IO completion to mark the error state of the IO appropriately, so we + * can't cancel the ioend directly here. That means we have to mark this + * page as under writeback if we included any buffers from it in the + * ioend chain so that completion treats it correctly. + * + * If we didn't include the page in the ioend, then we can simply + * discard and unlock it as there are no other users of the page or it's + * buffers right now. The caller will still need to trigger submission + * of outstanding ioends on the writepage context so they are treated + * correctly on error. */ if (count) xfs_start_page_writeback(page, 0, count); - xfs_writepage_submit(ioend, iohead, wbc, err); - - /* - * We can only discard the page we had the IO error on if we haven't - * included it in the ioend above. If it has already been errored out, - * the it is unlocked and we can't touch it here. - */ - if (!count) { + else { xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); @@ -1187,13 +1177,33 @@ redirty: return 0; } +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + + ret = xfs_do_writepage(page, wbc, &wpc); + return xfs_writepage_submit(&wpc, wbc, ret); +} + STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); + return xfs_writepage_submit(&wpc, wbc, ret); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9ae5ceb..3c3f1a37a1c7 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { + XFS_IO_INVALID, /* initial state */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ + { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ { XFS_IO_OVERWRITE, "overwrite" } From ad68972acb82c3e8bba316d542ab204984cb1f1c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:31 +1100 Subject: [PATCH 41/66] xfs: xfs_cluster_write is redundant xfs_cluster_write() is not necessary now that xfs_vm_writepages() aggregates writepage calls across a single mapping. This means we no longer need to do page lookups in xfs_cluster_write, so writeback only needs to look up th epage cache once per page being written. This also removes a large amount of mostly duplicate code between xfs_do_writepage() and xfs_convert_page(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 214 ++-------------------------------------------- 1 file changed, 6 insertions(+), 208 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e0be34125cf0..f75c7c99cb63 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -650,179 +650,6 @@ xfs_check_page_type( return false; } -/* - * Allocate & map buffers for page given the extent map. 
Write it out. - * except for the original page of a writepage, this is called on - * delalloc/unwritten pages only, for the original page it is possible - * that the page has no mapping at all. - */ -STATIC int -xfs_convert_page( - struct inode *inode, - struct page *page, - loff_t tindex, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc) -{ - struct buffer_head *bh, *head; - xfs_off_t end_offset; - unsigned long p_offset; - int len, page_dirty; - int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); - - if (page->index != tindex) - goto fail; - if (!trylock_page(page)) - goto fail; - if (PageWriteback(page)) - goto fail_unlock_page; - if (page->mapping != inode->i_mapping) - goto fail_unlock_page; - if (!xfs_check_page_type(page, wpc->ioend->io_type, false)) - goto fail_unlock_page; - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - i_size_read(inode)); - - /* - * If the current map does not span the entire page we are about to try - * to write, then give up. The only way we can write a page that spans - * multiple mappings in a single writeback iteration is via the - * xfs_vm_writepage() function. Data integrity writeback requires the - * entire page to be written in a single attempt, otherwise the part of - * the page we don't write here doesn't get written as part of the data - * integrity sync. - * - * For normal writeback, we also don't attempt to write partial pages - * here as it simply means that write_cache_pages() will see it under - * writeback and ignore the page until some point in the future, at - * which time this will be the only page in the file that needs - * writeback. Hence for more optimal IO patterns, we should always - * avoid partial page writeback due to multiple mappings on a page here. - */ - if (!xfs_imap_valid(inode, &wpc->imap, end_offset)) - goto fail_unlock_page; - - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - /* - * The moment we find a buffer that doesn't match our current type - * specification or can't be written, abort the loop and start - * writeback. As per the above xfs_imap_valid() check, only - * xfs_vm_writepage() can handle partial page writeback fully - we are - * limited here to the buffers that are contiguous with the current - * ioend, and hence a buffer we can't write breaks that contiguity and - * we have to defer the rest of the IO to xfs_vm_writepage(). 
- */ - bh = head = page_buffers(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh))) { - done = 1; - break; - } - - if (buffer_unwritten(bh) || buffer_delay(bh) || - buffer_mapped(bh)) { - if (buffer_unwritten(bh)) - wpc->io_type = XFS_IO_UNWRITTEN; - else if (buffer_delay(bh)) - wpc->io_type = XFS_IO_DELALLOC; - else - wpc->io_type = XFS_IO_OVERWRITE; - - /* - * imap should always be valid because of the above - * partial page end_offset check on the imap. - */ - ASSERT(xfs_imap_valid(inode, &wpc->imap, offset)); - - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc); - - page_dirty--; - count++; - } else { - done = 1; - break; - } - } while (offset += len, (bh = bh->b_this_page) != head); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (count) { - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); - - return done; - fail_unlock_page: - unlock_page(page); - fail: - return 1; -} - -/* - * Convert & write out a cluster of pages in the same extent as defined - * by mp and following the start page. - */ -STATIC void -xfs_cluster_write( - struct inode *inode, - pgoff_t tindex, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - pgoff_t tlast) -{ - struct pagevec pvec; - int done = 0, i; - - pagevec_init(&pvec, 0); - while (!done && tindex <= tlast) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - done = xfs_convert_page(inode, pvec.pages[i], tindex++, - wpc, wbc); - if (done) - break; - } - - pagevec_release(&pvec); - cond_resched(); - } -} - STATIC void xfs_vm_invalidatepage( struct page *page, @@ -939,7 +766,7 @@ xfs_do_writepage( struct buffer_head *bh, *head; loff_t offset; __uint64_t end_offset; - pgoff_t end_index, last_index; + pgoff_t end_index; ssize_t len; int err, uptodate = 1; int count = 0; @@ -969,12 +796,9 @@ xfs_do_writepage( if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; - /* Is this page beyond the end of the file? */ - offset = i_size_read(inode); - end_index = offset >> PAGE_CACHE_SHIFT; - last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - /* + * Is this page beyond the end of the file? + * * The page index is less than the end_index, adjust the end_offset * to the highest offset that this page should represent. * ----------------------------------------------------- @@ -985,6 +809,8 @@ xfs_do_writepage( * | desired writeback range | see else | * ---------------------------------^------------------| */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; if (page->index < end_index) end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; else { @@ -1113,35 +939,7 @@ xfs_do_writepage( xfs_start_page_writeback(page, 1, count); - /* if there is no IO to be submitted for this page, we are done */ - if (!count) - return 0; - - ASSERT(wpc->iohead); - ASSERT(err == 0); - - /* - * Any errors from this point onwards need tobe reported through the IO - * completion path as we have marked the initial page as under writeback - * and unlocked it. 
- */ - if (wpc->imap_valid) { - xfs_off_t end_index; - - end_index = wpc->imap.br_startoff + wpc->imap.br_blockcount; - - /* to bytes */ - end_index <<= inode->i_blkbits; - - /* to pages */ - end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; - - /* check against file size */ - if (end_index > last_index) - end_index = last_index; - - xfs_cluster_write(inode, page->index + 1, wpc, wbc, end_index); - } + ASSERT(wpc->iohead || !count); return 0; error: From bfce7d2e2d5ee05e9d465888905c66a70a9c243c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:37 +1100 Subject: [PATCH 42/66] xfs: factor mapping out of xfs_do_writepage Separate out the bufferhead based mapping from the writepage code so that we have a clear separation of the page operations and the bufferhead state. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 232 ++++++++++++++++++++++++---------------------- 1 file changed, 122 insertions(+), 110 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f75c7c99cb63..6f5c95f94add 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -747,6 +747,127 @@ xfs_writepage_submit( return status; } +static int +xfs_writepage_map( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + struct page *page, + loff_t offset, + __uint64_t end_offset) +{ + struct buffer_head *bh, *head; + ssize_t len = 1 << inode->i_blkbits; + int error = 0; + int uptodate = 1; + int count = 0; + + bh = head = page_buffers(page); + offset = page_offset(page); + + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + wpc->imap_valid = false; + continue; + } + + if (buffer_unwritten(bh)) { + if (wpc->io_type != XFS_IO_UNWRITTEN) { + wpc->io_type = XFS_IO_UNWRITTEN; + wpc->imap_valid = false; + } + } else if (buffer_delay(bh)) { + if (wpc->io_type != XFS_IO_DELALLOC) { + wpc->io_type = XFS_IO_DELALLOC; + wpc->imap_valid = false; + } + } else if (buffer_uptodate(bh)) { + if (wpc->io_type != XFS_IO_OVERWRITE) { + wpc->io_type = XFS_IO_OVERWRITE; + wpc->imap_valid = false; + } + } else { + if (PageUptodate(page)) + ASSERT(buffer_mapped(bh)); + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + wpc->imap_valid = false; + continue; + } + + if (wpc->imap_valid) + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + if (!wpc->imap_valid) { + error = xfs_map_blocks(inode, offset, &wpc->imap, + wpc->io_type); + if (error) + goto out_error; + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + } + if (wpc->imap_valid) { + lock_buffer(bh); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); + count++; + } + + if (!wpc->iohead) + wpc->iohead = wpc->ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + ASSERT(wpc->iohead || !count); + return 0; + +out_error: + /* + * On error, we have to fail the iohead here because we locked buffers + * in the ioend chain. 
If we don't do this, we'll deadlock invalidating + * the page as that tries to lock the buffers on the page. Also, because + * we may have set pages under writeback, we have to make sure we run + * IO completion to mark the error state of the IO appropriately, so we + * can't cancel the ioend directly here. That means we have to mark this + * page as under writeback if we included any buffers from it in the + * ioend chain so that completion treats it correctly. + * + * If we didn't include the page in the ioend, then we can simply + * discard and unlock it as there are no other users of the page or it's + * buffers right now. The caller will still need to trigger submission + * of outstanding ioends on the writepage context so they are treated + * correctly on error. + */ + if (count) + xfs_start_page_writeback(page, 0, count); + else { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + } + mapping_set_error(page->mapping, error); + return error; +} + /* * Write out a dirty page. * @@ -763,13 +884,9 @@ xfs_do_writepage( { struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; - struct buffer_head *bh, *head; loff_t offset; __uint64_t end_offset; pgoff_t end_index; - ssize_t len; - int err, uptodate = 1; - int count = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -862,112 +979,7 @@ xfs_do_writepage( end_offset = offset; } - len = 1 << inode->i_blkbits; - - bh = head = page_buffers(page); - offset = page_offset(page); - - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - - /* - * set_page_dirty dirties all buffers in a page, independent - * of their state. The dirty state however is entirely - * meaningless for holes (!mapped && uptodate), so skip - * buffers covering holes here. - */ - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - wpc->imap_valid = false; - continue; - } - - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { - if (PageUptodate(page)) - ASSERT(buffer_mapped(bh)); - /* - * This buffer is not uptodate and will not be - * written to disk. Ensure that we will put any - * subsequent writeable buffers into a new - * ioend. - */ - wpc->imap_valid = 0; - continue; - } - - if (wpc->imap_valid) - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - if (!wpc->imap_valid) { - err = xfs_map_blocks(inode, offset, &wpc->imap, - wpc->io_type); - if (err) - goto error; - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - } - if (wpc->imap_valid) { - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc); - count++; - } - - if (!wpc->iohead) - wpc->iohead = wpc->ioend; - - } while (offset += len, ((bh = bh->b_this_page) != head)); - - if (uptodate && bh == head) - SetPageUptodate(page); - - xfs_start_page_writeback(page, 1, count); - - ASSERT(wpc->iohead || !count); - return 0; - -error: - /* - * On error, we have to fail the iohead here because we buffers locked - * in the ioend chain. If we don't do this, we'll deadlock invalidating - * the page as that tries to lock the buffers on the page. 
Also, because - * we may have set pages under writeback, we have to make sure we run - * IO completion to mark the error state of the IO appropriately, so we - * can't cancel the ioend directly here. That means we have to mark this - * page as under writeback if we included any buffers from it in the - * ioend chain so that completion treats it correctly. - * - * If we didn't include the page in the ioend, then we can simply - * discard and unlock it as there are no other users of the page or it's - * buffers right now. The caller will still need to trigger submission - * of outstanding ioends on the writepage context so they are treated - * correctly on error. - */ - if (count) - xfs_start_page_writeback(page, 0, count); - else { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - } - mapping_set_error(page->mapping, err); - return err; + return xfs_writepage_map(wpc, inode, page, offset, end_offset); redirty: redirty_page_for_writepage(wbc, page); From e10de3723c53378e7cf441529f563c316fdc0dd3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:23:12 +1100 Subject: [PATCH 43/66] xfs: don't chain ioends during writepage submission Currently we can build a long ioend chain during ->writepages that gets attached to the writepage context. IO submission only then occurs when we finish all the writepage processing. This means we can have many ioends allocated and pending, and this violates the mempool guarantees that we need to give about forwards progress. i.e. we really should only have one ioend being built at a time, otherwise we may drain the mempool trying to allocate a new ioend and that blocks submission, completion and freeing of ioends that are already in progress. To prevent this situation from happening, we need to submit ioends for IO as soon as they are ready for dispatch rather than queuing them for later submission. This means the ioends have bios built immediately and they get queued on any plug that is current active. Hence if we schedule away from writeback, the ioends that have been built will make forwards progress due to the plug flushing on context switch. This will also prevent context switches from creating unnecessary IO submission latency. We can't completely avoid having nested IO allocation - when we have a block size smaller than a page size, we still need to hold the ioend submission until after we have marked the current page dirty. Hence we may need multiple ioends to be held while the current page is completely mapped and made ready for IO dispatch. We cannot avoid this problem - the current code already has this ioend chaining within a page so we can mostly ignore that it occurs. 
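To make the resulting flow concrete, here is a minimal sketch of the caller-side pattern this change produces, simplified from the xfs_vm_writepages() hunk below (wpc initialisation and the iflags handling are elided): only the ioend still being built stays cached on the writepage context, and the caller submits it once the page walk has finished.

	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;

Every other ioend built while mapping a page is queued on a local list and submitted as soon as that page has been processed, with higher-level block plugging providing the batching.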
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 249 +++++++++++++++++++++++----------------------- fs/xfs/xfs_aops.h | 2 +- 2 files changed, 123 insertions(+), 128 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6f5c95f94add..46dc9211bae7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -43,7 +43,6 @@ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; bool imap_valid; unsigned int io_type; - struct xfs_ioend *iohead; struct xfs_ioend *ioend; sector_t last_block; }; @@ -277,7 +276,7 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_error = 0; - ioend->io_list = NULL; + INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = type; ioend->io_inode = inode; ioend->io_buffer_head = NULL; @@ -420,8 +419,7 @@ xfs_start_buffer_writeback( STATIC void xfs_start_page_writeback( struct page *page, - int clear_dirty, - int buffers) + int clear_dirty) { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); @@ -440,10 +438,6 @@ xfs_start_page_writeback( set_page_writeback_keepwrite(page); unlock_page(page); - - /* If no buffers on the page are to be written, finish it here */ - if (!buffers) - end_page_writeback(page); } static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) @@ -452,110 +446,90 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for all of the ioends we have saved up, covering the - * initial writepage page and also any probed pages. - * - * Because we may have multiple ioends spanning a page, we need to start - * writeback on all the buffers before we submit them for I/O. If we mark the - * buffers as we got, then we can end up with a page that only has buffers - * marked async write and I/O complete on can occur before we mark the other - * buffers async write. - * - * The end result of this is that we trip a bug in end_page_writeback() because - * we call it twice for the one page as the code in end_buffer_async_write() - * assumes that all buffers on the page are started at the same time. - * - * The fix is two passes across the ioend list - one to start writeback on the - * buffer_heads, and then submit them for I/O on the second pass. + * Submit all of the bios for an ioend. We are only passed a single ioend at a + * time; the caller is responsible for chaining prior to submission. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback * and unlocked them. In this situation, we need to fail the ioend chain rather * than submit it to IO. This typically only happens on a filesystem shutdown. */ -STATIC void +STATIC int xfs_submit_ioend( struct writeback_control *wbc, xfs_ioend_t *ioend, - int fail) + int status) { - xfs_ioend_t *head = ioend; - xfs_ioend_t *next; struct buffer_head *bh; struct bio *bio; sector_t lastblock = 0; - /* Pass 1 - start writeback */ - do { - next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) - xfs_start_buffer_writeback(bh); - } while ((ioend = next) != NULL); - - /* Pass 2 - submit I/O */ - ioend = head; - do { - next = ioend->io_list; - bio = NULL; - - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. 
- */ - if (fail) { - ioend->io_error = fail; - xfs_finish_ioend(ioend); - continue; - } - - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - - if (!bio) { - retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(wbc, ioend, bio); + /* Reserve log space if we might write beyond the on-disk inode size. */ + if (!status && + ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + status = xfs_setfilesize_trans_alloc(ioend); + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (status) { + ioend->io_error = status; xfs_finish_ioend(ioend); - } while ((ioend = next) != NULL); + return status; + } + + bio = NULL; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { +retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + return 0; } /* * Test to see if we've been building up a completion structure for * earlier buffers -- if so, we try to append to this ioend if we * can, otherwise we finish off any current ioend and start another. - * Return true if we've finished the given ioend. + * Return the ioend we finished off so that the caller can submit it + * once it has finished processing the dirty page. */ STATIC void xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, xfs_off_t offset, - struct xfs_writepage_ctx *wpc) + struct xfs_writepage_ctx *wpc, + struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1) { struct xfs_ioend *new; + if (wpc->ioend) + list_add(&wpc->ioend->io_list, iolist); + new = xfs_alloc_ioend(inode, wpc->io_type); new->io_offset = offset; new->io_buffer_head = bh; new->io_buffer_tail = bh; - if (wpc->ioend) - wpc->ioend->io_list = new; wpc->ioend = new; } else { wpc->ioend->io_buffer_tail->b_private = bh; @@ -565,6 +539,7 @@ xfs_add_to_ioend( bh->b_private = NULL; wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; + xfs_start_buffer_writeback(bh); } STATIC void @@ -726,44 +701,41 @@ out_invalidate: return; } -static int -xfs_writepage_submit( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - int status) -{ - struct blk_plug plug; - - /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(wpc->ioend)) - status = xfs_setfilesize_trans_alloc(wpc->ioend); - - if (wpc->iohead) { - blk_start_plug(&plug); - xfs_submit_ioend(wbc, wpc->iohead, status); - blk_finish_plug(&plug); - } - return status; -} - +/* + * We implement an immediate ioend submission policy here to avoid needing to + * chain multiple ioends and hence nest mempool allocations which can violate + * forward progress guarantees we need to provide. 
The current ioend we are + * adding buffers to is cached on the writepage context, and if the new buffer + * does not append to the cached ioend it will create a new ioend and cache that + * instead. + * + * If a new ioend is created and cached, the old ioend is returned and queued + * locally for submission once the entire page is processed or an error has been + * detected. While ioends are submitted immediately after they are completed, + * batching optimisations are provided by higher level block plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. + */ static int xfs_writepage_map( struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, struct page *page, loff_t offset, __uint64_t end_offset) { + LIST_HEAD(submit_list); + struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; ssize_t len = 1 << inode->i_blkbits; int error = 0; - int uptodate = 1; int count = 0; + int uptodate = 1; bh = head = page_buffers(page); offset = page_offset(page); - do { if (offset >= end_offset) break; @@ -816,7 +788,7 @@ xfs_writepage_map( error = xfs_map_blocks(inode, offset, &wpc->imap, wpc->io_type); if (error) - goto out_error; + goto out; wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); } @@ -824,46 +796,65 @@ xfs_writepage_map( lock_buffer(bh); if (wpc->io_type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc); + xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); count++; } - if (!wpc->iohead) - wpc->iohead = wpc->ioend; - } while (offset += len, ((bh = bh->b_this_page) != head)); if (uptodate && bh == head) SetPageUptodate(page); - xfs_start_page_writeback(page, 1, count); - ASSERT(wpc->iohead || !count); - return 0; + ASSERT(wpc->ioend || list_empty(&submit_list)); -out_error: +out: /* - * On error, we have to fail the iohead here because we locked buffers - * in the ioend chain. If we don't do this, we'll deadlock invalidating - * the page as that tries to lock the buffers on the page. Also, because - * we may have set pages under writeback, we have to make sure we run - * IO completion to mark the error state of the IO appropriately, so we - * can't cancel the ioend directly here. That means we have to mark this - * page as under writeback if we included any buffers from it in the - * ioend chain so that completion treats it correctly. + * On error, we have to fail the ioend here because we have locked + * buffers in the ioend. If we don't do this, we'll deadlock + * invalidating the page as that tries to lock the buffers on the page. + * Also, because we may have set pages under writeback, we have to make + * sure we run IO completion to mark the error state of the IO + * appropriately, so we can't cancel the ioend directly here. That means + * we have to mark this page as under writeback if we included any + * buffers from it in the ioend chain so that completion treats it + * correctly. * - * If we didn't include the page in the ioend, then we can simply - * discard and unlock it as there are no other users of the page or it's - * buffers right now. The caller will still need to trigger submission - * of outstanding ioends on the writepage context so they are treated - * correctly on error. + * If we didn't include the page in the ioend, the on error we can + * simply discard and unlock it as there are no other users of the page + * or it's buffers right now. 
The caller will still need to trigger + * submission of outstanding ioends on the writepage context so they are + * treated correctly on error. */ - if (count) - xfs_start_page_writeback(page, 0, count); - else { + if (count) { + xfs_start_page_writeback(page, !error); + + /* + * Preserve the original error if there was one, otherwise catch + * submission errors here and propagate into subsequent ioend + * submissions. + */ + list_for_each_entry_safe(ioend, next, &submit_list, io_list) { + int error2; + + list_del_init(&ioend->io_list); + error2 = xfs_submit_ioend(wbc, ioend, error); + if (error2 && !error) + error = error2; + } + } else if (error) { xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); + } else { + /* + * We can end up here with no error and nothing to write if we + * race with a partial page truncate on a sub-page block sized + * filesystem. In that case we need to mark the page clean. + */ + xfs_start_page_writeback(page, 1); + end_page_writeback(page); } + mapping_set_error(page->mapping, error); return error; } @@ -979,7 +970,7 @@ xfs_do_writepage( end_offset = offset; } - return xfs_writepage_map(wpc, inode, page, offset, end_offset); + return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -998,7 +989,9 @@ xfs_vm_writepage( int ret; ret = xfs_do_writepage(page, wbc, &wpc); - return xfs_writepage_submit(&wpc, wbc, ret); + if (wpc.ioend) + ret = xfs_submit_ioend(wbc, wpc.ioend, ret); + return ret; } STATIC int @@ -1013,7 +1006,9 @@ xfs_vm_writepages( xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); - return xfs_writepage_submit(&wpc, wbc, ret); + if (wpc.ioend) + ret = xfs_submit_ioend(wbc, wpc.ioend, ret); + return ret; } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 3c3f1a37a1c7..4e01bd5b6426 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -41,7 +41,7 @@ enum { * It can manage several multi-page bio's at once. */ typedef struct xfs_ioend { - struct xfs_ioend *io_list; /* next ioend in chain */ + struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ From 74c66bcb7eda551f3b8588659c58fe29184af903 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Feb 2016 08:36:38 +1100 Subject: [PATCH 44/66] ext4: Fix data exposure after failed AIO DIO When AIO DIO fails e.g. due to IO error, we must not convert unwritten extents as that will expose uninitialized data. Handle this case by clearing unwritten flag from io_end in case of error and thus preventing extent conversion. Signed-off-by: Jan Kara Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/ext4/ext4.h | 30 +++++++++++++++++++++--------- fs/ext4/inode.c | 21 ++++++++------------- fs/ext4/page-io.c | 10 ---------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0662b285dc8a..56c12df107ab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1504,15 +1504,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) -{ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } -} - static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) { return inode->i_private; @@ -3293,6 +3284,27 @@ extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; extern int ext4_resize_begin(struct super_block *sb); extern void ext4_resize_end(struct super_block *sb); +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9db04dd9b88a..2b98171a9432 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3166,9 +3166,6 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { ext4_io_end_t *io_end = iocb->private; - if (size <= 0) - return 0; - /* if not async direct IO just return */ if (!io_end) return 0; @@ -3179,6 +3176,14 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; + /* + * Error during AIO DIO. We cannot convert unwritten extents as the + * data was not written. Just clear the unwritten flag and drop io_end. + */ + if (size <= 0) { + ext4_clear_io_unwritten_flag(io_end); + size = 0; + } io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); @@ -3306,16 +3311,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (io_end) { ext4_inode_aio_set(inode, NULL); ext4_put_io_end(io_end); - /* - * When no IO was submitted ext4_end_io_dio() was not - * called so we have to put iocb's reference. 
- */ - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { - WARN_ON(iocb->private != io_end); - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - ext4_put_io_end(io_end); - iocb->private = NULL; - } } if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b3498638e..f49a87c4fb63 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -139,16 +139,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) kmem_cache_free(io_end_cachep, io_end); } -static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) -{ - struct inode *inode = io_end->inode; - - io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); -} - /* * Check a range of space and convert unwritten extents to written. Note that * we are protected from truncate touching same part of extent tree by the From e8897529053d05f5bd677706ba6807fc2f2b942c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 45/66] xfs: XFS_DIFLAG_DAX is only for regular files or directories Only file data can use DAX, so we should onyl be able to set this flag on regular files. However, the flag also serves as an "inherit" flag at file create time when set on directories, so limit the FS_IOC_FSSETXATTR ioctl to only set this flag on regular files and directories. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 478d04e07f95..ee8f66616cbb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1051,6 +1051,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; + /* + * It is only valid to set the DAX flag on regular files and + * directories. On directories it serves as an inherit hint. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(VFS_I(ip)->i_mode) || S_ISDIR(VFS_I(ip)->i_mode))) + return -EINVAL; + xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); From db10c697b40e34ac91bbcdd866e73a3d0bad7780 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 46/66] xfs: S_DAX is only for regular files Only regular files can use DAX for data operations, so we should restrict setting it on the VFS inode to regular files. Setting it on metadata inodes may cause the VFS to do the wrong thing for such inodes, so avoid potential problems by restricting the scope of the flag to what we know is supported. 
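As a minimal illustration of the policy (hypothetical helper name, not the actual kernel function; the real change below is in xfs_diflags_to_iflags()), the VFS-visible DAX mode is only ever derived for regular files:

	/* Propagate a DAX hint to the VFS inode for regular files only. */
	static void example_set_dax_iflag(struct inode *inode, bool dax_hint)
	{
		if (dax_hint && S_ISREG(inode->i_mode))
			inode->i_flags |= S_DAX;
	}

Directory and metadata inodes keep their normal page cache behaviour even when the on-disk inherit hint or the dax mount option is set.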
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 76b71a1c6c32..5d4f9739d4a4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,9 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + if (S_ISREG(inode->i_mode) && + (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; } From 3a6a854a82fc1cf12c7f2095b80aff32c563b3ab Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 47/66] xfs: dynamically switch modes when XFS_DIFLAG2_DAX is set/cleared When we set or clear the XFS_DIFLAG2_DAX flag, we should also set/clear the S_DAX flag in the VFS inode. To do this, we need to ensure that we first flush and remove any cached entries in the radix tree to ensure the correct data access method is used when we next try to read or write data. We ahve to be especially careful here to lock out page faults so they don't race with the flush and invalidation before we change the access mode. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 109 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ee8f66616cbb..55ec4d44d4ba 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1051,14 +1051,6 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - /* - * It is only valid to set the DAX flag on regular files and - * directories. On directories it serves as an inherit hint. - */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(VFS_I(ip)->i_mode) || S_ISDIR(VFS_I(ip)->i_mode))) - return -EINVAL; - xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); @@ -1067,24 +1059,83 @@ xfs_ioctl_setattr_xflags( return 0; } +/* + * If we are changing DAX flags, we have to ensure the file is clean and any + * cached objects in the address space are invalidated and removed. This + * requires us to lock out other IO and page faults similar to a truncate + * operation. The locks need to be held until the transaction has been committed + * so that the cache invalidation is atomic with respect to the DAX flag + * manipulation. + */ +static int +xfs_ioctl_setattr_dax_invalidate( + struct xfs_inode *ip, + struct fsxattr *fa, + int *join_flags) +{ + struct inode *inode = VFS_I(ip); + int error; + + *join_flags = 0; + + /* + * It is only valid to set the DAX flag on regular files and + * directories. On directories it serves as an inherit hint. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + + /* If the DAX state is not changing, we have nothing to do here. 
*/ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) + return 0; + if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode)) + return 0; + + /* lock, flush and invalidate mapping in preparation for flag change */ + xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (error) + goto out_unlock; + + *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL; + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +} + /* * Set up the transaction structure for the setattr operation, checking that we * have permission to do so. On success, return a clean transaction and the * inode locked exclusively ready for further operation specific checks. On * failure, return an error without modifying or locking the inode. + * + * The inode might already be IO locked on call. If this is the case, it is + * indicated in @join_flags and we take full responsibility for ensuring they + * are unlocked from now on. Hence if we have an error here, we still have to + * unlock them. Otherwise, once they are joined to the transaction, they will + * be unlocked on commit/cancel. */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + int join_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - int error; + int error = -EROFS; if (mp->m_flags & XFS_MOUNT_RDONLY) - return ERR_PTR(-EROFS); + goto out_unlock; + error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - return ERR_PTR(-EIO); + goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); @@ -1092,7 +1143,8 @@ xfs_ioctl_setattr_get_trans( goto out_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); + join_flags = 0; /* * CAP_FOWNER overrides the following restrictions: @@ -1112,6 +1164,9 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); +out_unlock: + if (join_flags) + xfs_iunlock(ip, join_flags); return ERR_PTR(error); } @@ -1210,6 +1265,7 @@ xfs_ioctl_setattr( struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; + int join_flags = 0; trace_xfs_ioctl_setattr(ip); @@ -1233,7 +1289,18 @@ xfs_ioctl_setattr( return code; } - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. + */ + code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags); + if (code) + goto error_free_dquots; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; @@ -1349,6 +1416,7 @@ xfs_ioc_setxflags( struct xfs_trans *tp; struct fsxattr fa; unsigned int flags; + int join_flags = 0; int error; if (copy_from_user(&flags, arg, sizeof(flags))) @@ -1365,7 +1433,18 @@ xfs_ioc_setxflags( if (error) return error; - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. 
+ */ + error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags); + if (error) + goto out_drop_write; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; From 64485437357dfdc9752495b3f496adfc5c816c6f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 48/66] xfs: XFS_DIFLAG2_DAX limited by PAGE_SIZE If the block size of a filesystem is not at least PAGE_SIZEd, then at this point in time DAX cannot be used due to the fact we can't guarantee extents are page sized or aligned without further work. Hence disallow setting the DAX flag on an inode if the block size is too small. Also, be defensive and check the block size when reading an inode in off disk. In future, we want to allow DAX to work on any filesystem, so this is temporary while we sort of the correct conbination of extent size hints and allocation alignment configurations needed to guarantee page sized and aligned extent allocation for DAX enabled files. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 12 ++++++++---- fs/xfs/xfs_iops.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 55ec4d44d4ba..388b5b5b67c9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1080,11 +1080,15 @@ xfs_ioctl_setattr_dax_invalidate( /* * It is only valid to set the DAX flag on regular files and - * directories. On directories it serves as an inherit hint. + * directories on filesystems where the block size is equal to the page + * size. On directories it serves as an inherit hint. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_DAX) { + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + return -EINVAL; + } /* If the DAX state is not changing, we have nothing to do here. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5d4f9739d4a4..f7386dc10a20 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1206,6 +1206,7 @@ xfs_diflags_to_iflags( if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && + ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; From 2e83b79b2d6c78bf1b4aa227938a214dcbddc83f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 2 Mar 2016 09:51:09 +1100 Subject: [PATCH 49/66] xfs: fix two memory leaks in xfs_attr_list.c error paths This plugs 2 trivial leaks in xfs_attr_shortform_list and xfs_attr3_leaf_list_int. 
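The bug class is the usual one for temporary listing buffers. A userspace analogue (hypothetical function, for illustration only) shows the shape of the fix: every return after the allocation, including the early error return, must release the buffer.

	#include <stdlib.h>

	int list_entries(int (*put_entry)(void *ctx, int idx), void *ctx, int n)
	{
		/* stand-in for the temporary sorted attribute list */
		int *scratch = malloc(n * sizeof(*scratch));
		int i, error = 0;

		if (!scratch)
			return -1;

		for (i = 0; i < n; i++) {
			error = put_entry(ctx, i);
			if (error)
				break;	/* fall through to the single free below */
		}

		free(scratch);		/* reached on both success and error paths */
		return error;
	}

In xfs_attr_shortform_list() the fix frees the sorted copy of the attribute list before the early error return; in xfs_attr3_leaf_list_int() the remote value buffer is now freed even when xfs_attr_rmtval_get() fails.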
Signed-off-by: Mateusz Guzik Cc: Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_list.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0ef7c2ed3f8a..4fa14820e2e2 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->namelen, sbp->valuelen, &sbp->name[sbp->namelen]); - if (error) + if (error) { + kmem_free(sbuf); return error; + } if (context->seen_enough) break; cursor->offset++; @@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int( args.rmtblkcnt = xfs_attr3_rmt_blocks( args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); + if (!retval) + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, From 2e74af0e118992e62062a6f0666edcac6d64afdc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 2 Mar 2016 09:55:38 +1100 Subject: [PATCH 50/66] xfs: convert mount option parsing to tokens This should be a no-op change, just switch to token parsing like every other respectable filesystem does. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 393 +++++++++++++++++++++++---------------------- 1 file changed, 198 insertions(+), 195 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 59c9b7bd958d..9e1538d48548 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -65,83 +65,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ -#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ -#define MNTOPT_LOGDEV "logdev" /* log device */ -#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ -#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ -#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ -#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ -#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ -#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ -#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ -#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ -#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ -#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ -#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ -#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ -#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ -#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ -#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ -#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and - * unwritten extent conversion */ -#define MNTOPT_NOBARRIER "nobarrier" /* .. 
disable */ -#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ -#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to - * XFS_MAXINUMBER_32 */ -#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ -#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ -#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ -#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes - * in stat(). */ -#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ -#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ -#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ -#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ -#define MNTOPT_NOQUOTA "noquota" /* no quotas */ -#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ -#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ -#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ -#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ -#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ -#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ -#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ -#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ -#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ -#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ -#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ - -#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ - /* * Table driven mount option parser. - * - * Currently only used for remount, but it will be used for mount - * in the future, too. 
*/ enum { - Opt_barrier, - Opt_nobarrier, - Opt_inode64, - Opt_inode32, - Opt_err + Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, + Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, + Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, + Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier, + Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep, + Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams, + Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, + Opt_uquota, Opt_gquota, Opt_pquota, + Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, }; static const match_table_t tokens = { - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_inode64, "inode64"}, - {Opt_inode32, "inode32"}, - {Opt_err, NULL} + {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */ + {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */ + {Opt_logdev, "logdev=%s"}, /* log device */ + {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */ + {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */ + {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */ + {Opt_noalign, "noalign"}, /* turn off stripe alignment */ + {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */ + {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ + {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ + {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ + {Opt_mtpt, "mtpt"}, /* filesystem mount point */ + {Opt_grpid, "grpid"}, /* group-ID from parent directory */ + {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ + {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ + {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ + {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ + {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ + {Opt_inode32, "inode32"}, /* inode allocation limited to + * XFS_MAXINUMBER_32 */ + {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */ + {Opt_noikeep, "noikeep"}, /* free empty inode clusters */ + {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */ + {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes + * in stat(). 
*/ + {Opt_attr2, "attr2"}, /* do use attr2 attribute format */ + {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */ + {Opt_filestreams,"filestreams"},/* use filestreams allocator */ + {Opt_quota, "quota"}, /* disk quotas (user) */ + {Opt_noquota, "noquota"}, /* no quotas */ + {Opt_usrquota, "usrquota"}, /* user quota enabled */ + {Opt_grpquota, "grpquota"}, /* group quota enabled */ + {Opt_prjquota, "prjquota"}, /* project quota enabled */ + {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */ + {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */ + {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */ + {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */ + {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */ + {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */ + {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */ + {Opt_discard, "discard"}, /* Discard unused blocks */ + {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ + + {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ + {Opt_err, NULL}, }; STATIC int -suffix_kstrtoint(char *s, unsigned int base, int *res) +suffix_kstrtoint(const substring_t *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; - char *value = s; + char *value; + int ret = 0; + + value = match_strdup(s); + if (!value) + return -ENOMEM; last = strlen(value) - 1; if (value[last] == 'K' || value[last] == 'k') { @@ -157,10 +159,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res) value[last] = '\0'; } - if (kstrtoint(s, base, &_res)) - return -EINVAL; + if (kstrtoint(value, base, &_res)) + ret = -EINVAL; + kfree(value); *res = _res << shift_left_factor; - return 0; + return ret; } /* @@ -176,7 +179,8 @@ xfs_parseargs( char *options) { struct super_block *sb = mp->m_super; - char *this_char, *value; + char *p; + substring_t args[MAX_OPT_ARGS]; int dsunit = 0; int dswidth = 0; int iosize = 0; @@ -217,152 +221,152 @@ xfs_parseargs( if (!options) goto done; - while ((this_char = strsep(&options, ",")) != NULL) { - if (!*this_char) - continue; - if ((value = strchr(this_char, '=')) != NULL) - *value++ = 0; + while ((p = strsep(&options, ",")) != NULL) { + int token; - if (!strcmp(this_char, MNTOPT_LOGBUFS)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_logbufs: + if (match_int(args, &mp->m_logbufs)) return -EINVAL; - } - if (kstrtoint(value, 10, &mp->m_logbufs)) + break; + case Opt_logbsize: + if (suffix_kstrtoint(args, 10, &mp->m_logbsize)) return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) - return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + break; + case Opt_logdev: + mp->m_logname = match_strdup(args); if (!mp->m_logname) return -ENOMEM; - } else if (!strcmp(this_char, MNTOPT_MTPT)) { - xfs_warn(mp, "%s option not allowed on this system", - this_char); + break; + case Opt_mtpt: + xfs_warn(mp, "%s option not allowed on this system", p); return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_RTDEV)) { - if (!value || !*value) { - xfs_warn(mp, 
"%s option requires an argument", - this_char); - return -EINVAL; - } - mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + case Opt_rtdev: + mp->m_rtname = match_strdup(args); if (!mp->m_rtname) return -ENOMEM; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || - !strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (suffix_kstrtoint(value, 10, &iosize)) + break; + case Opt_allocsize: + case Opt_biosize: + if (suffix_kstrtoint(args, 10, &iosize)) return -EINVAL; iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_GRPID) || - !strcmp(this_char, MNTOPT_BSDGROUPS)) { + break; + case Opt_grpid: + case Opt_bsdgroups: mp->m_flags |= XFS_MOUNT_GRPID; - } else if (!strcmp(this_char, MNTOPT_NOGRPID) || - !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + break; + case Opt_nogrpid: + case Opt_sysvgroups: mp->m_flags &= ~XFS_MOUNT_GRPID; - } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + break; + case Opt_wsync: mp->m_flags |= XFS_MOUNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + break; + case Opt_norecovery: mp->m_flags |= XFS_MOUNT_NORECOVERY; - } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + break; + case Opt_noalign: mp->m_flags |= XFS_MOUNT_NOALIGN; - } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + break; + case Opt_swalloc: mp->m_flags |= XFS_MOUNT_SWALLOC; - } else if (!strcmp(this_char, MNTOPT_SUNIT)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); + break; + case Opt_sunit: + if (match_int(args, &dsunit)) return -EINVAL; - } - if (kstrtoint(value, 10, &dsunit)) + break; + case Opt_swidth: + if (match_int(args, &dswidth)) return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (kstrtoint(value, 10, &dswidth)) - return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + break; + case Opt_inode32: mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + break; + case Opt_inode64: mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + break; + case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; - } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + break; + case Opt_barrier: mp->m_flags |= XFS_MOUNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + break; + case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + break; + case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; - } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + break; + case Opt_noikeep: mp->m_flags &= ~XFS_MOUNT_IKEEP; - } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + break; + case Opt_largeio: mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + break; + case Opt_nolargeio: mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + break; + case Opt_attr2: mp->m_flags |= XFS_MOUNT_ATTR2; - } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + break; + case Opt_noattr2: mp->m_flags &= ~XFS_MOUNT_ATTR2; mp->m_flags |= XFS_MOUNT_NOATTR2; - } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + break; + case Opt_filestreams: mp->m_flags |= XFS_MOUNT_FILESTREAMS; - } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + break; + case Opt_noquota: mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; 
mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; - } else if (!strcmp(this_char, MNTOPT_QUOTA) || - !strcmp(this_char, MNTOPT_UQUOTA) || - !strcmp(this_char, MNTOPT_USRQUOTA)) { + break; + case Opt_quota: + case Opt_uquota: + case Opt_usrquota: mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | XFS_UQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || - !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + break; + case Opt_qnoenforce: + case Opt_uqnoenforce: mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); mp->m_qflags &= ~XFS_UQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_PQUOTA) || - !strcmp(this_char, MNTOPT_PRJQUOTA)) { + break; + case Opt_pquota: + case Opt_prjquota: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | XFS_PQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + break; + case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_GQUOTA) || - !strcmp(this_char, MNTOPT_GRPQUOTA)) { + case Opt_gquota: + case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | XFS_GQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + break; + case Opt_gqnoenforce: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + break; + case Opt_discard: mp->m_flags |= XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + break; + case Opt_nodiscard: mp->m_flags &= ~XFS_MOUNT_DISCARD; + break; #ifdef CONFIG_FS_DAX - } else if (!strcmp(this_char, MNTOPT_DAX)) { + case Opt_dax: mp->m_flags |= XFS_MOUNT_DAX; + break; #endif - } else { - xfs_warn(mp, "unknown mount option [%s].", this_char); + default: + xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; } } @@ -461,25 +465,25 @@ xfs_showargs( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, - { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, - { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, - { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, - { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, - { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, - { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, - { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, - { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, - { XFS_MOUNT_DAX, "," MNTOPT_DAX }, + { XFS_MOUNT_IKEEP, ",ikeep" }, + { XFS_MOUNT_WSYNC, ",wsync" }, + { XFS_MOUNT_NOALIGN, ",noalign" }, + { XFS_MOUNT_SWALLOC, ",swalloc" }, + { XFS_MOUNT_NOUUID, ",nouuid" }, + { XFS_MOUNT_NORECOVERY, ",norecovery" }, + { XFS_MOUNT_ATTR2, ",attr2" }, + { XFS_MOUNT_FILESTREAMS, ",filestreams" }, + { XFS_MOUNT_GRPID, ",grpid" }, + { XFS_MOUNT_DISCARD, ",discard" }, + { XFS_MOUNT_SMALL_INUMS, ",inode32" }, + { XFS_MOUNT_DAX, ",dax" }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, - { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, - { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" }, + { XFS_MOUNT_BARRIER, ",nobarrier" }, + { XFS_MOUNT_SMALL_INUMS, ",inode64" }, { 0, NULL } }; struct proc_xfs_info *xfs_infop; @@ -494,46 +498,46 @@ xfs_showargs( } if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + seq_printf(m, 
",allocsize=%dk", (int)(1 << mp->m_writeio_log) >> 10); if (mp->m_logbufs > 0) - seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + seq_printf(m, ",logbufs=%d", mp->m_logbufs); if (mp->m_logbsize > 0) - seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); + seq_show_option(m, "logdev", mp->m_logname); if (mp->m_rtname) - seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); + seq_show_option(m, "rtdev", mp->m_rtname); if (mp->m_dalign > 0) - seq_printf(m, "," MNTOPT_SUNIT "=%d", + seq_printf(m, ",sunit=%d", (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); if (mp->m_swidth > 0) - seq_printf(m, "," MNTOPT_SWIDTH "=%d", + seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) - seq_puts(m, "," MNTOPT_USRQUOTA); + seq_puts(m, ",usrquota"); else if (mp->m_qflags & XFS_UQUOTA_ACCT) - seq_puts(m, "," MNTOPT_UQUOTANOENF); + seq_puts(m, ",uqnoenforce"); if (mp->m_qflags & XFS_PQUOTA_ACCT) { if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, "," MNTOPT_PRJQUOTA); + seq_puts(m, ",prjquota"); else - seq_puts(m, "," MNTOPT_PQUOTANOENF); + seq_puts(m, ",pqnoenforce"); } if (mp->m_qflags & XFS_GQUOTA_ACCT) { if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, "," MNTOPT_GRPQUOTA); + seq_puts(m, ",grpquota"); else - seq_puts(m, "," MNTOPT_GQUOTANOENF); + seq_puts(m, ",gqnoenforce"); } if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) - seq_puts(m, "," MNTOPT_NOQUOTA); + seq_puts(m, ",noquota"); return 0; } @@ -1344,9 +1348,8 @@ xfs_finish_flags( */ if (xfs_sb_version_hascrc(&mp->m_sb) && (mp->m_flags & XFS_MOUNT_NOATTR2)) { - xfs_warn(mp, -"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", - MNTOPT_NOATTR2, MNTOPT_ATTR2); + xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " + "attr2 is always enabled for V5 filesystems."); return -EINVAL; } From a08ee40a79653d082911cab9fbeff94baa7714cb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 2 Mar 2016 09:56:31 +1100 Subject: [PATCH 51/66] xfs: sanitize remount options Perform basic sanitization of remount options by passing the option string and a dummy mount structure through xfs_parseargs and returning the result. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9e1538d48548..8dd7d165902d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -172,13 +172,17 @@ suffix_kstrtoint(const substring_t *s, unsigned int base, int *res) * * Note that this function leaks the various device name allocations on * failure. The caller takes care of them. + * + * *sb is const because this is also used to test options on the remount + * path, and we don't want this to have any side effects at remount time. + * Today this function does not change *sb, but just to future-proof... 
*/ STATIC int xfs_parseargs( struct xfs_mount *mp, char *options) { - struct super_block *sb = mp->m_super; + const struct super_block *sb = mp->m_super; char *p; substring_t args[MAX_OPT_ARGS]; int dsunit = 0; @@ -1169,6 +1173,27 @@ xfs_quiesce_attr( xfs_log_quiesce(mp); } +STATIC int +xfs_test_remount_options( + struct super_block *sb, + struct xfs_mount *mp, + char *options) +{ + int error = 0; + struct xfs_mount *tmp_mp; + + tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL); + if (!tmp_mp) + return -ENOMEM; + + tmp_mp->m_super = sb; + error = xfs_parseargs(tmp_mp, options); + xfs_free_fsname(tmp_mp); + kfree(tmp_mp); + + return error; +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1181,6 +1206,11 @@ xfs_fs_remount( char *p; int error; + /* First, check for complete junk; i.e. invalid options */ + error = xfs_test_remount_options(sb, mp, options); + if (error) + return error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; From 5d518bd6cec6f921ee1be504094762496afb45fb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 2 Mar 2016 09:57:04 +1100 Subject: [PATCH 52/66] xfs: fix format specifier , should be %llx and not %llu busyp->bno is printed with a %llu format specifier when the intention is to print a hexadecimal value. Trivial fix to use %llx instead. Found with smatch static analysis: fs/xfs/xfs_discard.c:229 xfs_discard_extents() warn: '0x' prefix is confusing together with '%llu' specifier Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_discard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index e85a9519a5ae..272c3f8b6f7d 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -227,7 +227,7 @@ xfs_discard_extents( GFP_NOFS, 0); if (error && error != -EOPNOTSUPP) { xfs_info(mp, - "discard failed for extent [0x%llu,%u], error %d", + "discard failed for extent [0x%llx,%u], error %d", (unsigned long long)busyp->bno, busyp->length, error); From 12c3f05c7b592ae3bf2219392f1cbf252645cd79 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 2 Mar 2016 09:58:09 +1100 Subject: [PATCH 53/66] xfs: fix up inode32/64 (re)mount handling inode32/inode64 allocator behavior with respect to mount, remount and growfs is a little tricky. The inode32 mount option should only enable the inode32 allocator heuristics if the filesystem is large enough for 64-bit inodes to exist. Today, it has this behavior on the initial mount, but a remount with inode32 unconditionally changes the allocation heuristics, even for a small fs. Also, an inode32 mounted small filesystem should transition to the inode32 allocator if the filesystem is subsequently grown to a sufficient size. Today that does not happen. This patch consolidates xfs_set_inode32 and xfs_set_inode64 into a single new function, and moves the "is the maximum inode number big enough to matter" test into that function, so it doesn't rely on the caller to get it right - which remount did not do, previously. 
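The heart of the consolidated helper can be sketched as follows (hypothetical function name; the real xfs_set_inode_alloc() in the diff below also recomputes the per-AG pagi_inodeok/pagf_metadata state): the inode32 heuristics are enabled only when the user asked for them and the filesystem is large enough that inode numbers above 32 bits are possible.

	static bool example_want_inode32(struct xfs_mount *mp, xfs_ino_t max_ino)
	{
		return (mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
		       max_ino > XFS_MAXINUMBER_32;
	}

Because the size test now lives inside the helper rather than in its callers, a small filesystem mounted or remounted with inode32 keeps the default allocator, and a grown filesystem picks up the inode32 behaviour once it becomes large enough.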
Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 20 +-------- fs/xfs/xfs_mount.h | 5 +-- fs/xfs/xfs_super.c | 104 ++++++++++++++++++++++++--------------------- fs/xfs/xfs_super.h | 4 +- 4 files changed, 61 insertions(+), 72 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb753b359bee..a773ef33f559 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -185,9 +185,6 @@ xfs_initialize_perag( xfs_agnumber_t index; xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; - xfs_agino_t agino; - xfs_ino_t ino; - xfs_sb_t *sbp = &mp->m_sb; int error = -ENOMEM; /* @@ -230,22 +227,7 @@ xfs_initialize_perag( radix_tree_preload_end(); } - /* - * If we mount with the inode64 option, or no inode overflows - * the legacy 32-bit address space clear the inode32 option. - */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); - ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) - mp->m_flags |= XFS_MOUNT_32BITINODES; - else - mp->m_flags &= ~XFS_MOUNT_32BITINODES; - - if (mp->m_flags & XFS_MOUNT_32BITINODES) - index = xfs_set_inode32(mp, agcount); - else - index = xfs_set_inode64(mp, agcount); + index = xfs_set_inode_alloc(mp, agcount); if (maxagi) *maxagi = index; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..af01458def6a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -166,9 +166,8 @@ typedef struct xfs_mount { #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above - * 32 bits in size */ -#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ +#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ #define XFS_MOUNT_BARRIER (1ULL << 17) #define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8dd7d165902d..d85087bc0c40 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -580,23 +580,35 @@ xfs_max_file_offset( } /* - * xfs_set_inode32() and xfs_set_inode64() are passed an agcount - * because in the growfs case, mp->m_sb.sb_agcount is not updated - * yet to the potentially higher ag count. + * Set parameters for inode allocation heuristics, taking into account + * filesystem size and inode32/inode64 mount options; i.e. specifically + * whether or not XFS_MOUNT_SMALL_INUMS is set. + * + * Inode allocation patterns are altered only if inode32 is requested + * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large. + * If altered, XFS_MOUNT_32BITINODES is set as well. + * + * An agcount independent of that in the mount structure is provided + * because in the growfs case, mp->m_sb.sb_agcount is not yet updated + * to the potentially higher ag count. + * + * Returns the maximum AG index which may contain inodes. 
*/ xfs_agnumber_t -xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) +xfs_set_inode_alloc( + struct xfs_mount *mp, + xfs_agnumber_t agcount) { - xfs_agnumber_t index = 0; + xfs_agnumber_t index; xfs_agnumber_t maxagi = 0; xfs_sb_t *sbp = &mp->m_sb; xfs_agnumber_t max_metadata; xfs_agino_t agino; xfs_ino_t ino; - xfs_perag_t *pag; - /* Calculate how much should be reserved for inodes to meet - * the max inode percentage. + /* + * Calculate how much should be reserved for inodes to meet + * the max inode percentage. Used only for inode32. */ if (mp->m_maxicount) { __uint64_t icount; @@ -610,54 +622,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) max_metadata = agcount; } + /* Get the last possible inode in the filesystem */ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - for (index = 0; index < agcount; index++) { - ino = XFS_AGINO_TO_INO(mp, index, agino); - - if (ino > XFS_MAXINUMBER_32) { - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 0; - pag->pagf_metadata = 0; - xfs_perag_put(pag); - continue; - } - - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - maxagi++; - if (index < max_metadata) - pag->pagf_metadata = 1; - xfs_perag_put(pag); - } - mp->m_flags |= (XFS_MOUNT_32BITINODES | - XFS_MOUNT_SMALL_INUMS); - - return maxagi; -} - -xfs_agnumber_t -xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) -{ - xfs_agnumber_t index = 0; + /* + * If user asked for no more than 32-bit inodes, and the fs is + * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter + * the allocator to accommodate the request. + */ + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; for (index = 0; index < agcount; index++) { struct xfs_perag *pag; + ino = XFS_AGINO_TO_INO(mp, index, agino); + pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - pag->pagf_metadata = 0; + + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + if (ino > XFS_MAXINUMBER_32) { + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + } else { + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + else + pag->pagf_metadata = 0; + } + } else { + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + } + xfs_perag_put(pag); } - /* There is no need for lock protection on m_flags, - * the rw_semaphore of the VFS superblock is locked - * during mount/umount/remount operations, so this is - * enough to avoid concurency on the m_flags field - */ - mp->m_flags &= ~(XFS_MOUNT_32BITINODES | - XFS_MOUNT_SMALL_INUMS); - return index; + return (mp->m_flags & XFS_MOUNT_32BITINODES) ? 
maxagi : agcount; } STATIC int @@ -1227,10 +1233,12 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: - mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); break; case Opt_inode32: - mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); break; default: /* diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 499058fea303..2dfb1ce4585f 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); -extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); -extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); +extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, + xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; From a7e5d03ba8882aa772c691f16690fe7e73cee257 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Mar 2016 09:58:21 +1100 Subject: [PATCH 54/66] xfs: remove xfs_trans_get_block_res Just use the t_blk_res field directly instead of obfuscating the reference by a macro. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 6 +++--- fs/xfs/libxfs/xfs_bmap_btree.c | 4 ++-- fs/xfs/xfs_trans.h | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f96..0527e8b9bda3 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5210,7 +5210,7 @@ xfs_bunmapi( * This is better than zeroing it. */ ASSERT(del.br_state == XFS_EXT_NORM); - ASSERT(xfs_trans_get_block_res(tp) > 0); + ASSERT(tp->t_blk_res > 0); /* * If this spans a realtime extent boundary, * chop it back to the start of the one we end at. */ @@ -5241,7 +5241,7 @@ xfs_bunmapi( del.br_startblock += mod; } else if ((del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || - xfs_trans_get_block_res(tp) == 0)) || + tp->t_blk_res == 0)) || !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * Can't make it unwritten. There isn't @@ -5332,7 +5332,7 @@ xfs_bunmapi( * conversion to btree format, since the transaction * will be dirty. */ - if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + if (!wasdel && tp->t_blk_res == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ XFS_IFORK_MAXEXT(ip, whichfork) && diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1637c37bfbaa..e3eb3865b454 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -461,7 +461,7 @@ xfs_bmbt_alloc_block( * reservation amount is insufficient then we may fail a * block allocation here and corrupt the filesystem.
*/ - args.minleft = xfs_trans_get_block_res(args.tp); + args.minleft = args.tp->t_blk_res; } else if (cur->bc_private.b.flist->xbf_low) { args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -470,7 +470,7 @@ xfs_bmbt_alloc_block( args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 4643070d7cae..e7c49cf43fbc 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,7 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) #if defined(DEBUG) || defined(XFS_WARN) From a798011c8fd4286605b990c1722c16903fc14b7a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 Mar 2016 08:39:36 +1100 Subject: [PATCH 55/66] xfs: reinitialise per-AG structures if geometry changes during recovery If a crash occurs immediately after a filesystem grow operation, the updated superblock geometry is found only in the log. After we recover the log, the superblock is reread and re-initialised and so has the new geometry in memory. If the new geometry has more AGs than prior to the grow operation, then the new AGs will not have in-memory xfs_perag structures associated with them. This will result in an oops when the first metadata buffer from a new AG is looked up in the buffer cache, as the block lies within the new geometry but then fails to find a perag structure on lookup. This is easily fixed by simply re-initialising the perag structure after re-reading the superblock at the conclusion of the first phase of log recovery. This, however, does not fix the case of log recovery requiring access to metadata in the newly grown space. Fortunately for us, because the in-core superblock has not been updated, this will result in detection of access beyond the end of the filesystem and so recovery will fail at that point. If this proves to be a problem, then we can address it separately to the current reported issue. Reported-by: Alex Lyakas Tested-by: Alex Lyakas Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 837e8d09aa5a..60c385f43a53 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4891,6 +4891,7 @@ xlog_do_recover( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { + struct xfs_mount *mp = log->l_mp; int error; xfs_buf_t *bp; xfs_sb_t *sbp; @@ -4905,7 +4906,7 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. */ - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) { return -EIO; } @@ -4918,13 +4919,13 @@ xlog_do_recover( * or iunlinks they will have some entries in the AIL; so we look at * the AIL to determine how to set the tail_lsn. */ - xlog_assign_tail_lsn(log->l_mp); + xlog_assign_tail_lsn(mp); /* * Now that we've finished replaying all buffer and inode * updates, re-read in the superblock and reverify it.
*/ - bp = xfs_getsb(log->l_mp, 0); + bp = xfs_getsb(mp, 0); bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); ASSERT(!(bp->b_flags & XBF_WRITE)); bp->b_flags |= XBF_READ; @@ -4932,7 +4933,7 @@ xlog_do_recover( error = xfs_buf_submit_wait(bp); if (error) { - if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); } @@ -4941,14 +4942,17 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &log->l_mp->m_sb; + sbp = &mp->m_sb; xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); - ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); - ASSERT(xfs_sb_good_version(sbp)); - xfs_reinit_percpu_counters(log->l_mp); - xfs_buf_relse(bp); + /* re-initialise in-core superblock and geometry structures */ + xfs_reinit_percpu_counters(mp); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); + return error; + } xlog_recover_check_summary(log); From 49ca9118e6ecca63c78de924801b8b9fe4af44ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Mar 2016 08:39:56 +1100 Subject: [PATCH 56/66] xfs: fix computation of inode btree maxlevels Commit 88740da18[1] introduced a function to compute the maximum height of the inode btree back in 1994. Back then, apparently, the freespace and inode btrees shared the same geometry; however, it has long since been the case that the inode and freespace btrees have different record and key sizes. Therefore, we must use m_inobt_mnr if we want a correct calculation/log reservation/etc. (Yes, this bug has been around for 21 years and ten months.) (Yes, I was in middle school when this bug was committed.) [1] http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=88740da18ddd9d7ba3ebaa9502fefc6ef2fd19cd Historical-research-by: Dave Chinner Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66d702e6b9ff..22297f9b0fd5 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2403,8 +2403,8 @@ xfs_ialloc_compute_maxlevels( maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; + minleafrecs = mp->m_inobt_mnr[0]; + minnoderecs = mp->m_inobt_mnr[1]; maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; for (level = 1; maxblocks > 1; level++) maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; From 5110cd82ca90e6ce04c1c3264587c183aad1f179 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Mar 2016 08:40:03 +1100 Subject: [PATCH 57/66] xfs: use named array initializers for log item dumping Use named array initializers for the string arrays used to dump log items, rather than depending on the order being maintained correctly. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 132 ++++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 64 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 40b700d3f426..b49ccf5c1d75 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2008,77 +2008,81 @@ xlog_print_tic_res( uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); /* match with XLOG_REG_TYPE_* in xfs_log.h */ - static char *res_type_str[XLOG_REG_TYPE_MAX] = { - "bformat", - "bchunk", - "efi_format", - "efd_format", - "iformat", - "icore", - "iext", - "ibroot", - "ilocal", - "iattr_ext", - "iattr_broot", - "iattr_local", - "qformat", - "dquot", - "quotaoff", - "LR header", - "unmount", - "commit", - "trans header" +#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str + static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = { + REG_TYPE_STR(BFORMAT, "bformat"), + REG_TYPE_STR(BCHUNK, "bchunk"), + REG_TYPE_STR(EFI_FORMAT, "efi_format"), + REG_TYPE_STR(EFD_FORMAT, "efd_format"), + REG_TYPE_STR(IFORMAT, "iformat"), + REG_TYPE_STR(ICORE, "icore"), + REG_TYPE_STR(IEXT, "iext"), + REG_TYPE_STR(IBROOT, "ibroot"), + REG_TYPE_STR(ILOCAL, "ilocal"), + REG_TYPE_STR(IATTR_EXT, "iattr_ext"), + REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), + REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), + REG_TYPE_STR(QFORMAT, "qformat"), + REG_TYPE_STR(DQUOT, "dquot"), + REG_TYPE_STR(QUOTAOFF, "quotaoff"), + REG_TYPE_STR(LRHEADER, "LR header"), + REG_TYPE_STR(UNMOUNT, "unmount"), + REG_TYPE_STR(COMMIT, "commit"), + REG_TYPE_STR(TRANSHDR, "trans header"), + REG_TYPE_STR(ICREATE, "inode create") }; +#undef REG_TYPE_STR +#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - "SETATTR_NOT_SIZE", - "SETATTR_SIZE", - "INACTIVE", - "CREATE", - "CREATE_TRUNC", - "TRUNCATE_FILE", - "REMOVE", - "LINK", - "RENAME", - "MKDIR", - "RMDIR", - "SYMLINK", - "SET_DMATTRS", - "GROWFS", - "STRAT_WRITE", - "DIOSTRAT", - "WRITE_SYNC", - "WRITEID", - "ADDAFORK", - "ATTRINVAL", - "ATRUNCATE", - "ATTR_SET", - "ATTR_RM", - "ATTR_FLAG", - "CLEAR_AGI_BUCKET", - "QM_SBCHANGE", - "DUMMY1", - "DUMMY2", - "QM_QUOTAOFF", - "QM_DQALLOC", - "QM_SETQLIM", - "QM_DQCLUSTER", - "QM_QINOCREATE", - "QM_QUOTAOFF_END", - "FSYNC_TS", - "GROWFSRT_ALLOC", - "GROWFSRT_ZERO", - "GROWFSRT_FREE", - "SWAPEXT", - "CHECKPOINT", - "ICREATE", - "CREATE_TMPFILE" + TRANS_TYPE_STR(SETATTR_NOT_SIZE), + TRANS_TYPE_STR(SETATTR_SIZE), + TRANS_TYPE_STR(INACTIVE), + TRANS_TYPE_STR(CREATE), + TRANS_TYPE_STR(CREATE_TRUNC), + TRANS_TYPE_STR(TRUNCATE_FILE), + TRANS_TYPE_STR(REMOVE), + TRANS_TYPE_STR(LINK), + TRANS_TYPE_STR(RENAME), + TRANS_TYPE_STR(MKDIR), + TRANS_TYPE_STR(RMDIR), + TRANS_TYPE_STR(SYMLINK), + TRANS_TYPE_STR(SET_DMATTRS), + TRANS_TYPE_STR(GROWFS), + TRANS_TYPE_STR(STRAT_WRITE), + TRANS_TYPE_STR(DIOSTRAT), + TRANS_TYPE_STR(WRITEID), + TRANS_TYPE_STR(ADDAFORK), + TRANS_TYPE_STR(ATTRINVAL), + TRANS_TYPE_STR(ATRUNCATE), + TRANS_TYPE_STR(ATTR_SET), + TRANS_TYPE_STR(ATTR_RM), + TRANS_TYPE_STR(ATTR_FLAG), + TRANS_TYPE_STR(CLEAR_AGI_BUCKET), + TRANS_TYPE_STR(SB_CHANGE), + TRANS_TYPE_STR(DUMMY1), + TRANS_TYPE_STR(DUMMY2), + TRANS_TYPE_STR(QM_QUOTAOFF), + TRANS_TYPE_STR(QM_DQALLOC), + TRANS_TYPE_STR(QM_SETQLIM), + TRANS_TYPE_STR(QM_DQCLUSTER), + TRANS_TYPE_STR(QM_QINOCREATE), + TRANS_TYPE_STR(QM_QUOTAOFF_END), + TRANS_TYPE_STR(FSYNC_TS), + TRANS_TYPE_STR(GROWFSRT_ALLOC), + TRANS_TYPE_STR(GROWFSRT_ZERO), + TRANS_TYPE_STR(GROWFSRT_FREE), + 
TRANS_TYPE_STR(SWAPEXT), + TRANS_TYPE_STR(CHECKPOINT), + TRANS_TYPE_STR(ICREATE), + TRANS_TYPE_STR(CREATE_TMPFILE) }; +#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); xfs_warn(mp, " trans type = %s (%u)", ((ticket->t_trans_type <= 0 || ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + "bad-trans-type" : trans_type_str[ticket->t_trans_type]), ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); @@ -2097,7 +2101,7 @@ xlog_print_tic_res( uint r_type = ticket->t_res_arr[i].r_type; xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type-1]), + "bad-rtype" : res_type_str[r_type]), ticket->t_res_arr[i].r_len); } From 0df61da8ac5b7acb1db6cee142822ebe4724c739 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Mar 2016 09:32:14 +1100 Subject: [PATCH 58/66] xfs: ioends require logically contiguous file offsets We need to create a new ioend if the current writepage call isn't logically contiguous with the range contained in the previous ioend. Hopefully writepage gets called in order of increasing file offset. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 46dc9211bae7..b125f9019145 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -520,7 +520,8 @@ xfs_add_to_ioend( struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || - bh->b_blocknr != wpc->last_block + 1) { + bh->b_blocknr != wpc->last_block + 1 || + offset != wpc->ioend->io_offset + wpc->ioend->io_size) { struct xfs_ioend *new; if (wpc->ioend) From 30cbc591c34e680e8b5d6d675ea49effe42a0570 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Mar 2016 08:15:14 +1100 Subject: [PATCH 59/66] xfs: check sizes of XFS on-disk structures at compile time Check the sizes of XFS on-disk structures when compiling the kernel. Use this to catch inadvertent changes in structure size due to padding and alignment issues, etc. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_ondisk.h | 117 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_super.c | 3 ++ 2 files changed, 120 insertions(+) create mode 100644 fs/xfs/xfs_ondisk.h diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h new file mode 100644 index 000000000000..184c44effdd5 --- /dev/null +++ b/fs/xfs/xfs_ondisk.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Oracle. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ONDISK_H +#define __XFS_ONDISK_H + +#define XFS_CHECK_STRUCT_SIZE(structname, size) \ + BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ + #structname ") is wrong, expected " #size) + +static inline void __init +xfs_check_ondisk_structs(void) +{ + /* ag/file structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); + XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); + XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + + /* dir/attr trees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + + /* + * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to + * 4 bytes anyway so it's not obviously a problem. Hence for the moment + * we don't check this structure. This can be re-instated when the attr + * definitions are updated to use c99 VLA definitions. 
+ * + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + */ + + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); + XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); + + /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); + XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); + XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); +} + +#endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 59c9b7bd958d..3bd2027a7ea6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_quota.h" #include "xfs_sysfs.h" +#include "xfs_ondisk.h" #include #include @@ -1817,6 +1818,8 @@ init_xfs_fs(void) { int error; + xfs_check_ondisk_structs(); + printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); From a5fd276bdc4fb71b06d100a6abc77ad682f77de4 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 9 Mar 2016 08:17:56 +1100 Subject: [PATCH 60/66] xfs: remove impossible condition bp_release is set to 0 just before the breakpoint of the for loop before the conditional check (in line 458). The other breakpoint is a goto that skips the dead code. Addresses-Coverity-Id: 102338 Signed-off-by: Luis de Bethencourt Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f96..9054c500ce44 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -477,10 +477,7 @@ xfs_bmap_check_leaf_extents( } block = XFS_BUF_TO_BLOCK(bp); } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } + return; error0: From 801cc4e17a34c32e1527827292fac9cec5d3393b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:44 +1100 Subject: [PATCH 61/66] xfs: debug mode forced buffered write failure Add a DEBUG mode-only sysfs knob to enable forced buffered write failure. 
An additional side effect of this mode is brute force killing of delayed allocation blocks in the range of the write. The latter is the prime motivation behind this patch, as userspace test infrastructure requires a reliable mechanism to create and split delalloc extents without causing extent conversion. Certain fallocate operations (i.e., zero range) were used for this in the past, but the implementations have changed such that delalloc extents are flushed and converted to real blocks, rendering the test useless. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 9 +++++- fs/xfs/xfs_mount.h | 25 +++++++++++++++ fs/xfs/xfs_sysfs.c | 78 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 101 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..eed0bfc4443c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1752,6 +1752,7 @@ xfs_vm_write_failed( loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + struct xfs_mount *mp = XFS_I(inode)->i_mount; /* * The request pos offset might be 32 or 64 bit, this is all fine @@ -1786,7 +1787,8 @@ xfs_vm_write_failed( if (!buffer_delay(bh)) continue; - if (!buffer_new(bh) && block_offset < i_size_read(inode)) + if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && + block_offset < i_size_read(inode)) continue; xfs_vm_kill_delalloc_range(inode, block_offset, @@ -1824,6 +1826,7 @@ xfs_vm_write_begin( pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; int status; + struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; ASSERT(len <= PAGE_CACHE_SIZE); @@ -1832,6 +1835,8 @@ xfs_vm_write_begin( return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (xfs_mp_fail_writes(mp)) + status = -EIO; if (unlikely(status)) { struct inode *inode = mapping->host; size_t isize = i_size_read(inode); @@ -1844,6 +1849,8 @@ xfs_vm_write_begin( * allocated in this write, not blocks that were previously * written successfully. */ + if (xfs_mp_fail_writes(mp)) + isize = 0; if (pos + len > isize) { ssize_t start = max_t(ssize_t, pos, isize); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..546cf46a97ce 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,6 +147,17 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ __uint32_t m_generation; + +#ifdef DEBUG + /* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are forced to fail. All delalloc blocks in the range + * of the write (including pre-existing delalloc blocks!) are tossed as + * part of the write failure error handling sequence. + */ + bool m_fail_writes; +#endif } xfs_mount_t; /* @@ -264,6 +275,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } +#ifdef DEBUG +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return mp->m_fail_writes; +} +#else +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return 0; +} +#endif + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection.
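The sysfs wiring for the knob follows. Purely as a usage illustration (the sysfs path is an assumption based on the per-mount kobject being named after the data device, and the mount point and file name are made up), a test program might flip the knob and confirm that buffered writes now fail:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Illustrative paths only: adjust to the device backing the scratch mount. */
#define FAIL_KNOB       "/sys/fs/xfs/sdb1/fail_writes"
#define TEST_FILE       "/mnt/scratch/delalloc_test"

static int write_knob(const char *val)
{
        int fd = open(FAIL_KNOB, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        char buf[4096];
        int fd;

        memset(buf, 0xab, sizeof(buf));

        if (write_knob("1"))            /* force buffered write failure */
                return 1;

        fd = open(TEST_FILE, O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (fd < 0)
                return 1;

        /* Expect the buffered write to fail (typically EIO) while the knob is set. */
        if (write(fd, buf, sizeof(buf)) < 0)
                printf("buffered write failed as expected: %s\n", strerror(errno));

        close(fd);
        return write_knob("0");         /* restore normal behaviour */
}

This is roughly how test infrastructure can provoke write failures, and the associated delalloc punch-out, on demand rather than relying on fallocate side effects.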
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 641d625eb334..6ced4f143494 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -18,10 +18,13 @@ #include "xfs.h" #include "xfs_sysfs.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" +#include "xfs_mount.h" struct xfs_sysfs_attr { struct attribute attr; @@ -45,16 +48,6 @@ to_attr(struct attribute *attr) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr -/* - * xfs_mount kobject. This currently has no attributes and thus no need for show - * and store helpers. The mp kobject serves as the per-mount parent object that - * is identified by the fsname under sysfs. - */ - -struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, -}; - STATIC ssize_t xfs_sysfs_object_show( struct kobject *kobject, @@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; +/* + * xfs_mount kobject. The mp kobject also serves as the per-mount parent object + * that is identified by the fsname under sysfs. + */ + +static inline struct xfs_mount * +to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xfs_mount, m_kobj); +} + +#ifdef DEBUG + +STATIC ssize_t +fail_writes_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val == 1) + mp->m_fail_writes = true; + else if (val == 0) + mp->m_fail_writes = false; + else + return -EINVAL; + + return count; +} + +STATIC ssize_t +fail_writes_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); +} +XFS_SYSFS_ATTR_RW(fail_writes); + +#endif /* DEBUG */ + +static struct attribute *xfs_mp_attrs[] = { +#ifdef DEBUG + ATTR_LIST(fail_writes), +#endif + NULL, +}; + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_mp_attrs, +}; + #ifdef DEBUG /* debug */ From b2706a05bad36c0a826493c6ba84c8a9caf8a3ae Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:46 +1100 Subject: [PATCH 62/66] xfs: update freeblocks counter after extent deletion xfs_bunmapi() currently updates the fdblocks counter, unreserves quota, etc. before the extent is deleted by xfs_bmap_del_extent(). The function has problems dividing up the indirect reserved blocks for scenarios where a single delalloc extent is split in two. Particularly, there aren't always enough blocks reserved for multiple extents in a single extent reservation. The solution to this problem is to allow the extent removal code to steal from the deleted extent to meet indirect reservation requirements. Move the block of code in xfs_bmapi() that updates the fdblocks counter to after the call to xfs_bmap_del_extent() to allow the codepath to update the extent record before the free blocks are accounted. Also, reshuffle the code slightly so the delalloc accounting occurs near the xfs_bmap_del_extent() call to provide context for the comments. 
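As a rough illustration of why the order matters (toy counters and names, not the kernel code): any blocks the removal step keeps back for indirect reservations must not also be credited to the free-space counter.

#include <stdint.h>
#include <stdio.h>

/* Pretend the extent removal keeps one block back for indirect reservation. */
static int64_t remove_extent(int64_t count, int64_t *kept)
{
        *kept = 1;
        return count - *kept;           /* blocks that really become free */
}

int main(void)
{
        int64_t fdblocks_early = 1000, fdblocks_late = 1000;
        int64_t count = 8, kept, freed;

        /* Old order: credit everything up front, then remove the extent. */
        fdblocks_early += count;
        remove_extent(count, &kept);    /* the kept block is now counted twice */

        /* New order: remove the extent first, credit only what was freed. */
        freed = remove_extent(count, &kept);
        fdblocks_late += freed;

        printf("credit-first: fdblocks = %lld (overstates free space by %lld)\n",
               (long long)fdblocks_early, (long long)kept);
        printf("credit-after: fdblocks = %lld\n", (long long)fdblocks_late);
        return 0;
}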
Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f96..b48abc300c36 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5296,31 +5296,7 @@ xfs_bunmapi( goto nodelete; } } - if (wasdel) { - ASSERT(startblockval(del.br_startblock) > 0); - /* Update realtime/data freespace, unreserve quota */ - if (isrt) { - xfs_filblks_t rtexts; - rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, (int64_t)rtexts); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_RTBLKS); - } else { - xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, - false); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_REGBLKS); - } - ip->i_delayed_blks -= del.br_blockcount; - if (cur) - cur->bc_private.b.flags |= - XFS_BTCUR_BPRV_WASDEL; - } else if (cur) - cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; /* * If it's the case where the directory code is running * with no block reservation, and the deleted block is in @@ -5342,11 +5318,45 @@ xfs_bunmapi( error = -ENOSPC; goto error0; } + + /* + * Unreserve quota and update realtime free space, if + * appropriate. If delayed allocation, update the inode delalloc + * counter now and wait to update the sb counters as + * xfs_bmap_del_extent() might need to borrow some blocks. + */ + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, (int64_t)rtexts); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) goto error0; + + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + bno = del.br_startoff - 1; nodelete: /* From a9bd24ac2becf69e896d88bf8b1b7b0f18c2157b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:46 +1100 Subject: [PATCH 63/66] xfs: refactor delalloc indlen reservation split into helper The delayed allocation indirect reservation splitting code is not sufficient in some cases where a delalloc extent is split in two. In preparation for enhancements to this code, refactor the current indlen distribution algorithm into a new helper function. 
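The distribution algorithm itself is a simple skimming loop: start from the worst-case indlen of each new extent and trim one block at a time from each until the combined requirement fits within the original reservation. Below is a standalone userspace rendering of that loop, mirroring the xfs_bmap_split_indlen() helper added in this patch (plain integer types, illustrative only).

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Trim len1/len2 until their sum no longer exceeds the original reservation. */
static void split_indlen(uint64_t ores, uint64_t *indlen1, uint64_t *indlen2)
{
        uint64_t len1 = *indlen1;
        uint64_t len2 = *indlen2;
        uint64_t nres = len1 + len2;    /* new total reservation */

        while (nres > ores) {
                if (len1) {
                        len1--;
                        nres--;
                }
                if (nres == ores)
                        break;
                if (len2) {
                        len2--;
                        nres--;
                }
        }

        *indlen1 = len1;
        *indlen2 = len2;
}

int main(void)
{
        uint64_t ores = 4;              /* original indlen reservation */
        uint64_t len1 = 3, len2 = 3;    /* worst case for the two new extents */

        split_indlen(ores, &len1, &len2);
        printf("split %" PRIu64 " -> %" PRIu64 " + %" PRIu64 "\n", ores, len1, len2);
        return 0;
}

Note that with ores == 1 this fair split leaves one of the extents with no reservation at all; that is exactly the deficiency the following patch addresses by letting the helper steal blocks from the deleted extent.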
[dchinner: rename temp, temp2 variables] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 75 +++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b48abc300c36..6de613b558a9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4720,6 +4720,47 @@ error0: return error; } +/* + * When a delalloc extent is split (e.g., due to a hole punch), the original + * indlen reservation must be shared across the two new extents that are left + * behind. + * + * Given the original reservation and the worst case indlen for the two new + * extents (as calculated by xfs_bmap_worst_indlen()), split the original + * reservation fairly across the two new extents. + */ +static void +xfs_bmap_split_indlen( + xfs_filblks_t ores, /* original res. */ + xfs_filblks_t *indlen1, /* ext1 worst indlen */ + xfs_filblks_t *indlen2) /* ext2 worst indlen */ +{ + xfs_filblks_t len1 = *indlen1; + xfs_filblks_t len2 = *indlen2; + xfs_filblks_t nres = len1 + len2; /* new total res. */ + + /* + * The only blocks available are those reserved for the original extent. + * Therefore, we have to skim blocks off each of the new reservations so + * long as the new total reservation is greater than the original. + */ + while (nres > ores) { + if (len1) { + len1--; + nres--; + } + if (nres == ores) + break; + if (len2) { + len2--; + nres--; + } + } + + *indlen1 = len1; + *indlen2 = len2; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -4985,27 +5026,21 @@ xfs_bmap_del_extent( XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); + + /* + * Distribute the original indlen reservation across the + * two new extents. + */ + temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); + temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); + xfs_bmap_split_indlen(da_old, &temp, &temp2); da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } + + /* + * Set the reservation for each extent. + */ + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + new.br_startblock = nullstartblock((int)temp2); } trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx + 1, 1, &new, state); From d34999c97ae87cd56514b8cbc6269651efe274fe Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:47 +1100 Subject: [PATCH 64/66] xfs: borrow indirect blocks from freed extent when available xfs_bmap_del_extent() handles extent removal from the in-core and on-disk extent lists. When removing a delalloc range, it updates the indirect block reservation appropriately based on the removal. It currently enforces that the new indirect block reservation is less than or equal to the original. This is normally the case in all situations except for in certain cases when the removed range creates a hole in a single delalloc extent, thus splitting a single delalloc extent in two. 
It is possible with small enough extents to split an indlen==1 extent into two such slightly smaller extents. This leaves one extent with 0 indirect blocks and leads to assert failures in other areas (e.g., xfs_bunmapi() if the extent happens to be removed). Update the indlen distribution code to steal blocks from the deleted extent, if necessary, to satisfy the worst case total indirect reservation for the new extents. This is safe as the caller does not update the fdblocks counters until the extent is removed. Blocks stolen in this manner simply remain accounted as allocated, having ownership transferred from the data extent to an indirect reservation. As a precaution, fall back to the original reservation algorithm if the new indlen requirement is not met and warn if we end up with extents without any reservation at all to detect this more easily in the future. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 46 +++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6de613b558a9..2a43d5c7fde5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4727,22 +4727,39 @@ error0: * * Given the original reservation and the worst case indlen for the two new * extents (as calculated by xfs_bmap_worst_indlen()), split the original - * reservation fairly across the two new extents. + * reservation fairly across the two new extents. If necessary, steal available + * blocks from a deleted extent to make up a reservation deficiency (e.g., if + * ores == 1). The number of stolen blocks is returned. The availability and + * subsequent accounting of stolen blocks is the responsibility of the caller. */ -static void +static xfs_filblks_t xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ - xfs_filblks_t *indlen2) /* ext2 worst indlen */ + xfs_filblks_t *indlen2, /* ext2 worst indlen */ + xfs_filblks_t avail) /* stealable blocks */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ + xfs_filblks_t stolen = 0; /* - * The only blocks available are those reserved for the original extent. - * Therefore, we have to skim blocks off each of the new reservations so - * long as the new total reservation is greater than the original. + * Steal as many blocks as we can to try and satisfy the worst case + * indlen for both new extents. + */ + while (nres > ores && avail) { + nres--; + avail--; + stolen++; + } + + /* + * The only blocks available are those reserved for the original + * extent and what we can steal from the extent being removed. + * If this still isn't enough to satisfy the combined + * requirements for the two new extents, skim blocks off of each + * of the new reservations until they match what is available. */ while (nres > ores) { if (len1) { @@ -4759,6 +4776,8 @@ xfs_bmap_split_indlen( *indlen1 = len1; *indlen2 = len2; + + return stolen; } /* @@ -5025,20 +5044,27 @@ xfs_bmap_del_extent( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { + xfs_filblks_t stolen; ASSERT(whichfork == XFS_DATA_FORK); /* * Distribute the original indlen reservation across the - * two new extents. + * two new extents. Steal blocks from the deleted extent + * if necessary. Stealing blocks simply fudges the + * fdblocks accounting in xfs_bunmapi(). 
*/ temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); - xfs_bmap_split_indlen(da_old, &temp, &temp2); - da_new = temp + temp2; + stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2, + del->br_blockcount); + da_new = temp + temp2 - stolen; + del->br_blockcount -= stolen; /* - * Set the reservation for each extent. + * Set the reservation for each extent. Warn if either + * is zero as this can lead to delalloc problems. */ + WARN_ON_ONCE(!temp || !temp2); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); new.br_startblock = nullstartblock((int)temp2); } From cc07eed8336d6452214d13e0cba770a0f5296a7f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 15 Mar 2016 11:42:47 +1100 Subject: [PATCH 65/66] xfs: ensure committed is initialized in xfs_trans_roll __xfs_trans_roll() can return without setting the *committed argument; this was a problem for xfs_bmap_finish(): int committed;/* xact committed or not */ ... error = __xfs_trans_roll(tp, ip, &committed); if (error) { ... if (committed) { and we tested an uninitialized "committed" variable on the error path. No caller is preserving "committed" state across calls to __xfs_trans_roll(), so just initialize committed inside the function to avoid future errors like this. Reported-by: Dan Carpenter Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 748b16aff45a..20c53666cb4b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1028,6 +1028,8 @@ __xfs_trans_roll( struct xfs_trans_res tres; int error; + *committed = 0; + /* * Ensure that the inode is always logged. */ @@ -1082,6 +1084,6 @@ xfs_trans_roll( struct xfs_trans **tpp, struct xfs_inode *dp) { - int committed = 0; + int committed; return __xfs_trans_roll(tpp, dp, &committed); } From 355cced45286ed7e710058174066628ff9ad9fa4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2016 11:44:18 +1100 Subject: [PATCH 66/66] xfs: always set rvalp in xfs_dir2_node_trim_free xfs_dir2_node_trim_free can return without setting the rvalp argument pointer. Initialize it to 0 at the beginning of the function and only update it to 1 if we succeeded trimming a freespace block. Reported-by: Dan Carpenter Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 63ee03db796c..75a557432d0f 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free( dp = args->dp; tp = args->trans; + + *rvalp = 0; + /* * Read the freespace block. */ @@ -2255,7 +2258,6 @@ */ if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); - *rvalp = 0; return 0; } /*
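Both of the final two patches apply the same defensive pattern: a function that reports status through an out-parameter should assign it on every return path, most simply by initializing it at function entry. A minimal illustration of the failure mode, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Buggy shape: *done is only set on the path that actually trims something. */
static int trim_buggy(bool can_trim, int *done)
{
        if (!can_trim)
                return 0;               /* early return leaves *done untouched */
        *done = 1;
        return 0;
}

/* Fixed shape: initialize the out-parameter up front, as the patches above do. */
static int trim_fixed(bool can_trim, int *done)
{
        *done = 0;
        if (!can_trim)
                return 0;
        *done = 1;
        return 0;
}

int main(void)
{
        int done = 42;                  /* stale value standing in for an uninitialized stack slot */

        trim_buggy(false, &done);
        printf("buggy: caller sees done = %d\n", done);         /* still 42 */

        trim_fixed(false, &done);
        printf("fixed: caller sees done = %d\n", done);         /* 0 */
        return 0;
}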