From e8897529053d05f5bd677706ba6807fc2f2b942c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 1/4] xfs: XFS_DIFLAG_DAX is only for regular files or directories Only file data can use DAX, so we should only be able to set this flag on regular files. However, the flag also serves as an "inherit" flag at file create time when set on directories, so limit the FS_IOC_FSSETXATTR ioctl to only set this flag on regular files and directories. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 478d04e07f95..ee8f66616cbb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1051,6 +1051,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; + /* + * It is only valid to set the DAX flag on regular files and + * directories. On directories it serves as an inherit hint. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(VFS_I(ip)->i_mode) || S_ISDIR(VFS_I(ip)->i_mode))) + return -EINVAL; + xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); From db10c697b40e34ac91bbcdd866e73a3d0bad7780 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 2/4] xfs: S_DAX is only for regular files Only regular files can use DAX for data operations, so we should restrict setting it on the VFS inode to regular files. Setting it on metadata inodes may cause the VFS to do the wrong thing for such inodes, so avoid potential problems by restricting the scope of the flag to what we know is supported. 
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 76b71a1c6c32..5d4f9739d4a4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,9 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + if (S_ISREG(inode->i_mode) && + (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; } From 3a6a854a82fc1cf12c7f2095b80aff32c563b3ab Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 3/4] xfs: dynamically switch modes when XFS_DIFLAG2_DAX is set/cleared When we set or clear the XFS_DIFLAG2_DAX flag, we should also set/clear the S_DAX flag in the VFS inode. To do this, we need to ensure that we first flush and remove any cached entries in the radix tree to ensure the correct data access method is used when we next try to read or write data. We have to be especially careful here to lock out page faults so they don't race with the flush and invalidation before we change the access mode. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 109 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ee8f66616cbb..55ec4d44d4ba 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1051,14 +1051,6 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - /* - * It is only valid to set the DAX flag on regular files and - * directories. On directories it serves as an inherit hint. 
- */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(VFS_I(ip)->i_mode) || S_ISDIR(VFS_I(ip)->i_mode))) - return -EINVAL; - xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); @@ -1067,24 +1059,83 @@ xfs_ioctl_setattr_xflags( return 0; } +/* + * If we are changing DAX flags, we have to ensure the file is clean and any + * cached objects in the address space are invalidated and removed. This + * requires us to lock out other IO and page faults similar to a truncate + * operation. The locks need to be held until the transaction has been committed + * so that the cache invalidation is atomic with respect to the DAX flag + * manipulation. + */ +static int +xfs_ioctl_setattr_dax_invalidate( + struct xfs_inode *ip, + struct fsxattr *fa, + int *join_flags) +{ + struct inode *inode = VFS_I(ip); + int error; + + *join_flags = 0; + + /* + * It is only valid to set the DAX flag on regular files and + * directories. On directories it serves as an inherit hint. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + + /* If the DAX state is not changing, we have nothing to do here. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) + return 0; + if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode)) + return 0; + + /* lock, flush and invalidate mapping in preparation for flag change */ + xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (error) + goto out_unlock; + + *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL; + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +} + /* * Set up the transaction structure for the setattr operation, checking that we * have permission to do so. 
On success, return a clean transaction and the * inode locked exclusively ready for further operation specific checks. On * failure, return an error without modifying or locking the inode. + * + * The inode might already be IO locked on call. If this is the case, it is + * indicated in @join_flags and we take full responsibility for ensuring they + * are unlocked from now on. Hence if we have an error here, we still have to + * unlock them. Otherwise, once they are joined to the transaction, they will + * be unlocked on commit/cancel. */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + int join_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - int error; + int error = -EROFS; if (mp->m_flags & XFS_MOUNT_RDONLY) - return ERR_PTR(-EROFS); + goto out_unlock; + error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - return ERR_PTR(-EIO); + goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); @@ -1092,7 +1143,8 @@ xfs_ioctl_setattr_get_trans( goto out_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); + join_flags = 0; /* * CAP_FOWNER overrides the following restrictions: @@ -1112,6 +1164,9 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); +out_unlock: + if (join_flags) + xfs_iunlock(ip, join_flags); return ERR_PTR(error); } @@ -1210,6 +1265,7 @@ xfs_ioctl_setattr( struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; + int join_flags = 0; trace_xfs_ioctl_setattr(ip); @@ -1233,7 +1289,18 @@ xfs_ioctl_setattr( return code; } - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. 
These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. + */ + code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags); + if (code) + goto error_free_dquots; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; @@ -1349,6 +1416,7 @@ xfs_ioc_setxflags( struct xfs_trans *tp; struct fsxattr fa; unsigned int flags; + int join_flags = 0; int error; if (copy_from_user(&flags, arg, sizeof(flags))) @@ -1365,7 +1433,18 @@ xfs_ioc_setxflags( if (error) return error; - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. + */ + error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags); + if (error) + goto out_drop_write; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; From 64485437357dfdc9752495b3f496adfc5c816c6f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Mar 2016 09:41:33 +1100 Subject: [PATCH 4/4] xfs: XFS_DIFLAG2_DAX limited by PAGE_SIZE If the block size of a filesystem is not at least PAGE_SIZEd, then at this point in time DAX cannot be used due to the fact we can't guarantee extents are page sized or aligned without further work. Hence disallow setting the DAX flag on an inode if the block size is too small. Also, be defensive and check the block size when reading an inode in off disk. 
In future, we want to allow DAX to work on any filesystem, so this is temporary while we sort out the correct combination of extent size hints and allocation alignment configurations needed to guarantee page sized and aligned extent allocation for DAX enabled files. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Tested-by: Ross Zwisler Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 12 ++++++++---- fs/xfs/xfs_iops.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 55ec4d44d4ba..388b5b5b67c9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1080,11 +1080,15 @@ xfs_ioctl_setattr_dax_invalidate( /* * It is only valid to set the DAX flag on regular files and - * directories. On directories it serves as an inherit hint. + * directories on filesystems where the block size is equal to the page + * size. On directories it serves as an inherit hint. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_DAX) { + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + return -EINVAL; + } /* If the DAX state is not changing, we have nothing to do here. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5d4f9739d4a4..f7386dc10a20 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1206,6 +1206,7 @@ xfs_diflags_to_iflags( if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && + ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX;