Merge branch 'xfs-dax-support' into for-next

This commit is contained in:
Dave Chinner 2015-06-04 13:01:49 +10:00
commit 66e8ac7bfa
12 changed files with 333 additions and 154 deletions

View File

@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
out:
i_mmap_unlock_read(mapping);
if (bh->b_end_io)
bh->b_end_io(bh, 1);
return error;
}
static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
/**
* __dax_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
*
* When a page fault occurs, filesystems may call this helper in their
* fault handler for DAX files. __dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block, dax_iodone_t complete_unwritten)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page_cache_release(page);
}
/*
* If we successfully insert the new mapping over an unwritten extent,
* we need to ensure we convert the unwritten extent. If there is an
* error inserting the mapping, the filesystem needs to leave it as
* unwritten to prevent exposure of the stale underlying data to
* userspace, but we still need to call the completion function so
* the private resources on the mapping buffer can be released. We
* indicate what the callback should do via the uptodate variable, same
* as for normal BH based IO completions.
*/
error = dax_insert_mapping(inode, &bh, vma, vmf);
if (buffer_unwritten(&bh))
complete_unwritten(&bh, !error);
out:
if (error == -ENOMEM)
@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
goto out;
}
EXPORT_SYMBOL(__dax_fault);
/**
* dax_fault - handle a page fault on a DAX file
@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
get_block_t get_block, dax_iodone_t complete_unwritten)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
result = do_dax_fault(vma, vmf, get_block);
result = __dax_fault(vma, vmf, get_block, complete_unwritten);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);

View File

@ -28,12 +28,12 @@
#ifdef CONFIG_FS_DAX
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_fault(vma, vmf, ext2_get_block);
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block);
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
}
static const struct vm_operations_struct ext2_dax_vm_ops = {

View File

@ -192,15 +192,27 @@ out:
}
#ifdef CONFIG_FS_DAX
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
return;
WARN_ON(!buffer_unwritten(bh));
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
}
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_fault(vma, vmf, ext4_get_block);
return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
/* Is this the right get_block? */
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext4_get_block);
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {

View File

@ -656,18 +656,6 @@ has_zeroout:
return retval;
}
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
return;
WARN_ON(!buffer_unwritten(bh));
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
}
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
if (IS_DAX(inode) && buffer_unwritten(bh)) {
/*
* dgc: I suspect unwritten conversion on ext4+DAX is
* fundamentally broken here when there are concurrent
* read/write in progress on this inode.
*/
WARN_ON_ONCE(io_end);
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);

View File

@ -1349,7 +1349,7 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
int direct)
bool direct)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@ -1414,6 +1414,7 @@ __xfs_get_blocks(
if (error)
return error;
new = 1;
} else {
/*
* Delalloc reservations do not require a transaction,
@ -1508,49 +1509,29 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
STATIC int
int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}
/*
* Complete a direct I/O write request.
*
* The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
* If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
* wholly within the EOF and so there is nothing for us to do. Note that in this
* case the completion can be called in interrupt context, whereas if we have an
* ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
struct kiocb *iocb,
static void
__xfs_end_io_direct_write(
struct inode *inode,
struct xfs_ioend *ioend,
loff_t offset,
ssize_t size,
void *private)
ssize_t size)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
struct xfs_ioend *ioend = private;
struct xfs_mount *mp = XFS_I(inode)->i_mount;
trace_xfs_gbmap_direct_endio(ip, offset, size,
ioend ? ioend->io_type : 0, NULL);
if (!ioend) {
ASSERT(offset + size <= i_size_read(inode));
return;
}
if (XFS_FORCED_SHUTDOWN(mp))
if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
goto out_end_io;
/*
@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
spin_lock(&ip->i_flags_lock);
spin_lock(&XFS_I(inode)->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&XFS_I(inode)->i_flags_lock);
/*
* If we are doing an append IO that needs to update the EOF on disk,
@ -1607,6 +1588,98 @@ out_end_io:
return;
}
/*
* Complete a direct I/O write request.
*
* The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
* If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
* wholly within the EOF and so there is nothing for us to do. Note that in this
* case the completion can be called in interrupt context, whereas if we have an
* ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_ioend *ioend = private;
trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
ioend ? ioend->io_type : 0, NULL);
if (!ioend) {
ASSERT(offset + size <= i_size_read(inode));
return;
}
__xfs_end_io_direct_write(inode, ioend, offset, size);
}
/*
* For DAX we need a mapping buffer callback for unwritten extent conversion
* when page faults allocate blocks and then zero them. Note that in this
* case the mapping indicated by the ioend may extend beyond EOF. We most
* definitely do not want to extend EOF here, so we trim back the ioend size to
* EOF.
*/
#ifdef CONFIG_FS_DAX
void
xfs_end_io_dax_write(
struct buffer_head *bh,
int uptodate)
{
struct xfs_ioend *ioend = bh->b_private;
struct inode *inode = ioend->io_inode;
ssize_t size = ioend->io_size;
ASSERT(IS_DAX(ioend->io_inode));
/* if there was an error zeroing, then don't convert it */
if (!uptodate)
ioend->io_error = -EIO;
/*
* Trim update to EOF, so we don't extend EOF during unwritten extent
* conversion of partial EOF blocks.
*/
spin_lock(&XFS_I(inode)->i_flags_lock);
if (ioend->io_offset + size > i_size_read(inode))
size = i_size_read(inode) - ioend->io_offset;
spin_unlock(&XFS_I(inode)->i_flags_lock);
__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
}
#else
void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
#endif
static inline ssize_t
xfs_vm_do_dio(
struct inode *inode,
struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset,
void (*endio)(struct kiocb *iocb,
loff_t offset,
ssize_t size,
void *private),
int flags)
{
struct block_device *bdev;
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset,
xfs_get_blocks_direct, endio, 0);
bdev = xfs_find_bdev_for_inode(inode);
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct, endio, NULL, flags);
}
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
if (iov_iter_rw(iter) == WRITE) {
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL,
DIO_ASYNC_EXTEND);
}
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct, NULL, NULL, 0);
if (iov_iter_rw(iter) == WRITE)
return xfs_vm_do_dio(inode, iocb, iter, offset,
xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}
/*

View File

@ -53,7 +53,12 @@ typedef struct xfs_ioend {
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
extern void xfs_count_page_state(struct page *, int *, int *);

View File

@ -1133,14 +1133,29 @@ xfs_zero_remaining_bytes(
break;
ASSERT(imap.br_blockcount >= 1);
ASSERT(imap.br_startoff == offset_fsb);
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (imap.br_startblock == HOLESTARTBLOCK ||
imap.br_state == XFS_EXT_UNWRITTEN) {
/* skip the entire extent */
lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
imap.br_blockcount) - 1;
continue;
}
lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
if (lastoffset > endoff)
lastoffset = endoff;
if (imap.br_startblock == HOLESTARTBLOCK)
continue;
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (imap.br_state == XFS_EXT_UNWRITTEN)
/* DAX can just zero the backing device directly */
if (IS_DAX(VFS_I(ip))) {
error = dax_zero_page_range(VFS_I(ip), offset,
lastoffset - offset + 1,
xfs_get_blocks_direct);
if (error)
return error;
continue;
}
error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,

View File

@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
}
/*
* xfs_iozero
* xfs_iozero clears the specified range supplied via the page cache (except in
* the DAX case). Writes through the page cache will allocate blocks over holes,
* though the callers usually map the holes first and avoid them. If a block is
* not completely zeroed, then it will be read from disk before being partially
* zeroed.
*
* xfs_iozero clears the specified range of buffer supplied,
* and marks all the affected blocks as valid and modified. If
* an affected block is not allocated, it will be allocated. If
* an affected block is not completely overwritten, and is not
* valid before the operation, it will be read from disk before
* being partially zeroed.
* In the DAX case, we can just directly write to the underlying pages. This
* will not allocate blocks, but will avoid holes and unwritten extents and so
* not do unnecessary work.
*/
int
xfs_iozero(
@ -96,7 +97,8 @@ xfs_iozero(
{
struct page *page;
struct address_space *mapping;
int status;
int status = 0;
mapping = VFS_I(ip)->i_mapping;
do {
@ -108,6 +110,12 @@ xfs_iozero(
if (bytes > count)
bytes = count;
if (IS_DAX(VFS_I(ip))) {
status = dax_zero_page_range(VFS_I(ip), pos, bytes,
xfs_get_blocks_direct);
if (status)
break;
} else {
status = pagecache_write_begin(NULL, mapping, pos, bytes,
AOP_FLAG_UNINTERRUPTIBLE,
&page, &fsdata);
@ -116,12 +124,13 @@ xfs_iozero(
zero_user(page, offset, bytes);
status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
page, fsdata);
status = pagecache_write_end(NULL, mapping, pos, bytes,
bytes, page, fsdata);
WARN_ON(status <= 0); /* can't return less than zero! */
status = 0;
}
pos += bytes;
count -= bytes;
status = 0;
} while (count);
return status;
@ -284,7 +293,7 @@ xfs_file_read_iter(
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@ -378,6 +387,10 @@ xfs_file_splice_read(
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
/* for dax, we need to avoid the page cache */
if (IS_DAX(VFS_I(ip)))
ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
else
ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
if ((pos | count) & target->bt_logical_sectormask)
if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
out:
xfs_rw_iunlock(ip, iolock);
/* No fallback to buffered IO on errors for XFS. */
ASSERT(ret < 0 || ret == count);
/*
* No fallback to buffered IO on errors for XFS. DAX can result in
* partial writes, but direct IO will either complete fully or fail.
*/
ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
@ -842,7 +858,7 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
@ -1063,17 +1079,6 @@ xfs_file_readdir(
return xfs_readdir(ip, ctx, bufsize);
}
STATIC int
xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
vma->vm_ops = &xfs_file_vm_ops;
file_accessed(filp);
return 0;
}
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
@ -1454,26 +1459,11 @@ xfs_file_llseek(
* ordering of:
*
* mmap_sem (MM)
* sb_start_pagefault(vfs, freeze)
* i_mmap_lock (XFS - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
STATIC int
xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
int error;
trace_xfs_filemap_fault(ip);
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
error = filemap_fault(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
return error;
}
/*
* mmap()d file has taken write protection fault and is being made writable. We
@ -1486,16 +1476,66 @@ xfs_filemap_page_mkwrite(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
int error;
struct inode *inode = file_inode(vma->vm_file);
int ret;
trace_xfs_filemap_page_mkwrite(ip);
trace_xfs_filemap_page_mkwrite(XFS_I(inode));
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
xfs_end_io_dax_write);
} else {
ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
}
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
}
STATIC int
xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));
int ret;
trace_xfs_filemap_fault(ip);
/* DAX can shortcut the normal fault path on write faults! */
if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = filemap_fault(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
return error;
return ret;
}
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
STATIC int
xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
const struct file_operations xfs_file_operations = {
@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};

View File

@ -851,7 +851,11 @@ xfs_setattr_size(
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
*/
error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (IS_DAX(inode))
error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
else
error = block_truncate_page(inode->i_mapping, newsize,
xfs_get_blocks);
if (error)
return error;
truncate_setsize(inode, newsize);
@ -1191,22 +1195,22 @@ xfs_diflags_to_iflags(
struct inode *inode,
struct xfs_inode *ip)
{
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
uint16_t flags = ip->i_d.di_flags;
inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
S_NOATIME | S_DAX);
if (flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
inode->i_flags &= ~S_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
if (flags & XFS_DIFLAG_APPEND)
inode->i_flags |= S_APPEND;
else
inode->i_flags &= ~S_APPEND;
if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
if (flags & XFS_DIFLAG_SYNC)
inode->i_flags |= S_SYNC;
else
inode->i_flags &= ~S_SYNC;
if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
/* XXX: Also needs an on-disk per inode flag! */
if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
inode->i_flags |= S_DAX;
}
/*

View File

@ -181,6 +181,8 @@ typedef struct xfs_mount {
allocator */
#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
/*
* Default minimum read and write sizes.

View File

@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
/*
* Table driven mount option parser.
*
@ -363,6 +365,10 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
mp->m_flags &= ~XFS_MOUNT_DISCARD;
#ifdef CONFIG_FS_DAX
} else if (!strcmp(this_char, MNTOPT_DAX)) {
mp->m_flags |= XFS_MOUNT_DAX;
#endif
} else {
xfs_warn(mp, "unknown mount option [%s].", this_char);
return -EINVAL;
@ -452,7 +458,7 @@ done:
}
struct proc_xfs_info {
int flag;
uint64_t flag;
char *str;
};
@ -474,6 +480,7 @@ xfs_showargs(
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
{ XFS_MOUNT_DAX, "," MNTOPT_DAX },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= MS_I_VERSION;
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
if (sb->s_blocksize != PAGE_SIZE) {
xfs_alert(mp,
"Filesystem block size invalid for DAX Turning DAX off.");
mp->m_flags &= ~XFS_MOUNT_DAX;
} else if (!sb->s_bdev->bd_disk->fops->direct_access) {
xfs_alert(mp,
"Block device does not support DAX Turning DAX off.");
mp->m_flags &= ~XFS_MOUNT_DAX;
}
}
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;

View File

@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
@ -2627,9 +2628,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
int dax_clear_blocks(struct inode *, sector_t block, long size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
dax_iodone_t);
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
dax_iodone_t);
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,