linux/fs/sync.c

/*
 * High-level sync()-related operations
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>

/*
 * Every flag bit sys_sync_file_range() accepts.  A request carrying any bit
 * outside this mask is rejected with -EINVAL.
 */
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
			SYNC_FILE_RANGE_WAIT_AFTER)

/*
 * sys_sync_file_range() permits finely controlled syncing over a segment of
 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 * zero then sys_sync_file_range() will operate from offset out to EOF.
 *
 * The flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 * before performing the write.
 *
 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 * range which are not presently under writeback.
 *
 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 * after performing the write.
 *
 * Useful combinations of the flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 * in the range which were dirty on entry to sys_sync_file_range() are placed
 * under writeout.  This is a start-write-for-data-integrity operation.
 *
 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 * are not presently under writeout.  This is an asynchronous flush-to-disk
 * operation.  Not suitable for data integrity operations.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 * completion of writeout of all pages in the range.  This will be used after an
 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 * for that operation to complete and to return the result.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
 * a traditional sync() operation.  This is a write-for-data-integrity operation
 * which will ensure that all pages in the range which were dirty on entry to
 * sys_sync_file_range() are committed to disk.
 *
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 * I/O errors or ENOSPC conditions and will return those to the caller, after
 * clearing the EIO and ENOSPC flags in the address_space.
 *
 * It should be noted that none of these operations write out the file's
 * metadata.  So unless the application is strictly performing overwrites of
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
					unsigned int flags)
{
	struct file *filp;
	loff_t last;			/* exclusive until adjusted below */
	umode_t mode;
	int light;
	int err;

	/* Refuse any flag bit outside the supported set. */
	if (flags & ~VALID_FLAGS)
		return -EINVAL;

	/*
	 * A negative start, a negative end, or an end that lands before the
	 * start are all invalid requests.
	 */
	last = offset + nbytes;
	if ((s64)offset < 0 || (s64)last < 0 || last < offset)
		return -EINVAL;

	if (sizeof(pgoff_t) == 4) {
		/* First byte offset a 32-bit page index cannot reach. */
		const unsigned long long limit =
				0x100000000ULL << PAGE_CACHE_SHIFT;

		/*
		 * The range starts outside a 32 bit machine's pagecache
		 * addressing capabilities.  Let it "succeed".
		 */
		if (offset >= limit)
			return 0;

		/* The range runs past the limit: sync out to EOF instead. */
		if (last >= limit)
			nbytes = 0;
	}

	/* Make the end inclusive; a zero length means "out to EOF". */
	last = nbytes ? last - 1 : LLONG_MAX;

	filp = fget_light(fd, &light);
	if (!filp)
		return -EBADF;

	/*
	 * Only regular files, block devices, directories and symlinks are
	 * accepted; every other file type gets -ESPIPE.
	 */
	mode = filp->f_dentry->d_inode->i_mode;
	if (S_ISREG(mode) || S_ISBLK(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		err = do_sync_file_range(filp, offset, last, flags);
	else
		err = -ESPIPE;

	fput_light(filp, light);
	return err;
}

/*
 * Run the steps selected by `flags' over the byte range
 * offset .. endbyte of `file'.  `endbyte' is inclusive.
 *
 * The steps are taken in a fixed order: wait-before, start writeout,
 * wait-after.  The first step that returns a negative value ends the
 * call and that value is returned.  Otherwise the result of the last
 * step performed is returned, or 0 if no flag selected any step.
 *
 * Returns -EINVAL if the file has no address_space.
 */
int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
			unsigned int flags)
{
	struct address_space *mapping = file->f_mapping;
	int err = 0;

	if (!mapping)
		return -EINVAL;

	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
		/* The wait helper takes page indices, not byte offsets. */
		err = wait_on_page_writeback_range(mapping,
					offset >> PAGE_CACHE_SHIFT,
					endbyte >> PAGE_CACHE_SHIFT);
		if (err < 0)
			return err;
	}

	if (flags & SYNC_FILE_RANGE_WRITE) {
		err = __filemap_fdatawrite_range(mapping, offset, endbyte,
						WB_SYNC_NONE);
		if (err < 0)
			return err;
	}

	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
		err = wait_on_page_writeback_range(mapping,
					offset >> PAGE_CACHE_SHIFT,
					endbyte >> PAGE_CACHE_SHIFT);

	return err;
}
EXPORT_SYMBOL_GPL(do_sync_file_range);
[PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The syscall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync(). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-03-31 12:30:42 +02:00			`/*`
			`* High-level sync()-related operations`
			`*/`

			`#include <linux/kernel.h>`
			`#include <linux/file.h>`
			`#include <linux/fs.h>`
			`#include <linux/module.h>`
			`#include <linux/writeback.h>`
			`#include <linux/syscalls.h>`
			`#include <linux/linkage.h>`
			`#include <linux/pagemap.h>`

			`#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\| \`
			`SYNC_FILE_RANGE_WAIT_AFTER)`

			`/*`
			`* sys_sync_file_range() permits finely controlled syncing over a segment of`
			`* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is`
			`* zero then sys_sync_file_range() will operate from offset out to EOF.`
			`*`
			`* The flag bits are:`
			`*`
			`* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range`
			`* before performing the write.`
			`*`
			`* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the`
			`* range which are not presently under writeback.`
			`*`
			`* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range`
			`* after performing the write.`
			`*`
			`* Useful combinations of the flag bits are:`
			`*`
			`* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE: ensures that all pages`
			`* in the range which were dirty on entry to sys_sync_file_range() are placed`
			`* under writeout. This is a start-write-for-data-integrity operation.`
			`*`
			`* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which`
			`* are not presently under writeout. This is an asynchronous flush-to-disk`
			`* operation. Not suitable for data integrity operations.`
			`*`
			`* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for`
			`* completion of writeout of all pages in the range. This will be used after an`
			`* earlier SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE operation to wait`
			`* for that operation to complete and to return the result.`
			`*`
			`* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\|SYNC_FILE_RANGE_WAIT_AFTER:`
			`* a traditional sync() operation. This is a write-for-data-integrity operation`
			`* which will ensure that all pages in the range which were dirty on entry to`
			`* sys_sync_file_range() are committed to disk.`
			`*`
			`*`
			`* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any`
			`* I/O errors or ENOSPC conditions and will return those to the caller, after`
			`* clearing the EIO and ENOSPC flags in the address_space.`
			`*`
			`* It should be noted that none of these operations write out the file's`
			`* metadata. So unless the application is strictly performing overwrites of`
			`* already-instantiated disk blocks, there are no guarantees here that the data`
			`* will be available after a crash.`
			`*/`
			`asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,`
[PATCH] sync_file_range(): use unsigned for flags Ulrich suggested that the `flags' arg to sync_file_range() become unsigned. Cc: Ulrich Drepper <drepper@redhat.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-04-11 07:53:57 +02:00			`unsigned int flags)`
[PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The syscall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync(). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-03-31 12:30:42 +02:00			`{`
			`int ret;`
			`struct file *file;`
			`loff_t endbyte; /* inclusive */`
			`int fput_needed;`
			`umode_t i_mode;`

			`ret = -EINVAL;`
			`if (flags & ~VALID_FLAGS)`
			`goto out;`

			`endbyte = offset + nbytes;`

			`if ((s64)offset < 0)`
			`goto out;`
			`if ((s64)endbyte < 0)`
			`goto out;`
			`if (endbyte < offset)`
			`goto out;`

			`if (sizeof(pgoff_t) == 4) {`
			`if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {`
			`/*`
			`* The range starts outside a 32 bit machine's`
			`* pagecache addressing capabilities. Let it "succeed"`
			`*/`
			`ret = 0;`
			`goto out;`
			`}`
			`if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {`
			`/*`
			`* Out to EOF`
			`*/`
			`nbytes = 0;`
			`}`
			`}`

			`if (nbytes == 0)`
[PATCH] writeback: fix range handling When a writeback_control's `start' and `end' fields are used to indicate a one-byte-range starting at file offset zero, the required values of .start=0,.end=0 mean that the ->writepages() implementation has no way of telling that it is being asked to perform a range request. Because we're currently overloading (start == 0 && end == 0) to mean "this is not a write-a-range request". To make all this sane, the patch changes range of writeback_control. So caller does: If it is calling ->writepages() to write pages, it sets range (range_start/end or range_cyclic) always. And if range_cyclic is true, ->writepages() thinks the range is cyclic, otherwise it just uses range_start and range_end. This patch does, - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h -1 is usually ok for range_end (type is long long). But, if someone did, range_end += val; range_end is "val - 1" u64val = range_end >> bits; u64val is "~(0ULL)" or something, they are wrong. So, this adds LLONG_MAX to avoid nasty things, and uses LLONG_MAX for range_end. - All callers of ->writepages() sets range_start/end or range_cyclic. - Fix updates of ->writeback_index. It seems already bit strange. If it starts at 0 and ended by check of nr_to_write, this last index may reduce chance to scan end of file. So, this updates ->writeback_index only if range_cyclic is true or whole-file is scanned. Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> Cc: Nathan Scott <nathans@sgi.com> Cc: Anton Altaparmakov <aia21@cantab.net> Cc: Steven French <sfrench@us.ibm.com> Cc: "Vladimir V. Saveliev" <vs@namesys.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-06-23 11:03:26 +02:00			`endbyte = LLONG_MAX;`
[PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The syscall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync(). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-03-31 12:30:42 +02:00			`else`
			`endbyte--; /* inclusive */`

			`ret = -EBADF;`
			`file = fget_light(fd, &fput_needed);`
			`if (!file)`
			`goto out;`

			`i_mode = file->f_dentry->d_inode->i_mode;`
			`ret = -ESPIPE;`
			`if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&`
			`!S_ISLNK(i_mode))`
			`goto out_put;`

			`ret = do_sync_file_range(file, offset, endbyte, flags);`
			`out_put:`
			`fput_light(file, fput_needed);`
			`out:`
			`return ret;`
			`}`

			`/*`
			* `endbyte' is inclusive
			`*/`
			`int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,`
[PATCH] sync_file_range(): use unsigned for flags Ulrich suggested that the `flags' arg to sync_file_range() become unsigned. Cc: Ulrich Drepper <drepper@redhat.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-04-11 07:53:57 +02:00			`unsigned int flags)`
[PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The syscall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync(). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-03-31 12:30:42 +02:00			`{`
			`int ret;`
			`struct address_space *mapping;`

			`mapping = file->f_mapping;`
			`if (!mapping) {`
			`ret = -EINVAL;`
			`goto out;`
			`}`

			`ret = 0;`
			`if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {`
			`ret = wait_on_page_writeback_range(mapping,`
			`offset >> PAGE_CACHE_SHIFT,`
			`endbyte >> PAGE_CACHE_SHIFT);`
			`if (ret < 0)`
			`goto out;`
			`}`

			`if (flags & SYNC_FILE_RANGE_WRITE) {`
			`ret = __filemap_fdatawrite_range(mapping, offset, endbyte,`
			`WB_SYNC_NONE);`
			`if (ret < 0)`
			`goto out;`
			`}`

			`if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {`
			`ret = wait_on_page_writeback_range(mapping,`
			`offset >> PAGE_CACHE_SHIFT,`
			`endbyte >> PAGE_CACHE_SHIFT);`
			`}`
			`out:`
			`return ret;`
			`}`
			`EXPORT_SYMBOL_GPL(do_sync_file_range);`