diff --git a/arch/Kconfig b/arch/Kconfig index a71cdbe2a04d..8f3564930580 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -93,7 +93,6 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n - select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') diff --git a/fs/block_dev.c b/fs/block_dev.c index 198243717da5..33b813e04f79 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1769,7 +1769,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; @@ -1781,13 +1781,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -1795,8 +1795,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 68ad89e23713..8f259b3a66b3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1640,9 +1640,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1681,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); diff --git a/fs/dcache.c b/fs/dcache.c index 9b5fe503f6cb..5c33aeb0f68f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2718,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2744,7 +2744,6 @@ out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2790,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( @@ -2812,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); } iput(inode); @@ -2926,6 +2925,13 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Escaped? */ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } /* Global root? */ if (mnt != parent) { dentry = ACCESS_ONCE(mnt->mnt_mountpoint); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 5718cb9f7273..d72d52b90433 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_sb_list_lock); + + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5fa588e933d5..ae0f438c2ee6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -88,7 +88,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { - return list_entry(head, struct inode, i_wb_list); + return list_entry(head, struct inode, i_io_list); } /* @@ -125,22 +125,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) } /** - * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * - * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ -static bool inode_wb_list_move_locked(struct inode *inode, +static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, head); + list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) @@ -151,19 +151,19 @@ static bool inode_wb_list_move_locked(struct inode *inode, } /** - * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */ -static void inode_wb_list_del_locked(struct inode *inode, +static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - list_del_init(&inode->i_wb_list); + list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -351,7 +351,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_wb_list. + * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; @@ -390,16 +390,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. */ - if (!list_empty(&inode->i_wb_list)) { + if (!list_empty(&inode->i_io_list)) { struct inode *pos; - inode_wb_list_del_locked(inode, old_wb); + inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; - inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } @@ -961,12 +961,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb) /* * Remove the inode from the writeback list it is on. */ -void inode_wb_list_del(struct inode *inode) +void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -988,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - inode_wb_list_move_locked(inode, wb, &wb->b_dirty); + inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -996,7 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - inode_wb_list_move_locked(inode, wb, &wb->b_more_io); + inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -1055,7 +1055,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_wb_list, &tmp); + list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); @@ -1078,7 +1078,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_wb_list, dispatch_queue); + list_move(&inode->i_io_list, dispatch_queue); } } out: @@ -1232,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); } } @@ -1378,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1439,7 +1439,9 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ + struct blk_plug plug; + blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1537,6 +1539,7 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } + blk_finish_plug(&plug); return wrote; } @@ -2088,7 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) else dirty_list = &wb->b_dirty_time; - wakeup_bdi = inode_wb_list_move_locked(inode, wb, + wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); @@ -2111,6 +2114,15 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +/* + * The @s_sync_lock is used to serialise concurrent sync operations + * to avoid lock contention problems with concurrent wait_sb_inodes() calls. + * Concurrent callers will block on the s_sync_lock rather than doing contending + * walks. The queueing maintains sync(2) required behaviour as all the IO that + * has been issued up to the time this function is enter is guaranteed to be + * completed by the time we have gained the lock and waited for all IO that is + * in progress regardless of the order callers are granted the lock. + */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -2121,7 +2133,8 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_sb_list_lock); + mutex_lock(&sb->s_sync_lock); + spin_lock(&sb->s_inode_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -2141,14 +2154,14 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -2158,10 +2171,11 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); + mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, diff --git a/fs/inode.c b/fs/inode.c index d30640f7a193..78a17b8859e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,16 +28,16 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode_sb_list_lock protects: - * sb->s_inodes, inode->i_sb_list + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock @@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); - /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode) memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } } @@ -527,8 +525,8 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_wb_list)) - inode_wb_list_del(inode); + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); inode_sb_list_del(inode); @@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); + cond_resched(); } } @@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); +again: + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. + */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -890,7 +902,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_sb_list_lock); + spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) diff --git a/fs/internal.h b/fs/internal.h index 4d5af583ab03..71859c4d0b41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,14 +112,13 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c */ -extern spinlock_t inode_sb_list_lock; extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c */ -extern void inode_wb_list_del(struct inode *inode); +extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); diff --git a/fs/namei.c b/fs/namei.c index 1c2105ed20c5..29b927938b8c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd) return 0; } +/** + * path_connected - Verify that a path->dentry is below path->mnt.mnt_root + * @path: nameidate to verify + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + + /* Only bind mounts can have disconnected paths */ + if (mnt->mnt_root == mnt->mnt_sb->s_root) + return true; + + return is_subdir(path->dentry, mnt->mnt_root); +} + static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) @@ -1296,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1396,7 +1416,7 @@ static void follow_mount(struct path *path) } } -static void follow_dotdot(struct nameidata *nd) +static int follow_dotdot(struct nameidata *nd) { if (!nd->root.mnt) set_root(nd); @@ -1412,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd) /* rare case of legitimate dget_parent()... */ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } if (!follow_up(&nd->path)) @@ -1419,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd) } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; + return 0; } /* @@ -1634,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type) if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else - follow_dotdot(nd); + return follow_dotdot(nd); } return 0; } diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 474a3ce1b5e1..e785fd954c30 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -143,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) + * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_sb_list_lock); - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { struct inode *need_iput_tmp; /* @@ -189,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != list) { + while (&next_i->i_sb_list != &sb->s_inodes) { spin_lock(&next_i->i_lock); if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && atomic_read(&next_i->i_count)) { @@ -204,12 +204,12 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_sb_list_lock here because either + * We can safely drop s_inode_list_lock here because either * we actually hold references on both inode and next_i or * end of list. Also no new inodes will be added since the * umount has begun. */ - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -221,7 +221,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fed66e2c9fe8..ef0d64b2a6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -928,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -939,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -951,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type) /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock We cannot iput the inode now as we can be + * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1028,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1044,7 +1044,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, } spin_unlock(&dq_data_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/super.c b/fs/super.c index b61372354f2b..954aeb80e202 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_free_rwsem(&s->s_writers.rw_sem[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** @@ -178,19 +193,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); + mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); + spin_lock_init(&s->s_inode_list_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; @@ -399,7 +414,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~MS_ACTIVE; - fsnotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(sb); evict_inodes(sb); @@ -1146,72 +1161,46 @@ out: */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super(). - */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, - unsigned long ip) -{ - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ int __sb_start_write(struct super_block *sb, int level, bool wait) { -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } + bool force_trylock = false; + int ret = 1; #ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif - percpu_counter_inc(&sb->s_writers.counter[level-1]); /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { + force_trylock = true; + break; + } } - return 1; +#endif + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + + WARN_ON(force_trylock & !ret); + return ret; } EXPORT_SYMBOL(__sb_start_write); @@ -1221,37 +1210,33 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - + percpu_down_write(sb->s_writers.rw_sem + level-1); /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. */ - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - do { - DEFINE_WAIT(wait); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); - - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1310,20 +1295,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... */ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1331,7 +1310,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1340,7 +1318,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1372,8 +1350,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1385,12 +1365,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 4d0e02b022b3..392db25c0b56 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o truncate.o util.o + namei.o super.o symlink.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7106eda5024..fb8b54eb77c5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -417,7 +417,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); @@ -473,7 +475,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f913a6924b23..a064cf44b143 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,9 +41,7 @@ #include "swab.h" #include "util.h" -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); - -static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4]) { struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; int ptrs = uspi->s_apb; @@ -75,227 +73,232 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off return n; } +typedef struct { + void *p; + union { + __fs32 key32; + __fs64 key64; + }; + struct buffer_head *bh; +} Indirect; + +static inline int grow_chain32(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs32 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key32 = *(__fs32 *)(to->p = v); + for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + +static inline int grow_chain64(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs64 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key64 = *(__fs64 *)(to->p = v); + for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + /* * Returns the location of the fragment from * the beginning of the filesystem. */ -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; - sector_t offsets[4], *p; - int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); - u64 ret = 0L; - __fs32 block; - __fs64 u2_block = 0L; + Indirect chain[4], *q = chain; + unsigned *p; unsigned flags = UFS_SB(sb)->s_flags; - u64 temp = 0L; + u64 res = 0; - UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", uspi->s_fpbshift, uspi->s_apbmask, (unsigned long long)mask); if (depth == 0) - return 0; + goto no_block; +again: p = offsets; - if (needs_lock) - lock_ufs(sb); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; - block = ufsi->i_u1.i_data[*p++]; - if (!block) - goto out; + if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q)) + goto changed; + if (!q->key32) + goto no_block; while (--depth) { + __fs32 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; - bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs32_to_cpu(sb, q->key32) + (n>>shift)); if (!bh) - goto out; - block = ((__fs32 *) bh->b_data)[n & mask]; - brelse (bh); - if (!block) - goto out; + goto no_block; + ptr = (__fs32 *)bh->b_data + (n & mask); + if (!grow_chain32(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key32) + goto no_block; } - ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); - goto out; + res = fs32_to_cpu(sb, q->key32); + goto found; + ufs2: - u2_block = ufsi->i_u1.u2_i_data[*p++]; - if (!u2_block) - goto out; - + if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q)) + goto changed; + if (!q->key64) + goto no_block; while (--depth) { + __fs64 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; - - temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); - bh = sb_bread(sb, temp +(u64) (n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs64_to_cpu(sb, q->key64) + (n>>shift)); if (!bh) - goto out; - u2_block = ((__fs64 *)bh->b_data)[n & mask]; - brelse(bh); - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs64 *)bh->b_data + (n & mask); + if (!grow_chain64(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key64) + goto no_block; } - temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); - ret = temp + (u64) (frag & uspi->s_fpbmask); + res = fs64_to_cpu(sb, q->key64); +found: + res += uspi->s_sbbase; +no_block: + while (q > chain) { + brelse(q->bh); + q--; + } + return res; -out: - if (needs_lock) - unlock_ufs(sb); - return ret; +changed: + while (q > chain) { + brelse(q->bh); + q--; + } + goto again; +} + +/* + * Unpacking tails: we have a file with partial final block and + * we had been asked to extend it. If the fragment being written + * is within the same block, we need to extend the tail just to cover + * that fragment. Otherwise the tail is extended to full block. + * + * Note that we might need to create a _new_ tail, but that will + * be handled elsewhere; this is strictly for resizing old + * ones. + */ +static bool +ufs_extend_tail(struct inode *inode, u64 writes_to, + int *err, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */ + unsigned block = ufs_fragstoblks(lastfrag); + unsigned new_size; + void *p; + u64 tmp; + + if (writes_to < (lastfrag | uspi->s_fpbmask)) + new_size = (writes_to & uspi->s_fpbmask) + 1; + else + new_size = uspi->s_fpb; + + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), + new_size, err, locked_page); + return tmp != 0; } /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode: pointer to inode - * @fragment: number of `fragment' which hold pointer - * to new allocated fragment(s) + * @index: number of block pointer within the inode's array. * @new_fragment: number of new allocated fragment(s) - * @required: how many fragment(s) we require * @err: we set it if something wrong - * @phys: pointer to where we save physical number of new allocated fragments, - * NULL if we allocate not data(indirect blocks for example). * @new: we set it if we allocate new block * @locked_page: for ufs_new_fragments() */ -static struct buffer_head * -ufs_inode_getfrag(struct inode *inode, u64 fragment, - sector_t new_fragment, unsigned int required, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getfrag(struct inode *inode, unsigned index, + sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff, lastblockoff; - u64 tmp, goal, lastfrag, block, lastblock; - void *p, *p2; - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " - "metadata %d\n", inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, required, !phys); + u64 tmp, goal, lastfrag; + unsigned nfrags = uspi->s_fpb; + void *p; /* TODO : to be done for write support if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; */ - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - p = ufs_get_direct_data_ptr(uspi, ufsi, block); - - goal = 0; - -repeat: + p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) + goto out; lastfrag = ufsi->i_lastfrag; - if (tmp && fragment < lastfrag) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) { - UFSD("EXIT, result %llu\n", - (unsigned long long)tmp + blockoff); - return result; - } - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - return NULL; - } - } - lastblock = ufs_fragstoblks (lastfrag); - lastblockoff = ufs_fragnum (lastfrag); - /* - * We will extend file into new block beyond last allocated block - */ - if (lastblock < block) { - /* - * We must reallocate last allocated block - */ - if (lastblockoff) { - p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); - tmp = ufs_new_fragments(inode, p2, lastfrag, - ufs_data_ptr_to_cpu(sb, p2), - uspi->s_fpb - lastblockoff, - err, locked_page); - if (!tmp) { - if (lastfrag != ufsi->i_lastfrag) - goto repeat; - else - return NULL; - } - lastfrag = ufsi->i_lastfrag; - - } - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, - lastblock)); - if (tmp) - goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, required + blockoff, - err, - phys != NULL ? locked_page : NULL); - } else if (lastblock == block) { - /* - * We will extend last allocated block - */ - tmp = ufs_new_fragments(inode, p, fragment - - (blockoff - lastblockoff), - ufs_data_ptr_to_cpu(sb, p), - required + (blockoff - lastblockoff), - err, phys != NULL ? locked_page : NULL); - } else /* (lastblock > block) */ { - /* - * We will allocate new block before last allocated block - */ - if (block) { - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); - if (tmp) - goal = tmp + uspi->s_fpb; - } - tmp = ufs_new_fragments(inode, p, fragment - blockoff, - goal, uspi->s_fpb, err, - phys != NULL ? locked_page : NULL); + /* will that be a new tail? */ + if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag) + nfrags = (new_fragment & uspi->s_fpbmask) + 1; + + goal = 0; + if (index) { + goal = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, index - 1)); + if (goal) + goal += uspi->s_fpb; } + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), + goal, uspi->s_fpb, err, locked_page); + if (!tmp) { - if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || - (blockoff && lastfrag != ufsi->i_lastfrag)) - goto repeat; *err = -ENOSPC; - return NULL; + return 0; } - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - result = NULL; - *err = 0; + if (new) *new = 1; - } - inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); - return result; +out: + return tmp + uspi->s_sbbase; /* This part : To be implemented .... Required only for writing, not required for READ-ONLY. @@ -316,95 +319,70 @@ repeat2: /** * ufs_inode_getblock() - allocate new block * @inode: pointer to inode - * @bh: pointer to block which hold "pointer" to new allocated block - * @fragment: number of `fragment' which hold pointer - * to new allocated block + * @ind_block: block number of the indirect block + * @index: number of pointer within the indirect block * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() - * @phys: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() * @locked_page: see ufs_inode_getfrag() */ -static struct buffer_head * -ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, - u64 fragment, sector_t new_fragment, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getblock(struct inode *inode, u64 ind_block, + unsigned index, sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff; - u64 tmp, goal, block; + int shift = uspi->s_apbshift - uspi->s_fpbshift; + u64 tmp = 0, goal; + struct buffer_head *bh; void *p; - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); + if (!ind_block) + return 0; - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", - inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, !phys); - - result = NULL; - if (!bh) - goto out; - if (!buffer_uptodate(bh)) { - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); - if (!buffer_uptodate(bh)) - goto out; + bh = sb_bread(sb, ind_block + (index >> shift)); + if (unlikely(!bh)) { + *err = -EIO; + return 0; } + + index &= uspi->s_apbmask >> uspi->s_fpbshift; if (uspi->fs_magic == UFS2_MAGIC) - p = (__fs64 *)bh->b_data + block; + p = (__fs64 *)bh->b_data + index; else - p = (__fs32 *)bh->b_data + block; -repeat: - tmp = ufs_data_ptr_to_cpu(sb, p); - if (tmp) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) - goto out; - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - goto out; - } - } + p = (__fs32 *)bh->b_data + index; - if (block && (uspi->fs_magic == UFS2_MAGIC ? - (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : - (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) + goto out; + + if (index && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1])))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, locked_page); - if (!tmp) { - if (ufs_data_ptr_to_cpu(sb, p)) - goto repeat; + if (!tmp) goto out; - } - - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; + if (new) *new = 1; - } mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD("result %llu\n", (unsigned long long)tmp + blockoff); out: brelse (bh); UFSD("EXIT\n"); - return result; + if (tmp) + tmp += uspi->s_sbbase; + return tmp; } /** @@ -412,103 +390,64 @@ out: * readpage, writepage and so on */ -int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { - struct super_block * sb = inode->i_sb; - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi = sbi->s_uspi; - struct buffer_head * bh; - int ret, err, new; - unsigned long ptr,phys; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int err = 0, new = 0; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); u64 phys64 = 0; - bool needs_lock = (sbi->mutex_owner != current); - + unsigned frag = fragment & uspi->s_fpbmask; + if (!create) { - phys64 = ufs_frag_map(inode, fragment, needs_lock); - UFSD("phys64 = %llu\n", (unsigned long long)phys64); - if (phys64) - map_bh(bh_result, sb, phys64); - return 0; + phys64 = ufs_frag_map(inode, offsets, depth); + goto out; } /* This code entered only while writing ....? */ - err = -EIO; - new = 0; - ret = 0; - bh = NULL; - - if (needs_lock) - lock_ufs(sb); + mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (fragment > - ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) - << uspi->s_fpbshift)) - goto abort_too_big; - - err = 0; - ptr = fragment; - - /* - * ok, these macros clean the logic up a bit and make - * it much more readable: - */ -#define GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ - bh_result->b_page) -#define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ - bh_result->b_page) -#define GET_INDIRECT_DATABLOCK(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, &phys, &new, bh_result->b_page) -#define GET_INDIRECT_PTR(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, NULL, NULL, NULL) - - if (ptr < UFS_NDIR_FRAGMENT) { - bh = GET_INODE_DATABLOCK(ptr); + if (unlikely(!depth)) { + ufs_warning(sb, "ufs_get_block", "block > big"); + err = -EIO; goto out; } - ptr -= UFS_NDIR_FRAGMENT; - if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); - goto get_indirect; - } - ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); - if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); - goto get_double; - } - ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); - bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); -get_double: - bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); -get_indirect: - bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); -#undef GET_INODE_DATABLOCK -#undef GET_INODE_PTR -#undef GET_INDIRECT_DATABLOCK -#undef GET_INDIRECT_PTR + if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) { + unsigned lastfrag = UFS_I(inode)->i_lastfrag; + unsigned tailfrags = lastfrag & uspi->s_fpbmask; + if (tailfrags && fragment >= lastfrag) { + if (!ufs_extend_tail(inode, fragment, + &err, bh_result->b_page)) + goto out; + } + } + if (depth == 1) { + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, &new, bh_result->b_page); + } else { + int i; + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL); + for (i = 1; i < depth - 1; i++) + phys64 = ufs_inode_getblock(inode, phys64, offsets[i], + fragment, &err, NULL, NULL); + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], + fragment, &err, &new, bh_result->b_page); + } out: - if (err) - goto abort; - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, sb, phys); -abort: - if (needs_lock) - unlock_ufs(sb); - + if (phys64) { + phys64 += frag; + map_bh(bh_result, sb, phys64); + if (new) + set_buffer_new(bh_result); + } + mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; - -abort_too_big: - ufs_warning(sb, "ufs_get_block", "block > big"); - goto abort; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -526,12 +465,16 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ufs_getfrag_block); } +static void ufs_truncate_blocks(struct inode *); + static void ufs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; - if (to > inode->i_size) + if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); + ufs_truncate_blocks(inode); + } } static int ufs_write_begin(struct file *file, struct address_space *mapping, @@ -548,6 +491,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int ufs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ufs_write_failed(mapping, pos + len); + return ret; +} + static sector_t ufs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ufs_getfrag_block); @@ -557,7 +512,7 @@ const struct address_space_operations ufs_aops = { .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, - .write_end = generic_write_end, + .write_end = ufs_write_end, .bmap = ufs_bmap }; @@ -599,7 +554,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; } - + /* * Linux now has 32-bit uid and gid, so we can support EFT. */ @@ -619,7 +574,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, sizeof(ufs_inode->ui_u2.ui_addr)); @@ -753,7 +708,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); - + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; @@ -855,23 +810,19 @@ static int ufs_update_inode(struct inode * inode, int do_sync) ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + mark_buffer_dirty(bh); if (do_sync) sync_dirty_buffer(bh); brelse (bh); - + UFSD("EXIT\n"); return 0; } int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - lock_ufs(inode->i_sb); - ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_ufs(inode->i_sb); - return ret; + return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int ufs_sync_inode (struct inode *inode) @@ -888,24 +839,389 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { - loff_t old_i_size; - /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - lock_ufs(inode->i_sb); - mark_inode_dirty(inode); - ufs_update_inode(inode, IS_SYNC(inode)); - old_i_size = inode->i_size; inode->i_size = 0; - if (inode->i_blocks && ufs_truncate(inode, old_i_size)) - ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); - unlock_ufs(inode->i_sb); + if (inode->i_blocks) + ufs_truncate_blocks(inode); } invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) { - lock_ufs(inode->i_sb); + if (want_delete) ufs_free_inode(inode); - unlock_ufs(inode->i_sb); - } } + +struct to_free { + struct inode *inode; + u64 to; + unsigned count; +}; + +static inline void free_data(struct to_free *ctx, u64 from, unsigned count) +{ + if (ctx->count && ctx->to != from) { + ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count); + ctx->count = 0; + } + ctx->count += count; + ctx->to = from + count; +} + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + +static void ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + struct to_free ctx = {.inode = inode}; + unsigned i, tmp; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + free_data(&ctx, tmp, uspi->s_fpb); + } + + free_data(&ctx, 0, 0); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + ufs_free_fragments (inode, tmp, frag4); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static void free_full_branch(struct inode *inode, u64 ind_block, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize); + unsigned i; + + if (!ubh) + return; + + if (--depth) { + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_full_branch(inode, block, depth); + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_data(&ctx, block, uspi->s_fpb); + } + free_data(&ctx, 0, 0); + } + + ubh_bforget(ubh); + ufs_free_blocks(inode, ind_block, uspi->s_fpb); +} + +static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i; + + if (--depth) { + for (i = from; i < uspi->s_apb ; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_full_branch(inode, block, depth); + } + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = from; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, block, uspi->s_fpb); + } + } + free_data(&ctx, 0, 0); + } + if (IS_SYNC(inode) && ubh_buffer_dirty(ubh)) + ubh_sync_block(ubh); + ubh_brelse(ubh); +} + +static int ufs_alloc_lastblock(struct inode *inode, loff_t size) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +static void __ufs_truncate_blocks(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth2; + unsigned i; + struct ufs_buffer_head *ubh[3]; + void *p; + u64 block; + + if (!depth) + return; + + /* find the last non-zero in offsets[] */ + for (depth2 = depth - 1; depth2; depth2--) + if (offsets[depth2]) + break; + + mutex_lock(&ufsi->truncate_mutex); + if (depth == 1) { + ufs_trunc_direct(inode); + offsets[0] = UFS_IND_BLOCK; + } else { + /* get the blocks that should be partially emptied */ + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + for (i = 0; i < depth2; i++) { + offsets[i]++; /* next branch is fully freed */ + block = ufs_data_ptr_to_cpu(sb, p); + if (!block) + break; + ubh[i] = ubh_bread(sb, block, uspi->s_bsize); + if (!ubh[i]) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + break; + } + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + } + while (i--) + free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); + } + for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); + } + } + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); + mutex_unlock(&ufsi->truncate_mutex); +} + +static int ufs_truncate(struct inode *inode, loff_t size) +{ + int err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)size, + (unsigned long long)i_size_read(inode)); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode, size); + + if (err) + goto out; + + block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); + + truncate_setsize(inode, size); + + __ufs_truncate_blocks(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +void ufs_truncate_blocks(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ufs_truncate_blocks(inode); +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + error = ufs_truncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 250579a80d90..f6390eec02ca 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -94,22 +94,6 @@ #include "swab.h" #include "util.h" -void lock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - mutex_lock(&sbi->mutex); - sbi->mutex_owner = current; -} - -void unlock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - sbi->mutex_owner = NULL; - mutex_unlock(&sbi->mutex); -} - static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -694,7 +678,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) struct ufs_super_block_third * usb3; unsigned flags; - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -714,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) UFSD("EXIT\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -758,7 +740,6 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); - mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -801,7 +782,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); - mutex_init(&sbi->mutex); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); @@ -1257,7 +1237,6 @@ magic_found: return 0; failed: - mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,7 +1259,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned flags; sync_filesystem(sb); - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; @@ -1296,7 +1274,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { @@ -1304,14 +1281,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -1335,7 +1310,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1345,13 +1319,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EPERM; } sb->s_flags &= ~MS_RDONLY; @@ -1359,7 +1331,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -1391,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - lock_ufs(sb); - + mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1413,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1429,6 +1399,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + seqlock_init(&ei->meta_lock); + mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c deleted file mode 100644 index 21154704c168..000000000000 --- a/fs/ufs/truncate.c +++ /dev/null @@ -1,523 +0,0 @@ -/* - * linux/fs/ufs/truncate.c - * - * Copyright (C) 1998 - * Daniel Pirkl - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/truncate.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/truncate.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -/* - * Real random numbers for secure rm added 94/02/18 - * Idea from Pierre del Perugia - */ - -/* - * Adoptation to use page cache and UFS2 write support by - * Evgeniy Dushistov , 2006-2007 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ufs_fs.h" -#include "ufs.h" -#include "swab.h" -#include "util.h" - -/* - * Secure deletion currently doesn't work. It interacts very badly - * with buffers shared with memory mappings, and for that reason - * can't be done in the truncate() routines. It should instead be - * done separately in "release()" before calling the truncate routines - * that will release the actual file blocks. - * - * Linus - */ - -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) -#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) - - -static int ufs_trunc_direct(struct inode *inode) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; - unsigned frag_to_free, free_count; - unsigned i, tmp; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; - - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); - - ufs_free_fragments(inode, tmp + frag1, frag2); - mark_inode_dirty(inode); - frag_to_free = tmp + frag1; - -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - continue; - ufs_data_ptr_clear(uspi, p); - - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - mark_inode_dirty(inode); - } - - if (free_count > 0) - ufs_free_blocks (inode, frag_to_free, free_count); - - if (frag3 >= frag4) - goto next3; - - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - ufs_data_ptr_clear(uspi, p); - - ufs_free_fragments (inode, tmp, frag4); - mark_inode_dirty(inode); - next3: - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - - -static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head * ind_ubh; - void *ind; - u64 tmp, indirect_block, i, frag_to_free; - unsigned free_count; - int retry; - - UFSD("ENTER: ino %lu, offset %llu, p: %p\n", - inode->i_ino, (unsigned long long)offset, p); - - BUG_ON(!p); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (ind_ubh); - return 1; - } - if (!ind_ubh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; - for (i = indirect_block; i < uspi->s_apb; i++) { - ind = ubh_get_data_ptr(uspi, ind_ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - ufs_data_ptr_clear(uspi, ind); - ubh_mark_buffer_dirty(ind_ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - - mark_inode_dirty(inode); - } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, ind_ubh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks (inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(ind_ubh); - ind_ubh = NULL; - } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) - ubh_sync_block(ind_ubh); - ubh_brelse (ind_ubh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head *dind_bh; - u64 i, tmp, dindirect_block; - void *dind; - int retry = 0; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - dindirect_block = (DIRECT_BLOCK > offset) - ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (dind_bh); - return 1; - } - if (!dind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = dindirect_block ; i < uspi->s_apb ; i++) { - dind = ubh_get_data_ptr(uspi, dind_bh, i); - tmp = ufs_data_ptr_to_cpu(sb, dind); - if (!tmp) - continue; - retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); - ubh_mark_buffer_dirty(dind_bh); - } - - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, dind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(dind_bh); - dind_bh = NULL; - } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) - ubh_sync_block(dind_bh); - ubh_brelse (dind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_tindirect(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head * tind_bh; - u64 tindirect_block, tmp, i; - void *tind, *p; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - retry = 0; - - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; - - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); - if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) - return 0; - tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (tind_bh); - return 1; - } - if (!tind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = tindirect_block ; i < uspi->s_apb ; i++) { - tind = ubh_get_data_ptr(uspi, tind_bh, i); - retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + - uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); - ubh_mark_buffer_dirty(tind_bh); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, tind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(tind_bh); - tind_bh = NULL; - } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) - ubh_sync_block(tind_bh); - ubh_brelse (tind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - -static int ufs_alloc_lastblock(struct inode *inode) -{ - int err = 0; - struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned i, end; - sector_t lastfrag; - struct page *lastpage; - struct buffer_head *bh; - u64 phys64; - - lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - - if (!lastfrag) - goto out; - - lastfrag--; - - lastpage = ufs_get_locked_page(mapping, lastfrag >> - (PAGE_CACHE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; - - - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } - - if (lastfrag >= UFS_IND_FRAGMENT) { - end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; - phys64 = bh->b_blocknr + 1; - for (i = 0; i < end; ++i) { - bh = sb_getblk(sb, i + phys64); - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - } -out_unlock: - ufs_put_locked_page(lastpage); -out: - return err; -} - -int ufs_truncate(struct inode *inode, loff_t old_i_size) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - int retry, err = 0; - - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", - inode->i_ino, (unsigned long long)i_size_read(inode), - (unsigned long long)old_i_size); - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return -EPERM; - - err = ufs_alloc_lastblock(inode); - - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - - block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); - - while (1) { - retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_IND_BLOCK)); - retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_DIND_BLOCK)); - retry |= ufs_trunc_tindirect (inode); - if (!retry) - break; - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) - ufs_sync_inode (inode); - yield(); - } - - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ufsi->i_lastfrag = DIRECT_FRAGMENT; - mark_inode_dirty(inode); -out: - UFSD("EXIT: err %d\n", err); - return err; -} - -int ufs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - unsigned int ia_valid = attr->ia_valid; - int error; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { - loff_t old_i_size = inode->i_size; - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, attr->ia_size); - - lock_ufs(inode->i_sb); - error = ufs_truncate(inode, old_i_size); - unlock_ufs(inode->i_sb); - if (error) - return error; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations ufs_file_inode_operations = { - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2e31ea2e35a3..7da4aca868c0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,8 +24,6 @@ struct ufs_sb_info { unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; unsigned s_mount_opt; - struct mutex mutex; - struct task_struct *mutex_owner; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ @@ -46,6 +44,8 @@ struct ufs_inode_info { __u32 i_oeftflag; __u16 i_osync; __u64 i_lastfrag; + seqlock_t meta_lock; + struct mutex truncate_mutex; __u32 i_dir_start_lookup; struct inode vfs_inode; }; @@ -122,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; @@ -140,10 +140,6 @@ void ufs_mark_sb_dirty(struct super_block *sb); extern const struct inode_operations ufs_fast_symlink_inode_operations; extern const struct inode_operations ufs_symlink_inode_operations; -/* truncate.c */ -extern int ufs_truncate (struct inode *, loff_t); -extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; @@ -170,7 +166,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) return do_div(b, uspi->s_fpg); } -extern void lock_ufs(struct super_block *sb); -extern void unlock_ufs(struct super_block *sb); - #endif /* _UFS_UFS_H */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c77499bcbd7a..cc2a321f774b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } diff --git a/include/linux/fs.h b/include/linux/fs.h index dc634a55163b..b2f9b9c25e41 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -30,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -636,7 +637,7 @@ struct inode { unsigned long dirtied_time_when; struct hlist_node i_hash; - struct list_head i_wb_list; /* backing dev IO list */ + struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ @@ -1281,16 +1282,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1316,7 +1310,6 @@ struct super_block { #endif const struct xattr_handler **s_xattr; - struct list_head s_inodes; /* all inodes */ struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; @@ -1382,11 +1375,18 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + struct work_struct destroy_work; + + struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); @@ -1398,6 +1398,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to @@ -2614,7 +2619,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode)) + if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e0727d77feaf..533c4408529a 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -368,7 +368,7 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); -extern void fsnotify_unmount_inodes(struct list_head *list); +extern void fsnotify_unmount_inodes(struct super_block *sb); /* put here because inotify does some weird stuff when destroying watches */ extern void fsnotify_init_event(struct fsnotify_event *event, @@ -404,7 +404,7 @@ static inline u32 fsnotify_get_cookie(void) return 0; } -static inline void fsnotify_unmount_inodes(struct list_head *list) +static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ diff --git a/include/linux/list.h b/include/linux/list.h index feb773c76ee0..3e3e64a61002 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -672,6 +672,11 @@ static inline void hlist_add_fake(struct hlist_node *n) n->pprev = &n->next; } +static inline bool hlist_fake(struct hlist_node *h) +{ + return h->pprev == &h->next; +} + /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3e88c9a7d57f..834c4e52cb2d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,7 @@ struct percpu_rw_semaphore { }; extern void percpu_down_read(struct percpu_rw_semaphore *); +extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); @@ -31,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); __percpu_init_rwsem(brw, #brw, &rwsem_key); \ }) + +#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) + +static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_release(&sem->rw_sem.dep_map, 1, ip); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + sem->rw_sem.owner = NULL; +#endif +} + +static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); +} + #endif diff --git a/init/Kconfig b/init/Kconfig index 2c0e50ef554a..9cabd866b34b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -927,7 +927,6 @@ config NUMA_BALANCING_DEFAULT_ENABLED menuconfig CGROUPS bool "Control Group support" select KERNFS - select PERCPU_RWSEM help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 36942047ffc0..8e96f6cc2a4a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -24,6 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); diff --git a/lib/Kconfig b/lib/Kconfig index a16555281d53..8a49ff9d1502 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -53,9 +53,6 @@ config GENERIC_IO config STMP_DEVICE bool -config PERCPU_RWSEM - bool - config ARCH_USE_CMPXCHG_LOCKREF bool diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..ee8d7fd07be3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,13 +55,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) + list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; - list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 41e4581af7c5..aebc2dd6e649 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (error < 0) { p->bdev = NULL; - return -EINVAL; + return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); @@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; - int i; int prio; int error; union swap_header *swap_header; @@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; - - if (q == p || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) { - error = -EBUSY; - goto bad_swap; - } - } - inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) @@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + p->flags |= SWP_SOLIDSTATE; /* * select a random position to start with to help wear leveling @@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } - for_each_possible_cpu(i) { + for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; - cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } }