From 642be6ec218b956fbae88304449720f76ba0d578 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 25 Feb 2008 17:20:46 -0500 Subject: [PATCH 01/14] Remove incorrect BKL comments in ext4 Signed-off-by: Andi Kleen Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 2 +- fs/ext4/inode.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 33888bb58144..2c23bade9aa6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,7 @@ const struct file_operations ext4_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .fsync = ext4_sync_file, /* BKL held */ + .fsync = ext4_sync_file, .release = ext4_release_dir, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7dd9b50d5ebc..d3c6f58a9def 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -768,7 +768,6 @@ err_out: * * `handle' can be NULL if create == 0. * - * The BKL may not be held on entry here. Be sure to take it early. * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. From 55bd725aa3a83b3935988f37275b5a80e10d4169 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 15 Feb 2008 12:47:21 -0500 Subject: [PATCH 02/14] ext4: Fix locking hierarchy violation in ext4_fallocate() ext4_fallocate() was trying to acquire i_data_sem outside of jbd2_start_transaction/jbd2_journal_stop, which violates ext4's locking hierarchy. So we take i_mutex to prevent writes and truncates during the complete fallocate operation, and use ext4_get_block_wrap() which acquires and releases i_data_sem for each block allocation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc7081f1fbe8..e856f660fc30 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2623,7 +2623,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) * modify 1 super block, 1 block bitmap and 1 group descriptor. */ credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; - down_write((&EXT4_I(inode)->i_data_sem)); + mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { block = block + ret; @@ -2634,7 +2634,7 @@ retry: break; } - ret = ext4_ext_get_blocks(handle, inode, block, + ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); WARN_ON(ret <= 0); @@ -2680,7 +2680,6 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - up_write((&EXT4_I(inode)->i_data_sem)); /* * Time to update the file size. * Update only when preallocation was requested beyond the file size. @@ -2692,21 +2691,18 @@ retry: * if no error, we assume preallocation succeeded * completely */ - mutex_lock(&inode->i_mutex); i_size_write(inode, offset + len); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } else if (ret < 0 && nblocks) { /* Handle partial allocation scenario */ loff_t newsize; - mutex_lock(&inode->i_mutex); newsize = (nblocks << blkbits) + i_size_read(inode); i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } } + mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } From b73fce69ecb091a178ef9286027c370a63eb25aa Mon Sep 17 00:00:00 2001 From: Valerie Clement Date: Fri, 15 Feb 2008 13:48:51 -0500 Subject: [PATCH 03/14] ext4: Fix kernel BUG at fs/ext4/mballoc.c:910! With the flex_bg feature enabled, a large file creation oopses the kernel. The BUG_ON is: BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb)); As the allocation of the bitmaps and the inode table can be done outside the block group with flex_bg, this allows to allocate up to EXT4_BLOCKS_PER_GROUP blocks in a group. This patch fixes the oops. Signed-off-by: Valerie Clement Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dd0fcfcb35ce..2121184a3fa5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -906,7 +906,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, unsigned short chunk; unsigned short border; - BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; From 4cdeed861b5f797b3fa661eb331a6bd6ad669c6a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 22 Feb 2008 06:17:31 -0500 Subject: [PATCH 04/14] ext4: Don't leave behind a half-created inode if ext4_mkdir() fails If ext4_mkdir() fails to allocate the initial block for the directory, don't leave behind a half-created directory inode with the link count left at one. This was caused by an inappropriate call to ext4_dec_count(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a9347fb43bcc..fffd0807a01b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1804,12 +1804,8 @@ retry: inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext4_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - ext4_dec_count(handle, inode); /* is this nlink == 0? */ - ext4_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!dir_block) + goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); ext4_journal_get_write_access(handle, dir_block); de = (struct ext4_dir_entry_2 *) dir_block->b_data; @@ -1832,7 +1828,8 @@ retry: ext4_mark_inode_dirty(handle, inode); err = ext4_add_entry (handle, dentry, inode); if (err) { - inode->i_nlink = 0; +out_clear_inode: + clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; From b35905c16ad6428551eb9e49525011bd2700cf56 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Feb 2008 16:54:37 -0500 Subject: [PATCH 05/14] ext4: Fix memory and buffer head leak in callers to ext4_ext_find_extent() The path variable returned via ext4_ext_find_extent is a kmalloc variable and needs to be freeded. It also contains a reference to buffer_head which needs to be dropped. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- fs/ext4/migrate.c | 5 +++++ include/linux/ext4_fs_extents.h | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e856f660fc30..995ac16102a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -349,7 +349,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) #define ext4_ext_show_leaf(inode,path) #endif -static void ext4_ext_drop_refs(struct ext4_ext_path *path) +void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth = path->p_depth; int i; @@ -2200,10 +2200,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newdepth = ext_depth(inode); if (newdepth != depth) { depth = newdepth; - path = ext4_ext_find_extent(inode, iblock, NULL); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } eh = path[depth].p_hdr; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 8c6c685b9d22..5c1e27de7755 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -43,6 +43,7 @@ static int finish_range(handle_t *handle, struct inode *inode, if (IS_ERR(path)) { retval = PTR_ERR(path); + path = NULL; goto err_out; } @@ -74,6 +75,10 @@ static int finish_range(handle_t *handle, struct inode *inode, } retval = ext4_ext_insert_extent(handle, inode, path, &newext); err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } lb->first_pblock = 0; return retval; } diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h index 697da4bce6c5..1285c583b2d8 100644 --- a/include/linux/ext4_fs_extents.h +++ b/include/linux/ext4_fs_extents.h @@ -227,5 +227,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); #endif /* _LINUX_EXT4_EXTENTS */ From 9df5643ad135c7f8c02d3b69020de4ec910f9fc0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 22 Feb 2008 06:17:31 -0500 Subject: [PATCH 06/14] ext4: Get journal write access before modifying the extent tree When the user was writing into an unitialized extent, ext4_ext_convert_to_initialize() was not requesting journal write access before it started to modify the extent tree. Fix this oversight. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 995ac16102a9..c4d6f19faf37 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2168,6 +2168,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newblock = iblock - ee_block + ext_pblock(ex); ex2 = ex; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ if (iblock > ee_block) { ex1 = ex; @@ -2210,6 +2214,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; } allocated = max_blocks; } @@ -2230,9 +2238,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex2->ee_len = cpu_to_le16(allocated); if (ex2 != ex) goto insert; - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; /* * New (initialized) extent starts from the first block * in the current extent. i.e., ex2 == ex From e56eb6590693a5a340e8f596db2768a6e1b9e236 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 15 Feb 2008 13:48:21 -0500 Subject: [PATCH 07/14] ext4: Don't claim block from group which has corrupt bitmap In ext4_mb_complex_scan_group, if the extent length of the newly found extentet is greater than than the total free blocks counted in group info, break without claiming the block. Document different ext4_error usage, explaining the state with which we continue if we mount with errors=continue Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2121184a3fa5..6968c53e62ad 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -967,6 +967,10 @@ static void ext4_mb_generate_buddy(struct super_block *sb, ext4_error(sb, __FUNCTION__, "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", group, free, grp->bb_free); + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ grp->bb_free = free; } @@ -1822,7 +1826,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, EXT4_BLOCKS_PER_GROUP(sb), i); if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { /* - * IF we corrupt the bitmap we won't find any + * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ @@ -1838,6 +1842,12 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, ext4_error(sb, __FUNCTION__, "%d free blocks as per " "group info. But got %d blocks\n", free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. + */ + break; } ext4_mb_measure_extent(ac, &ex, e4b); @@ -3771,6 +3781,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, (unsigned long) pa->pa_len); ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ } atomic_add(free, &sbi->s_mb_discarded); if (ac) From 74d3487fc8aa58cec16dff7239dea1ca59bdab0e Mon Sep 17 00:00:00 2001 From: Valerie Clement Date: Fri, 15 Feb 2008 13:43:07 -0500 Subject: [PATCH 08/14] ext4: modify block allocation algorithm for the last group When a directory inode is allocated in the last group and the last group contains less than s_blocks_per_group blocks, the initial block allocated for the directory is not always allocated in the same group as the directory inode, but in one of the first groups of the filesystem (group 1 for example). Depending on the current process's pid, ext4_find_near() and ext4_ext_find_goal() can return a block number greater than the maximum blocks count in the filesystem and in that case the block will be not allocated in the same group as the inode. The following patch fixes the problem. Should the modification also be done in ext2/3 code? Signed-off-by: Valerie Clement Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 8 +++++++- fs/ext4/inode.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c4d6f19faf37..8a59f7ba30e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -148,6 +148,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; int depth; @@ -169,8 +170,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, /* OK. use inode's group */ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); - colour = (current->pid % 16) * + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour + block; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3c6f58a9def..34f3eb615fd5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -403,6 +403,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; __le32 *p; ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; /* Try to find previous block */ @@ -420,8 +421,13 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * into the same cylinder group then. */ bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); - colour = (current->pid % 16) * + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } From 825f1481ead4ce40671089bae7412ac3519e8caa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 15 Feb 2008 15:00:38 -0500 Subject: [PATCH 09/14] ext4: Don't use ext4_dec_count() if not needed The ext4_dec_count() function is only needed when dropping the i_nlink count on inodes which are (or which could be) directories. If we *know* that the inode in question can't possibly be a directory, use drop_nlink or clear_nlink() if we know i_nlink is 1. Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fffd0807a01b..5a79c6b6dc69 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2161,7 +2161,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) dir->i_ctime = dir->i_mtime = ext4_current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - ext4_dec_count(handle, inode); + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); @@ -2211,7 +2211,7 @@ retry: err = __page_symlink(inode, symname, l, mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { - ext4_dec_count(handle, inode); + clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2404,7 +2404,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, - * ext3_dec_count() won't work for many-linked dirs */ + * ext4_dec_count() won't work for many-linked dirs */ new_inode->i_nlink = 0; } else { ext4_inc_count(handle, new_dir); From f5ab0d1f8f7df937778c60c3da6f4ef939a54a7b Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 25 Feb 2008 15:29:55 -0500 Subject: [PATCH 10/14] ext4: Fix BUG when writing to an unitialized extent This patch fixes a bug when writing to preallocated but uninitialized blocks, which resulted in a BUG in fs/buffer.c saying that the buffer is not mapped. When writing to a file, ext4_get_block_wrap() is called with create=1 in order to request that blocks be allocated if necessary. It currently calls ext4_get_blocks() with create=0 in order to do a lookup first. If the inode contains an unitialized data block, the buffer head is left unampped, which ext4_get_blocks_wrap() returns, causing the BUG. We fix this by checking to see if the buffer head is unmapped, and if so, we make sure the the buffer head is mapped by calling ext4_ext_get_blocks with create=1. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++++++++ fs/ext4/inode.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8a59f7ba30e6..bcf5d040e328 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2287,9 +2287,22 @@ out: } /* + * Block allocation/map/preallocation routine for extents based files + * + * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 34f3eb615fd5..945cbf6cb1fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -908,11 +908,38 @@ out: */ #define DIO_CREDITS 25 + +/* + * + * + * ext4_ext4 get_block() wrapper function + * It will do a look up first, and returns if the blocks already mapped. + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_get_blocks(), + * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that casem, buffer head is unmapped + * + * It returns the error in case of allocation failure. + */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, int extend_disksize) { int retval; + + clear_buffer_mapped(bh); + /* * Try to see if we can get the block without requesting * for new file system block. @@ -926,12 +953,26 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, inode, block, max_blocks, bh, 0, 0); } up_read((&EXT4_I(inode)->i_data_sem)); - if (!create || (retval > 0)) + + /* If it is only a block(s) look up */ + if (!create) return retval; /* - * We need to allocate new blocks which will result - * in i_data update + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns th create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && buffer_mapped(bh)) + return retval; + + /* + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); /* From 2c98615d3b64ce7888cd46cc668023f456daf287 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Feb 2008 15:41:35 -0500 Subject: [PATCH 11/14] ext4: Don't mark filesystem error if fallocate fails If we fail to allocate blocks don't call ext4_error. Also don't hide errors from ext4_get_blocks_wrap Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bcf5d040e328..9ae6e67090cd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2661,13 +2661,14 @@ retry: ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); - WARN_ON(ret <= 0); if (ret <= 0) { - ext4_error(inode->i_sb, "ext4_fallocate", - "ext4_ext_get_blocks returned error: " - "inode#%lu, block=%u, max_blocks=%lu", +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%lu", __func__, inode->i_ino, block, max_blocks); - ret = -EIO; +#endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); break; From 42bf0383d1e09dd1b38f3debb13a76b2f87634b3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Feb 2008 16:38:03 -0500 Subject: [PATCH 12/14] ext4: set EXT4_EXTENTS_FL only for directory and regular files In addition, don't inherit EXT4_EXTENTS_FL from parent directory. If we have a directory with extent flag set and later mount the file system with -o noextents, the files created in that directory will also have extent flag set but we would not have called ext4_ext_tree_init for them. This will cause error later when we are verifying the extent header Signed-off-by: Aneesh Kumar K.V Acked-off-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 22 +++++++++++++++------- fs/ext4/namei.c | 1 - 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index da18a74b966a..8036b9b5376b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -702,7 +702,12 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL; + /* + * Don't inherit extent flag from directory. We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified + */ + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); if (S_ISLNK(mode)) ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); /* dirsync only applies to directories */ @@ -745,12 +750,15 @@ got: goto fail_free_drop; } if (test_opt(sb, EXTENTS)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; - ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail; + /* set extent flag only for directory and file */ + if (S_ISDIR(mode) || S_ISREG(mode)) { + EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_ext_tree_init(handle, inode); + err = ext4_update_incompat_feature(handle, sb, + EXT4_FEATURE_INCOMPAT_EXTENTS); + if (err) + goto fail; + } } ext4_debug("allocating inode %lu\n", inode->i_ino); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a79c6b6dc69..28aa2ed4297e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2220,7 +2220,6 @@ retry: inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char*)&EXT4_I(inode)->i_data,symname,l); inode->i_size = l-1; - EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); From ffad0a44b7216d0f079dcf95a351082099d1e5fb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 23 Feb 2008 01:38:34 -0500 Subject: [PATCH 13/14] ext4: ext4_find_next_zero_bit needs an aligned address on some arch ext4_find_next_zero_bit and ext4_find_next_bit needs a long aligned address on x8_64. Add mb_find_next_zero_bit and mb_find_next_bit and use them in the mballoc. Fix: https://bugzilla.redhat.com/show_bug.cgi?id=433286 Eric Sandeen debugged the problem and suggested the fix. Signed-off-by: Aneesh Kumar K.V Acked-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 62 ++++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6968c53e62ad..ef97f19c2f9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -627,21 +627,19 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, return block; } +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ #if BITS_PER_LONG == 64 -#define mb_correct_addr_and_bit(bit, addr) \ -{ \ - bit += ((unsigned long) addr & 7UL) << 3; \ - addr = (void *) ((unsigned long) addr & ~7UL); \ -} + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 -#define mb_correct_addr_and_bit(bit, addr) \ -{ \ - bit += ((unsigned long) addr & 3UL) << 3; \ - addr = (void *) ((unsigned long) addr & ~3UL); \ -} + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif + return addr; +} static inline int mb_test_bit(int bit, void *addr) { @@ -649,34 +647,54 @@ static inline int mb_test_bit(int bit, void *addr) * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit_atomic(lock, bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit_atomic(lock, bit, addr); } +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0; + addr = mb_correct_addr_and_bit(&fix, addr); + max += fix; + start += fix; + + return ext4_find_next_zero_bit(addr, max, start) - fix; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0; + addr = mb_correct_addr_and_bit(&fix, addr); + max += fix; + start += fix; + + return ext4_find_next_bit(addr, max, start) - fix; +} + static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; @@ -946,12 +964,12 @@ static void ext4_mb_generate_buddy(struct super_block *sb, /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ - i = ext4_find_next_zero_bit(bitmap, max, 0); + i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; - i = ext4_find_next_bit(bitmap, max, i); + i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) @@ -959,7 +977,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, else grp->bb_counters[0]++; if (i < max) - i = ext4_find_next_zero_bit(bitmap, max, i); + i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; @@ -1782,7 +1800,7 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, buddy = mb_find_buddy(e4b, i, &max); BUG_ON(buddy == NULL); - k = ext4_find_next_zero_bit(buddy, max, 0); + k = mb_find_next_zero_bit(buddy, max, 0); BUG_ON(k >= max); ac->ac_found++; @@ -1822,7 +1840,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { - i = ext4_find_next_zero_bit(bitmap, + i = mb_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i); if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { /* @@ -3750,10 +3768,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, } while (bit < end) { - bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit); + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; - next = ext4_find_next_bit(bitmap_bh->b_data, end, bit); + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); if (next > end) next = end; start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + From 5606bf5d0cbfbc3dfa78793a3793c43dd045fb1b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 25 Feb 2008 15:37:42 -0500 Subject: [PATCH 14/14] ext4: add missing ext4_journal_stop() Add missing ext4_journal_stop() in error handling. Signed-off-by: Akinobu Mita Signed-off-by: "Theodore Ts'o" Cc: Stephen Tweedie Cc: adilger@clusterfs.com Cc: Andrew Morton Cc: Mingming Cao --- fs/ext4/resize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9477a2bd6ff2..e29efa0f9d62 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1037,6 +1037,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); unlock_super(sb); + ext4_journal_stop(handle); err = -EBUSY; goto exit_put; }