From ff896738be381efa6e50ba9a3b6cdc94f69ada42 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 17 Jun 2019 11:14:11 +0200
Subject: [PATCH 1/3] block: return from __bio_try_merge_page if merging
 occurred in the same page

We currently have an input same_page parameter to __bio_try_merge_page
to prohibit merging in the same page.  The rationale for that is that
some callers need to account for every page added to a bio.  Instead of
letting these callers call twice into the merge code to account for the
new vs existing page cases, just turn the parameter into an output one
that reports whether a merge in the same page occurred, and let them act
accordingly.

Signed-off-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/bio.c         | 26 +++++++++++---------------
 fs/iomap.c          | 12 ++++++++----
 fs/xfs/xfs_aops.c   | 11 ++++++++---
 include/linux/bio.h |  2 +-
 4 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 683cbb40f051..daa1c1ae72cd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -636,7 +636,7 @@ EXPORT_SYMBOL(bio_clone_fast);
 
 static inline bool page_is_mergeable(const struct bio_vec *bv,
 		struct page *page, unsigned int len, unsigned int off,
-		bool same_page)
+		bool *same_page)
 {
 	phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
 		bv->bv_offset + bv->bv_len - 1;
@@ -647,15 +647,9 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
 	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
 		return false;
 
-	if ((vec_end_addr & PAGE_MASK) != page_addr) {
-		if (same_page)
-			return false;
-		if (pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page)
-			return false;
-	}
-
-	WARN_ON_ONCE(same_page && (len + off) > PAGE_SIZE);
-
+	*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
+	if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page)
+		return false;
 	return true;
 }
 
@@ -701,6 +695,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
 		bool put_same_page)
 {
 	struct bio_vec *bvec;
+	bool same_page = false;
 
 	/*
 	 * cloned bio must not modify vec list
@@ -729,7 +724,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
 		if (bvec_gap_to_prev(q, bvec, offset))
 			return 0;
 
-		if (page_is_mergeable(bvec, page, len, offset, false) &&
+		if (page_is_mergeable(bvec, page, len, offset, &same_page) &&
 		    can_add_page_to_seg(q, bvec, page, len, offset)) {
 			bvec->bv_len += len;
 			goto done;
@@ -767,8 +762,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: start page to add
  * @len: length of the data to add
  * @off: offset of the data relative to @page
- * @same_page: if %true only merge if the new data is in the same physical
- *	page as the last segment of the bio.
+ * @same_page: return if the segment has been merged inside the same page
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -779,7 +773,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off, bool same_page)
+		unsigned int len, unsigned int off, bool *same_page)
 {
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return false;
@@ -837,7 +831,9 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 		 unsigned int len, unsigned int offset)
 {
-	if (!__bio_try_merge_page(bio, page, len, offset, false)) {
+	bool same_page = false;
+
+	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
 		if (bio_full(bio))
 			return 0;
 		__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index 23ef63fd1669..12654c2e78f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -287,7 +287,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
 	struct iomap_page *iop = iomap_page_create(inode, page);
-	bool is_contig = false;
+	bool same_page = false, is_contig = false;
 	loff_t orig_pos = pos;
 	unsigned poff, plen;
 	sector_t sector;
@@ -315,10 +315,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	 * Try to merge into a previous segment if we can.
 	 */
 	sector = iomap_sector(iomap, pos);
-	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-		if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
-			goto done;
+	if (ctx->bio && bio_end_sector(ctx->bio) == sector)
 		is_contig = true;
+
+	if (is_contig &&
+	    __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
+		if (!same_page && iop)
+			atomic_inc(&iop->read_count);
+		goto done;
 	}
 
 	/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a6f0f4761a37..8da5e6637771 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -758,6 +758,7 @@ xfs_add_to_ioend(
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 	unsigned		len = i_blocksize(inode);
 	unsigned		poff = offset & (PAGE_SIZE - 1);
+	bool			merged, same_page = false;
 	sector_t		sector;
 
 	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
@@ -774,9 +775,13 @@ xfs_add_to_ioend(
 				wpc->imap.br_state, offset, bdev, sector);
 	}
 
-	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
-		if (iop)
-			atomic_inc(&iop->write_count);
+	merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+			&same_page);
+
+	if (iop && !same_page)
+		atomic_inc(&iop->write_count);
+
+	if (!merged) {
 		if (bio_full(wpc->ioend->io_bio))
 			xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 		bio_add_page(wpc->ioend->io_bio, page, len, poff);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0f23b5682640..f87abaa898f0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -423,7 +423,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off, bool same_page);
+		unsigned int len, unsigned int off, bool *same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
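The calling convention this patch establishes is easiest to see written
out as a short sketch.  example_add_page() and its pages_added argument
are invented here for illustration, but the logic mirrors the iomap and
xfs callers above: check the same_page output after a successful merge
instead of calling into the merge code twice.

static int example_add_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off, unsigned int *pages_added)
{
	bool same_page = false;

	if (__bio_try_merge_page(bio, page, len, off, &same_page)) {
		/* merged; a new page entered the bio only if !same_page */
		if (!same_page)
			(*pages_added)++;
		return 0;
	}
	if (bio_full(bio))
		return -EINVAL;
	__bio_add_page(bio, page, len, off);
	(*pages_added)++;
	return 0;
}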
From 4569180495600ac59f5cd27f67242a6cb51254f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 17 Jun 2019 11:14:12 +0200
Subject: [PATCH 2/3] block: fix page leak when merging to same page

When multiple iovecs reference the same page, each get_user_pages() call
will add a reference to the page.  But once we've created the bio that
information gets lost and only a single reference will be dropped after
I/O completion.

Use the same_page information returned from __bio_try_merge_page to
drop additional references to pages that were already present in the
bio.

Based on a patch from Ming Lei.

Link: https://lkml.org/lkml/2019/4/23/64
Fixes: 576ed913 ("block: use bio_add_page in bio_iov_iter_get_pages")
Reported-by: David Gibson
Signed-off-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/bio.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index daa1c1ae72cd..ce797d73bb43 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -896,6 +896,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
+	bool same_page = false;
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
@@ -916,8 +917,15 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		struct page *page = pages[i];
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len))
-			return -EINVAL;
+
+		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+			if (same_page)
+				put_page(page);
+		} else {
+			if (WARN_ON_ONCE(bio_full(bio)))
+				return -EINVAL;
+			__bio_add_page(bio, page, len, offset);
+		}
 		offset = 0;
 	}
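The leaked reference can be pictured with a hypothetical direct I/O
request whose two iovecs land in one page (the buffer layout and sizes
below are invented for illustration):

#include <sys/uio.h>

/*
 * One page submitted twice: get_user_pages() takes one reference per
 * iovec, but the same-page merge leaves a single bvec in the bio, so
 * completion drops only one of the two references.  The put_page()
 * added above releases the duplicate at merge time.
 */
static char buf[1024] __attribute__((aligned(4096)));	/* one page */

static const struct iovec iov[2] = {
	{ .iov_base = buf,       .iov_len = 512 },
	{ .iov_base = buf + 512, .iov_len = 512 },
};

/* preadv(fd, iov, 2, 0) on an O_DIRECT file descriptor would reach
 * __bio_iov_iter_get_pages() and pin the page twice. */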
From 9642fa73d073527b0cbc337cc17a47d545d82cd2 Mon Sep 17 00:00:00 2001
From: Mariusz Tkaczyk
Date: Thu, 13 Jun 2019 16:11:41 +0200
Subject: [PATCH 3/3] md: fix for divide error in status_resync

Stopping an external metadata array during resync/recovery causes a
loop of retries: reconstruction is interrupted and restarted until it
hits a good moment to stop completely.  During these retries
curr_mark_cnt can be small, especially on HDD drives, so the
subtraction can yield a negative value, which is then cast to an
unsigned type without any check.  As a result the status bar in
/proc/mdstat looks strange while stopping (it jumps between 0% and
99%).

The real problem occurs after commit 72deb455b5ec ("block: remove
CONFIG_LBDAF"): the sector_div() macro was changed so that the divisor
is cast to a 32-bit value.  For db = -8 the divisor (db/32+1) becomes
0, triggering a divide error.

Check that db can actually be computed without underflowing, and
replace the sector_div() macro with the div64_u64() inline function.

Signed-off-by: Mariusz Tkaczyk
Signed-off-by: Song Liu
---
 drivers/md/md.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 04f4f131f9d6..9801d540fea1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7607,9 +7607,9 @@ static void status_unused(struct seq_file *seq)
 static int status_resync(struct seq_file *seq, struct mddev *mddev)
 {
 	sector_t max_sectors, resync, res;
-	unsigned long dt, db;
-	sector_t rt;
-	int scale;
+	unsigned long dt, db = 0;
+	sector_t rt, curr_mark_cnt, resync_mark_cnt;
+	int scale, recovery_active;
 	unsigned int per_milli;
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7698,22 +7698,30 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 	 * db: blocks written from mark until now
	 * rt: remaining time
	 *
-	 * rt is a sector_t, so could be 32bit or 64bit.
-	 * So we divide before multiply in case it is 32bit and close
-	 * to the limit.
-	 * We scale the divisor (db) by 32 to avoid losing precision
-	 * near the end of resync when the number of remaining sectors
-	 * is close to 'db'.
-	 * We then divide rt by 32 after multiplying by db to compensate.
-	 * The '+1' avoids division by zero if db is very small.
+	 * rt is a sector_t, which is always 64bit now. We are keeping
+	 * the original algorithm, but it is not really necessary.
+	 *
+	 * Original algorithm:
+	 *   So we divide before multiply in case it is 32bit and close
+	 *   to the limit.
+	 *   We scale the divisor (db) by 32 to avoid losing precision
+	 *   near the end of resync when the number of remaining sectors
+	 *   is close to 'db'.
+	 *   We then divide rt by 32 after multiplying by db to compensate.
+	 *   The '+1' avoids division by zero if db is very small.
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
-	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
-		- mddev->resync_mark_cnt;
+
+	curr_mark_cnt = mddev->curr_mark_cnt;
+	recovery_active = atomic_read(&mddev->recovery_active);
+	resync_mark_cnt = mddev->resync_mark_cnt;
+
+	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
	rt = max_sectors - resync;	/* number of remaining sectors */
-	sector_div(rt, db/32+1);
+	rt = div64_u64(rt, db/32+1);
	rt *= dt;
	rt >>= 5;
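The arithmetic behind the divide error can be reproduced in a
standalone userspace sketch (not kernel code; the u32 cast below only
mimics what sector_div() does to its divisor since commit
72deb455b5ec):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t db = (uint64_t)-8;		/* underflowed subtraction */
	uint64_t divisor = db / 32 + 1;		/* 0x0800000000000000 */

	/* sector_div() truncates the divisor to 32 bits: 0 here, which
	 * means a divide error in the kernel */
	printf("u32 divisor: %u\n", (uint32_t)divisor);

	/* with the guard above, db stays 0 and div64_u64() divides by
	 * 0/32 + 1 == 1, keeping the full 64 bits */
	printf("u64 divisor: %llu\n", (unsigned long long)divisor);
	return 0;
}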