diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 7e06e65586d4..4a0a7469fdd7 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -343,3 +343,4 @@ Version History
 1.11.0  Fix table line argument order
 	(wrong raid10_copies/raid10_format sequence)
 1.11.1  Add raid4/5/6 journal write-back support via journal_mode option
+1.12.1  fix for MD deadlock between mddev_suspend() and md_write_start() available
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index ce9e563e6e1d..938eb4868f7f 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -278,6 +278,12 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc)
 }
 EXPORT_SYMBOL_GPL(dax_write_cache);
 
+bool dax_write_cache_enabled(struct dax_device *dax_dev)
+{
+	return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 850ff6c67994..44f4a8ac95bd 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1258,8 +1258,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
  */
 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 {
-	blk_status_t a;
-	int f;
+	int a, f;
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 1b224aa9cf15..3acce09bba35 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1587,16 +1587,18 @@ retry:
 	if (likely(ic->mode == 'J')) {
 		if (dio->write) {
 			unsigned next_entry, i, pos;
-			unsigned ws, we;
+			unsigned ws, we, range_sectors;
 
-			dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors);
+			dio->range.n_sectors = min(dio->range.n_sectors,
+						   ic->free_sectors << ic->sb->log2_sectors_per_block);
 			if (unlikely(!dio->range.n_sectors))
 				goto sleep;
-			ic->free_sectors -= dio->range.n_sectors;
+			range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
+			ic->free_sectors -= range_sectors;
 			journal_section = ic->free_section;
 			journal_entry = ic->free_section_entry;
 
-			next_entry = ic->free_section_entry + dio->range.n_sectors;
+			next_entry = ic->free_section_entry + range_sectors;
 			ic->free_section_entry = next_entry % ic->journal_section_entries;
 			ic->free_section += next_entry / ic->journal_section_entries;
 			ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
@@ -1727,6 +1729,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
 		wraparound_section(ic, &ic->free_section);
 		ic->n_uncommitted_sections++;
 	}
+	WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+		(ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
 }
 
 static void integrity_commit(struct work_struct *w)
@@ -1821,6 +1825,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 {
 	unsigned i, j, n;
 	struct journal_completion comp;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
 
 	comp.ic = ic;
 	comp.in_flight = (atomic_t)ATOMIC_INIT(1);
@@ -1945,6 +1952,8 @@ skip_io:
 
 	dm_bufio_write_dirty_buffers_async(ic->bufio);
 
+	blk_finish_plug(&plug);
+
 	complete_journal_op(&comp);
 	wait_for_completion_io(&comp.comp);
 
@@ -3019,6 +3028,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		ti->error = "Block size doesn't match the information in superblock";
 		goto bad;
 	}
+	if (!le32_to_cpu(ic->sb->journal_sections)) {
+		r = -EINVAL;
+		ti->error = "Corrupted superblock, journal_sections is 0";
+		goto bad;
+	}
 	/* make sure that ti->max_io_len doesn't overflow */
 	if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
 	    ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2e10c2f13a34..5bfe285ea9d1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -208,6 +208,7 @@ struct raid_dev {
 #define RT_FLAG_RS_BITMAP_LOADED	2
 #define RT_FLAG_UPDATE_SBS		3
 #define RT_FLAG_RESHAPE_RS		4
+#define RT_FLAG_RS_SUSPENDED		5
 
 /* Array elements of 64 bit needed for rebuild/failed disk bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -564,9 +565,10 @@ static const char *raid10_md_layout_to_format(int layout)
 	if (__raid10_near_copies(layout) > 1)
 		return "near";
 
-	WARN_ON(__raid10_far_copies(layout) < 2);
+	if (__raid10_far_copies(layout) > 1)
+		return "far";
 
-	return "far";
+	return "unknown";
 }
 
 /* Return md raid10 algorithm for @name */
@@ -2540,11 +2542,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	if (!freshest)
 		return 0;
 
-	if (validate_raid_redundancy(rs)) {
-		rs->ti->error = "Insufficient redundancy to activate array";
-		return -EINVAL;
-	}
-
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
@@ -2553,6 +2550,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	if (super_validate(rs, freshest))
 		return -EINVAL;
 
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
+
 	rdev_for_each(rdev, mddev)
 		if (!test_bit(Journal, &rdev->flags) &&
 		    rdev != freshest &&
@@ -3168,6 +3170,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	mddev_suspend(&rs->md);
+	set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
 
 	/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
 	if (rs_is_raid456(rs)) {
@@ -3625,7 +3628,7 @@ static void raid_postsuspend(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	if (!rs->md.suspended)
+	if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
 		mddev_suspend(&rs->md);
 
 	rs->md.ro = 1;
@@ -3759,7 +3762,7 @@ static int rs_start_reshape(struct raid_set *rs)
 		return r;
 
 	/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
-	if (mddev->suspended)
+	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
 		mddev_resume(mddev);
 
 	/*
@@ -3786,8 +3789,8 @@ static int rs_start_reshape(struct raid_set *rs)
 	}
 
 	/* Suspend because a resume will happen in raid_resume() */
-	if (!mddev->suspended)
-		mddev_suspend(mddev);
+	set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
+	mddev_suspend(mddev);
 
 	/*
 	 * Now reshape got set up, update superblocks to
@@ -3883,13 +3886,13 @@ static void raid_resume(struct dm_target *ti)
 	if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 
-	if (mddev->suspended)
+	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
 		mddev_resume(mddev);
 }
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 11, 1},
+	.version = {1, 12, 1},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a39bcd9b982a..28a4071cdf85 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -20,6 +20,7 @@
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
 #include <linux/mount.h>
+#include <linux/dax.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 	return false;
 }
 
+static int device_dax_write_cache_enabled(struct dm_target *ti,
+					  struct dm_dev *dev, sector_t start,
+					  sector_t len, void *data)
+{
+	struct dax_device *dax_dev = dev->dax_dev;
+
+	if (!dax_dev)
+		return false;
+
+	if (dax_write_cache_enabled(dax_dev))
+		return true;
+	return false;
+}
+
+static int dm_table_supports_dax_write_cache(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i;
+
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		ti = dm_table_get_target(t, i);
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti,
+				device_dax_write_cache_enabled, NULL))
+			return true;
+	}
+
+	return false;
+}
+
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 			    sector_t start, sector_t len, void *data)
 {
@@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 	blk_queue_write_cache(q, wc, fua);
 
+	if (dm_table_supports_dax_write_cache(t))
+		dax_write_cache(t->md->dax_dev, true);
+
 	/* Ensure that all underlying devices are non-rotational. */
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 504ba3fa328b..e13f90832b6b 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
 {
 	unsigned n;
 
-	if (!fio->rs) {
-		fio->rs = mempool_alloc(v->fec->rs_pool, 0);
-		if (unlikely(!fio->rs)) {
-			DMERR("failed to allocate RS");
-			return -ENOMEM;
-		}
-	}
+	if (!fio->rs)
+		fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
 
 	fec_for_each_prealloc_buffer(n) {
 		if (fio->bufs[n])
 			continue;
 
-		fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
+		fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
 		if (unlikely(!fio->bufs[n])) {
 			DMERR("failed to allocate FEC buffer");
 			return -ENOMEM;
@@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
 		if (fio->bufs[n])
 			continue;
 
-		fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO);
+		fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
 		/* we can manage with even one buffer if necessary */
 		if (unlikely(!fio->bufs[n]))
 			break;
 	}
 	fio->nbufs = n;
 
-	if (!fio->output) {
+	if (!fio->output)
 		fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
-		if (!fio->output) {
-			DMERR("failed to allocate FEC page");
-			return -ENOMEM;
-		}
-	}
-
 	return 0;
 }
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 884ff7c170a0..a4fa2ada6883 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 
 	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
 	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 
 	return ret;
 }
@@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 
 	/* Flush drive cache (this will also sync data) */
 	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 
 	return ret;
 }
@@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
 
 	/* If there are no dirty metadata blocks, just flush the device cache */
 	if (list_empty(&write_list)) {
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 		goto out;
 	}
 
@@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
 			(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
 	}
 
-	page = alloc_page(GFP_KERNEL);
+	page = alloc_page(GFP_NOIO);
 	if (!page)
 		return -ENOMEM;
 
@@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 
 	/* Get zone information from disk */
 	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
-				  &blkz, &nr_blkz, GFP_KERNEL);
+				  &blkz, &nr_blkz, GFP_NOIO);
 	if (ret) {
 		dmz_dev_err(zmd->dev, "Get zone %u report failed",
 			    dmz_id(zmd, zone));
@@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 
 		ret = blkdev_reset_zones(dev->bdev,
 					 dmz_start_sect(zmd, zone),
-					 dev->zone_nr_sectors, GFP_KERNEL);
+					 dev->zone_nr_sectors, GFP_NOIO);
 		if (ret) {
 			dmz_dev_err(dev, "Reset zone %u failed %d",
 				    dmz_id(zmd, zone), ret);
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 05c0a126f5c8..44a119e12f1a 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
 	nr_blocks = block - wp_block;
 	ret = blkdev_issue_zeroout(zrc->dev->bdev,
 				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
-				   dmz_blk2sect(nr_blocks), GFP_NOFS, false);
+				   dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
 	if (ret) {
 		dmz_dev_err(zrc->dev,
 			    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 2b538fa817f4..b08bbbd4d902 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
 		int ret;
 
 		/* Create a new chunk work */
-		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS);
+		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
 		if (!cw)
 			goto out;
 
@@ -588,7 +588,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
 
 	bio->bi_bdev = dev->bdev;
 
-	if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE))
+	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
 		return DM_MAPIO_REMAPPED;
 
 	/* The BIO should be block aligned */
@@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
 	bioctx->status = BLK_STS_OK;
 
 	/* Set the BIO pending in the flush list */
-	if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) {
+	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
 		spin_lock(&dmz->flush_lock);
 		bio_list_add(&dmz->flush_list, bio);
 		spin_unlock(&dmz->flush_lock);
@@ -785,7 +785,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	/* Chunk BIO work */
 	mutex_init(&dmz->chunk_lock);
-	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS);
+	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
 	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
 					0, dev->name);
 	if (!dmz->chunk_wq) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 794811875732..df97b7af7e2c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -87,6 +87,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t size);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
+bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
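
Note (illustration only, not part of the diff above): dax_write_cache_enabled() is the query helper added in drivers/dax/super.c and declared in include/linux/dax.h, and dm-table.c uses it to propagate an underlying DAX device's write-cache setting to the top-level device-mapper dax_device. A minimal sketch of the same pattern for a hypothetical stacked driver follows; my_propagate_wc() and its parameters are made up for the example, while dax_write_cache() and dax_write_cache_enabled() are the interfaces touched by this patch.

#include <linux/dax.h>

/*
 * Hypothetical helper (not from the patch): if the lower DAX device
 * advertises a write cache, enable the write-cache flag on the upper
 * (stacked) DAX device too, so that flushing is not silently skipped.
 */
static void my_propagate_wc(struct dax_device *upper, struct dax_device *lower)
{
	if (lower && dax_write_cache_enabled(lower))
		dax_write_cache(upper, true);
}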