diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ff1f87bf26e8..3b3e1de21c9c 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
 	      <offset>[<#opt_params> <opt_params>]
 
 <cipher>
-    Encryption cipher and an optional IV generation mode.
-    (In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
-    Examples:
-       des
-       aes-cbc-essiv:sha256
-       twofish-ecb
+    Encryption cipher, encryption mode and Initial Vector (IV) generator.
 
-    /proc/crypto contains supported crypto modes
+    The cipher specification format is:
+       cipher[:keycount]-chainmode-ivmode[:ivopts]
+    Examples:
+       aes-cbc-essiv:sha256
+       aes-xts-plain64
+       serpent-xts-plain64
+
+    The cipher can also be specified directly in the kernel crypto API
+    format (selected by the capi: prefix). The IV specification is the same
+    as for the first format type.
+    This format is mainly used to specify authenticated modes.
+
+    The kernel crypto API cipher specification format is:
+       capi:cipher_api_spec-ivmode[:ivopts]
+    Examples:
+       capi:cbc(aes)-essiv:sha256
+       capi:xts(aes)-plain64
+    Examples of authenticated modes:
+       capi:gcm(aes)-random
+       capi:authenc(hmac(sha256),xts(aes))-random
+       capi:rfc7539(chacha20,poly1305)-random
+
+    /proc/crypto contains a list of the currently loaded crypto modes.
 
 <key>
     Key used for encryption. It is encoded either as a hexadecimal number
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
     thread because it benefits CFQ to have writes submitted using the
     same context.
 
+integrity:<bytes>:<type>
+    The device requires additional <bytes> of metadata per sector, stored
+    in a per-bio integrity structure. This metadata must be provided
+    by the underlying dm-integrity target.
+
+    The <type> can be "none" if the metadata is used only for a persistent IV.
+
+    For Authenticated Encryption with Additional Data (AEAD)
+    the <type> is "aead". An AEAD mode additionally calculates and verifies
+    integrity for the encrypted device. The additional space is then
+    used to store the authentication tag (and the persistent IV if needed).
+
+sector_size:<bytes>
+    Use <bytes> as the encryption unit instead of 512-byte sectors.
+    This option must be in the range 512 - 4096 bytes and a power of two.
+    The virtual device announces this size as its minimal IO and logical
+    sector size.
+
+iv_large_sectors
+    IV generators will use the sector number counted in <sector_size> units
+    instead of the default 512-byte sectors.
+
+    For example, if <sector_size> is 4096 bytes, the plain64 IV for the second
+    sector is 8 without this flag and 1 if iv_large_sectors is present.
+    The <iv_offset> must be a multiple of <sector_size> (in 512-byte units)
+    if this flag is specified.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 000000000000..f33e3ade7a09
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@
+The dm-integrity target emulates a block device that has additional
+per-sector tags that can be used for storing integrity information.
+
+A general problem with storing integrity tags with every sector is that
+writing the sector and the integrity tag must be atomic - i.e. in case of a
+crash, either both the sector and the integrity tag are written or neither is.
+
+To guarantee write atomicity, the dm-integrity target uses a journal: it
+writes sector data and integrity tags into the journal, commits the journal
+and then copies the data and integrity tags to their respective locations.
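+
+For illustration only - the device name and every number below are made-up
+placeholders rather than values taken from this document - a standalone
+dm-integrity table line built from the target arguments described below
+could look like this:
+
+	# journaled mode, tag size taken from the internal crc32c hash
+	dmsetup create integ --table \
+		"0 1638400 integrity /dev/sdb 0 - J 2 journal_sectors:1024 internal_hash:crc32c"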
+
+The dm-integrity target can be used with the dm-crypt target - in this
+situation the dm-crypt target creates the integrity data and passes it
+to the dm-integrity target via a bio_integrity_payload attached to the bio.
+In this mode, the dm-crypt and dm-integrity targets provide authenticated
+disk encryption - if an attacker modifies the encrypted device, an I/O
+error is returned instead of random data.
+
+The dm-integrity target can also be used as a standalone target; in this
+mode it calculates and verifies the integrity tag internally. In this
+mode, the dm-integrity target can be used to detect silent data
+corruption on the disk or in the I/O path.
+
+
+When loading the target for the first time, the kernel driver will format
+the device, but only if the superblock contains zeroes. If the superblock
+is neither valid nor zeroed, the dm-integrity target can't be loaded.
+
+To use the target for the first time:
+1. overwrite the superblock with zeroes
+2. load the dm-integrity target with a size of one sector; the kernel
+   driver will format the device
+3. unload the dm-integrity target
+4. read the "provided_data_sectors" value from the superblock
+5. load the dm-integrity target with the target size
+   "provided_data_sectors"
+6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
+   with the size "provided_data_sectors"
+
+
+Target arguments:
+
+1. the underlying block device
+
+2. the number of reserved sectors at the beginning of the device - the
+   dm-integrity target won't read or write these sectors
+
+3. the size of the integrity tag (if "-" is used, the size is taken from
+   the internal-hash algorithm)
+
+4. mode:
+	D - direct writes (without journal) - in this mode, journaling is
+		not used and data sectors and integrity tags are written
+		separately. In case of a crash, it is possible that the data
+		and integrity tag don't match.
+	J - journaled writes - data and integrity tags are written to the
+		journal and atomicity is guaranteed. In case of a crash,
+		either both data and tag are written or neither is.
+		Journaled mode roughly halves write throughput because the
+		data have to be written twice.
+	R - recovery mode - in this mode, the journal is not replayed,
+		checksums are not checked and writes to the device are not
+		allowed. This mode is useful for data recovery if the
+		device cannot be activated in any of the other standard
+		modes.
+
+5. the number of additional arguments
+
+Additional arguments:
+
+journal_sectors:number
+	The size of the journal; this argument is used only when formatting
+	the device. If the device is already formatted, the value from the
+	superblock is used.
+
+interleave_sectors:number
+	The number of interleaved sectors. This value is rounded down to
+	a power of two. If the device is already formatted, the value from
+	the superblock is used.
+
+buffer_sectors:number
+	The number of sectors in one buffer. The value is rounded down to
+	a power of two.
+
+	The tag area is accessed using buffers; the buffer size is
+	configurable. A larger buffer size means larger I/Os, but fewer
+	I/Os issued.
+
+journal_watermark:number
+	The journal watermark as a percentage. When the size of the journal
+	exceeds this watermark, the thread that flushes the journal is
+	started.
+
+commit_time:number
+	Commit time in milliseconds. When this time passes, the journal is
+	written. The journal is also written immediately if a FLUSH
+	request is received.
+
+internal_hash:algorithm(:key)	(the key is optional)
+	Use an internal hash or crc.
+	When this argument is used, the dm-integrity target won't accept
+	integrity tags from the upper target, but it will automatically
+	generate and verify the integrity tags.
+
+	You can use a crc algorithm (such as crc32); the integrity target
+	will then protect the data against accidental corruption.
+	You can also use an hmac algorithm (for example
+	"hmac(sha256):0123456789abcdef"); in this mode it provides
+	cryptographic authentication of the data without encryption.
+
+	When this argument is not used, the integrity tags are accepted
+	from an upper layer target, such as dm-crypt. The upper layer
+	target should check the validity of the integrity tags.
+
+journal_crypt:algorithm(:key)	(the key is optional)
+	Encrypt the journal using the given algorithm to make sure that an
+	attacker can't read the journal. You can use a block cipher here
+	(such as "cbc(aes)") or a stream cipher (for example "chacha20",
+	"salsa20", "ctr(aes)" or "ecb(arc4)").
+
+	The journal contains a history of the last writes to the block
+	device; an attacker reading the journal could see the last sector
+	numbers that were written. From the sector numbers, the attacker
+	can infer the size of files that were written. To protect against
+	this situation, you can encrypt the journal.
+
+journal_mac:algorithm(:key)	(the key is optional)
+	Protect sector numbers in the journal from accidental or malicious
+	modification. To protect against accidental modification, use a
+	crc algorithm; to protect against malicious modification, use an
+	hmac algorithm with a key.
+
+	This option is not needed when using internal-hash because in that
+	mode, the integrity of journal entries is checked when replaying
+	the journal. Thus, a modified sector number would be detected at
+	this stage.
+
+block_size:number
+	The size of a data block in bytes. The larger the block size, the
+	less overhead there is for per-block integrity metadata.
+	Supported values are 512, 1024, 2048 and 4096 bytes. If not
+	specified, the default block size is 512 bytes.
+
+The journal mode (D/J), buffer_sectors, journal_watermark and commit_time
+can be changed when reloading the target (load an inactive table and swap
+the tables with suspend and resume). The other arguments should not be
+changed when reloading the target because the layout of the on-disk data
+depends on them and the reloaded target would be non-functional.
+
+
+The layout of the formatted block device:
+* reserved sectors (they are not used by this target; they can be used for
+  storing LUKS metadata or for other purposes), the size of the reserved
+  area is specified in the target arguments
+* superblock (4kiB)
+	* magic string - identifies that the device was formatted
+	* version
+	* log2(interleave sectors)
+	* integrity tag size
+	* the number of journal sections
+	* provided data sectors - the number of sectors that this target
+	  provides (i.e. the size of the device minus the size of all
+	  metadata and padding). The user of this target should not send
+	  bios that access data beyond the "provided data sectors" limit.
+ * flags - a flag is set if journal_mac is used +* journal + The journal is divided into sections, each section contains: + * metadata area (4kiB), it contains journal entries + every journal entry contains: + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + every metadata sector ends with + * mac (8-bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store hmac of sector + numbers in the journal section, to protect against a + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + every sector in the data area contains: + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. +* one or more runs of interleaved tags and data. Each run contains: + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. log2 of this value is stored + in the superblock. diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index cd2cb2fc85ea..7e06e65586d4 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters: Takeover/reshape is not possible with a raid4/5/6 journal device; it has to be deconfigured before requesting these. + [journal_mode ] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev ' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the @@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields: The current data offset to the start of the user data on each component device of a raid set (see the respective raid parameter to support out-of-place reshaping). - 'A' - active raid4/5/6 journal device. + 'A' - active write-through journal device. + 'a' - active write-back journal device. 'D' - dead journal device. '-' - no journal device. @@ -331,3 +339,7 @@ Version History 'D' on the status line. If '- -' is passed into the constructor, emit '- -' on the table line and '-' as the status line health character. 
1.10.0 Add support for raid4/5/6 journal device +1.10.1 Fix data corruption on reshape request +1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) +1.11.1 Add raid4/5/6 journal write-back support via journal_mode option diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7767da50c26..ee2c21e3d232 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -325,14 +325,6 @@ config DM_CACHE_SMQ of less memory utilization, improved performance and increased adaptability in the face of changing workloads. -config DM_CACHE_CLEANER - tristate "Cleaner Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A simple cache policy that writes back all data to the - origin. Used when decommissioning a dm-cache. - config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM @@ -365,6 +357,7 @@ config DM_LOG_USERSPACE config DM_RAID tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM + select MD_RAID0 select MD_RAID1 select MD_RAID10 select MD_RAID456 @@ -508,4 +501,14 @@ config DM_LOG_WRITES If unsure, say N. +config DM_INTEGRITY + tristate "Integrity target" + depends on BLK_DEV_DM + select BLK_DEV_INTEGRITY + select DM_BUFIO + select CRYPTO + select ASYNC_XOR + ---help--- + This is the integrity target. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3cbda1af87a0..39cf2a1b5f90 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -11,10 +11,11 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-mirror-y += dm-raid1.o dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o -dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ + dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o -dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o @@ -56,9 +57,9 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o -obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c similarity index 94% rename from drivers/md/dm-bio-prison.c rename to drivers/md/dm-bio-prison-v1.c index 03af174485d3..ae7da2c30a57 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -5,7 +5,8 @@ */ #include "dm.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" #include #include @@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); /*----------------------------------------------------------------*/ -static int __init dm_bio_prison_init(void) +static int __init dm_bio_prison_init_v1(void) { _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); if (!_cell_cache) @@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void) return 0; } -static void __exit dm_bio_prison_exit(void) +static void dm_bio_prison_exit_v1(void) { kmem_cache_destroy(_cell_cache); _cell_cache = NULL; } +static int (*_inits[])(void) __initdata = { + dm_bio_prison_init_v1, + 
dm_bio_prison_init_v2, +}; + +static void (*_exits[])(void) = { + dm_bio_prison_exit_v1, + dm_bio_prison_exit_v2, +}; + +static int __init dm_bio_prison_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i](); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _exits[i](); + + return r; +} + +static void __exit dm_bio_prison_exit(void) +{ + int i = ARRAY_SIZE(_exits); + + while (i--) + _exits[i](); +} + /* * module hooks */ diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h similarity index 99% rename from drivers/md/dm-bio-prison.h rename to drivers/md/dm-bio-prison-v1.h index 54352f009bfd..cddd4ac07e2c 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2012 Red Hat, Inc. + * Copyright (C) 2011-2017 Red Hat, Inc. * * This file is released under the GPL. */ diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c new file mode 100644 index 000000000000..c9b11f799cd8 --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.c @@ -0,0 +1,369 @@ +/* + * Copyright (C) 2012-2017 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison-v2.h" + +#include +#include +#include +#include +#include + +/*----------------------------------------------------------------*/ + +#define MIN_CELLS 1024 + +struct dm_bio_prison_v2 { + struct workqueue_struct *wq; + + spinlock_t lock; + mempool_t *cell_pool; + struct rb_root cells; +}; + +static struct kmem_cache *_cell_cache; + +/*----------------------------------------------------------------*/ + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) +{ + struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL); + + if (!prison) + return NULL; + + prison->wq = wq; + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); + if (!prison->cell_pool) { + kfree(prison); + return NULL; + } + + prison->cells = RB_ROOT; + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2); + +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison) +{ + mempool_destroy(prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2); + +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp) +{ + return mempool_alloc(prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2); + +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + mempool_free(cell, prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2); + +static void __setup_new_cell(struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell) +{ + memset(cell, 0, sizeof(*cell)); + memcpy(&cell->key, key, sizeof(cell->key)); + bio_list_init(&cell->bios); +} + +static int cmp_keys(struct dm_cell_key_v2 *lhs, + struct dm_cell_key_v2 *rhs) +{ + if (lhs->virtual < rhs->virtual) + return -1; + + if (lhs->virtual > rhs->virtual) + return 1; + + if (lhs->dev < rhs->dev) + return -1; + + if (lhs->dev > rhs->dev) + return 1; + + if (lhs->block_end <= rhs->block_begin) + return -1; + + if (lhs->block_begin >= rhs->block_end) + return 1; + + return 0; +} + +/* + * Returns true if node found, otherwise it inserts a new one. 
+ */ +static bool __find_or_insert(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **result) +{ + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell_v2 *cell = + container_of(*new, struct dm_bio_prison_cell_v2, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + + else if (r > 0) + new = &((*new)->rb_right); + + else { + *result = cell; + return true; + } + } + + __setup_new_cell(key, cell_prealloc); + *result = cell_prealloc; + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + + return false; +} + +static bool __get(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell) +{ + if (__find_or_insert(prison, key, cell_prealloc, cell)) { + if ((*cell)->exclusive_lock) { + if (lock_level <= (*cell)->exclusive_level) { + bio_list_add(&(*cell)->bios, inmate); + return false; + } + } + + (*cell)->shared_count++; + + } else + (*cell)->shared_count = 1; + + return true; +} + +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_get_v2); + +static bool __put(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + BUG_ON(!cell->shared_count); + cell->shared_count--; + + // FIXME: shared locks granted above the lock level could starve this + if (!cell->shared_count) { + if (cell->exclusive_lock){ + if (cell->quiesce_continuation) { + queue_work(prison->wq, cell->quiesce_continuation); + cell->quiesce_continuation = NULL; + } + } else { + rb_erase(&cell->node, &prison->cells); + return true; + } + } + + return false; +} + +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __put(prison, cell); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_put_v2); + +static int __lock(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + struct dm_bio_prison_cell_v2 *cell; + + if (__find_or_insert(prison, key, cell_prealloc, &cell)) { + if (cell->exclusive_lock) + return -EBUSY; + + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + + // FIXME: we don't yet know what level these shared locks + // were taken at, so have to quiesce them all. 
+ return cell->shared_count > 0; + + } else { + cell = cell_prealloc; + cell->shared_count = 0; + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + } + + return 0; +} + +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __lock(prison, key, lock_level, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_v2); + +static void __quiesce(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + if (!cell->shared_count) + queue_work(prison->wq, continuation); + else + cell->quiesce_continuation = continuation; +} + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + __quiesce(prison, cell, continuation); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); + +static int __promote(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level) +{ + if (!cell->exclusive_lock) + return -EINVAL; + + cell->exclusive_level = new_lock_level; + return cell->shared_count > 0; +} + +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __promote(prison, cell, new_lock_level); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2); + +static bool __unlock(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + BUG_ON(!cell->exclusive_lock); + + bio_list_merge(bios, &cell->bios); + bio_list_init(&cell->bios); + + if (cell->shared_count) { + cell->exclusive_lock = 0; + return false; + } + + rb_erase(&cell->node, &prison->cells); + return true; +} + +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __unlock(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_unlock_v2); + +/*----------------------------------------------------------------*/ + +int __init dm_bio_prison_init_v2(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +void dm_bio_prison_exit_v2(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h new file mode 100644 index 000000000000..6e04234268db --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2011-2017 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_BIO_PRISON_V2_H +#define DM_BIO_PRISON_V2_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include +#include +#include + +/*----------------------------------------------------------------*/ + +int dm_bio_prison_init_v2(void); +void dm_bio_prison_exit_v2(void); + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison_v2; + +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ +struct dm_cell_key_v2 { + int virtual; + dm_thin_id dev; + dm_block_t block_begin, block_end; +}; + +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell_v2 { + // FIXME: pack these + bool exclusive_lock; + unsigned exclusive_level; + unsigned shared_count; + struct work_struct *quiesce_continuation; + + struct rb_node node; + struct dm_cell_key_v2 key; + struct bio_list bios; +}; + +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq); +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison); + +/* + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. + * + * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called + * in interrupt context or passed GFP_NOWAIT. + */ +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, + gfp_t gfp); +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Shared locks have a bio associated with them. + * + * If the lock is granted the caller can continue to use the bio, and must + * call dm_cell_put_v2() to drop the reference count when finished using it. + * + * If the lock cannot be granted then the bio will be tracked within the + * cell, and later given to the holder of the exclusive lock. + * + * See dm_cell_lock_v2() for discussion of the lock_level parameter. + * + * Compare *cell_result with cell_prealloc to see if the prealloc was used. + * If cell_prealloc was used then inmate wasn't added to it. + * + * Returns true if the lock is granted. + */ +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +/* + * Decrement the shared reference count for the lock. Returns true if + * returning ownership of the cell (ie. you should free it). + */ +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Locks a cell. No associated bio. Exclusive locks get priority. These + * locks constrain whether the io locks are granted according to level. + * + * Shared locks will still be granted if the lock_level is > (not = to) the + * exclusive lock level. + * + * If an _exclusive_ lock is already held then -EBUSY is returned. 
+ * + * Return values: + * < 0 - error + * 0 - locked; no quiescing needed + * 1 - locked; quiescing needed + */ +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation); + +/* + * Promotes an _exclusive_ lock to a higher lock level. + * + * Return values: + * < 0 - error + * 0 - promoted; no quiescing needed + * 1 - promoted; quiescing needed + */ +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level); + +/* + * Adds any held bios to the bio list. + * + * There may be shared locks still held at this point even if you quiesced + * (ie. different lock levels). + * + * Returns true if returning ownership of the cell (ie. you should free + * it). + */ +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index df4859f6ac6a..c92c31b23e54 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -110,6 +110,8 @@ struct dm_bufio_client { struct rb_root buffer_tree; wait_queue_head_t free_buffer_wait; + sector_t start; + int async_write_error; struct list_head client_list; @@ -557,8 +559,8 @@ static void dmio_complete(unsigned long error, void *context) b->bio.bi_end_io(&b->bio); } -static void use_dmio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, bio_end_io_t *end_io) { int r; struct dm_io_request io_req = { @@ -570,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, }; struct dm_io_region region = { .bdev = b->c->bdev, - .sector = block << b->c->sectors_per_block_bits, - .count = b->c->block_size >> SECTOR_SHIFT, + .sector = sector, + .count = n_sectors, }; if (b->data_mode != DATA_MODE_VMALLOC) { @@ -606,14 +608,14 @@ static void inline_endio(struct bio *bio) end_fn(bio); } -static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, bio_end_io_t *end_io) { char *ptr; int len; bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); - b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_iter.bi_sector = sector; b->bio.bi_bdev = b->c->bdev; b->bio.bi_end_io = inline_endio; /* @@ -628,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, * If len < PAGE_SIZE the buffer doesn't cross page boundary. */ ptr = b->data; - len = b->c->block_size; + len = n_sectors << SECTOR_SHIFT; if (len >= PAGE_SIZE) BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); @@ -640,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, len < PAGE_SIZE ? 
len : PAGE_SIZE, offset_in_page(ptr))) { BUG_ON(b->c->block_size <= PAGE_SIZE); - use_dmio(b, rw, block, end_io); + use_dmio(b, rw, sector, n_sectors, end_io); return; } @@ -651,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, submit_bio(&b->bio); } -static void submit_io(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) { + unsigned n_sectors; + sector_t sector; + if (rw == WRITE && b->c->write_callback) b->c->write_callback(b); - if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && + sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; + n_sectors = 1 << b->c->sectors_per_block_bits; + + if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, block, end_io); + use_inline_bio(b, rw, sector, n_sectors, end_io); else - use_dmio(b, rw, block, end_io); + use_dmio(b, rw, sector, n_sectors, end_io); } /*---------------------------------------------------------------- @@ -713,7 +720,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); if (!write_list) - submit_io(b, WRITE, b->block, write_endio); + submit_io(b, WRITE, write_endio); else list_add_tail(&b->write_list, write_list); } @@ -726,7 +733,7 @@ static void __flush_write_list(struct list_head *write_list) struct dm_buffer *b = list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); - submit_io(b, WRITE, b->block, write_endio); + submit_io(b, WRITE, write_endio); cond_resched(); } blk_finish_plug(&plug); @@ -933,10 +940,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { - mutex_lock(&dm_bufio_clients_lock); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); + if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (mutex_trylock(&dm_bufio_clients_lock)) { + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + } } buffers = dm_bufio_cache_size_per_client >> @@ -1094,7 +1102,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, return NULL; if (need_submit) - submit_io(b, READ, b->block, read_endio); + submit_io(b, READ, read_endio); wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -1164,7 +1172,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, dm_bufio_unlock(c); if (need_submit) - submit_io(b, READ, b->block, read_endio); + submit_io(b, READ, read_endio); dm_bufio_release(b); cond_resched(); @@ -1405,7 +1413,7 @@ retry: old_block = b->block; __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, new_block, write_endio); + submit_io(b, WRITE, write_endio); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); __unlink_buffer(b); @@ -1762,6 +1770,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); +void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) +{ + c->start = start; +} +EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); + static unsigned get_max_age_hz(void) { unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); @@ -1782,9 +1796,17 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) struct dm_buffer *b, *tmp; unsigned retain_target = get_retain_buffers(c); unsigned count; + 
LIST_HEAD(write_list); dm_bufio_lock(c); + __check_watermark(c, &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + } + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { if (count <= retain_target) @@ -1809,6 +1831,8 @@ static void cleanup_old_buffers(void) mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) __evict_old_buffers(c, max_age_hz); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index c096779a7292..b6d8f53ec15b 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -31,6 +31,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size, */ void dm_bufio_client_destroy(struct dm_bufio_client *c); +/* + * Set the sector range. + * When this function is called, there must be no I/O in progress on the bufio + * client. + */ +void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start); + /* * WARNING: to avoid deadlocks, these conditions are observed: * diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000000..9b1afdfb13f0 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-background-tracker.h" + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "dm-background-tracker" + +struct bt_work { + struct list_head list; + struct rb_node node; + struct policy_work work; +}; + +struct background_tracker { + unsigned max_work; + atomic_t pending_promotes; + atomic_t pending_writebacks; + atomic_t pending_demotes; + + struct list_head issued; + struct list_head queued; + struct rb_root pending; + + struct kmem_cache *work_cache; +}; + +struct background_tracker *btracker_create(unsigned max_work) +{ + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + + b->max_work = max_work; + atomic_set(&b->pending_promotes, 0); + atomic_set(&b->pending_writebacks, 0); + atomic_set(&b->pending_demotes, 0); + + INIT_LIST_HEAD(&b->issued); + INIT_LIST_HEAD(&b->queued); + + b->pending = RB_ROOT; + b->work_cache = KMEM_CACHE(bt_work, 0); + if (!b->work_cache) { + DMERR("couldn't create mempool for background work items"); + kfree(b); + b = NULL; + } + + return b; +} +EXPORT_SYMBOL_GPL(btracker_create); + +void btracker_destroy(struct background_tracker *b) +{ + kmem_cache_destroy(b->work_cache); + kfree(b); +} +EXPORT_SYMBOL_GPL(btracker_destroy); + +static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) +{ + if (from_oblock(lhs) < from_oblock(rhs)) + return -1; + + if (from_oblock(rhs) < from_oblock(lhs)) + return 1; + + return 0; +} + +static bool __insert_pending(struct background_tracker *b, + struct bt_work *nw) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node, *parent = NULL; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + parent = *new; + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + /* already present */ + return false; + } + + rb_link_node(&nw->node, parent, new); + rb_insert_color(&nw->node, &b->pending); + + return true; +} + +static struct bt_work 
*__find_pending(struct background_tracker *b, + dm_oblock_t oblock) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + cmp = cmp_oblock(w->work.oblock, oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + break; + } + + return *new ? w : NULL; +} + + +static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) +{ + switch (w->op) { + case POLICY_PROMOTE: + atomic_add(delta, &b->pending_promotes); + break; + + case POLICY_DEMOTE: + atomic_add(delta, &b->pending_demotes); + break; + + case POLICY_WRITEBACK: + atomic_add(delta, &b->pending_writebacks); + break; + } +} + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_writebacks); +} +EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); + +unsigned btracker_nr_demotions_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_demotes); +} +EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); + +static bool max_work_reached(struct background_tracker *b) +{ + // FIXME: finish + return false; +} + +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork) +{ + struct bt_work *w; + + if (pwork) + *pwork = NULL; + + if (max_work_reached(b)) + return -ENOMEM; + + w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + if (!w) + return -ENOMEM; + + memcpy(&w->work, work, sizeof(*work)); + + if (!__insert_pending(b, w)) { + /* + * There was a race, we'll just ignore this second + * bit of work for the same oblock. + */ + kmem_cache_free(b->work_cache, w); + return -EINVAL; + } + + if (pwork) { + *pwork = &w->work; + list_add(&w->list, &b->issued); + } else + list_add(&w->list, &b->queued); + update_stats(b, &w->work, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_queue); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work) +{ + struct bt_work *w; + + if (list_empty(&b->queued)) + return -ENODATA; + + w = list_first_entry(&b->queued, struct bt_work, list); + list_move(&w->list, &b->issued); + *work = &w->work; + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_issue); + +void btracker_complete(struct background_tracker *b, + struct policy_work *op) +{ + struct bt_work *w = container_of(op, struct bt_work, work); + + update_stats(b, &w->work, -1); + rb_erase(&w->node, &b->pending); + list_del(&w->list); + kmem_cache_free(b->work_cache, w); +} +EXPORT_SYMBOL_GPL(btracker_complete); + +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock) +{ + return __find_pending(b, oblock) != NULL; +} +EXPORT_SYMBOL_GPL(btracker_promotion_already_present); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000000..27ab90dbc275 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BACKGROUND_WORK_H +#define DM_CACHE_BACKGROUND_WORK_H + +#include +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +struct background_work; +struct background_tracker; + +/* + * FIXME: discuss lack of locking in all methods. 
+ */ +struct background_tracker *btracker_create(unsigned max_work); +void btracker_destroy(struct background_tracker *b); + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b); +unsigned btracker_nr_demotions_queued(struct background_tracker *b); + +/* + * returns -EINVAL iff the work is already queued. -ENOMEM if the work + * couldn't be queued for another reason. + */ +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work); +void btracker_complete(struct background_tracker *b, + struct policy_work *op); +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index e4c2c1a1e993..5a026dc24db6 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -27,8 +27,6 @@ #define MIN_CACHE_VERSION 1 #define MAX_CACHE_VERSION 2 -#define CACHE_METADATA_CACHE_SIZE 64 - /* * 3 for btree insert + * 2 for btree lookup used within space map @@ -535,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, { int r; cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, - CACHE_METADATA_CACHE_SIZE, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { DMERR("could not create block manager"); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 4f07c08cf107..179ed5bf81a3 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -50,6 +50,8 @@ #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL +struct dm_cache_metadata; + /* * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on * failure. If reopening then features must match. diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c deleted file mode 100644 index 2e8a8f1d8358..000000000000 --- a/drivers/md/dm-cache-policy-cleaner.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * writeback cache policy supporting flushing out dirty cache blocks. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include -#include -#include -#include - -/*----------------------------------------------------------------*/ - -#define DM_MSG_PREFIX "cache cleaner" - -/* Cache entry struct. */ -struct wb_cache_entry { - struct list_head list; - struct hlist_node hlist; - - dm_oblock_t oblock; - dm_cblock_t cblock; - bool dirty:1; - bool pending:1; -}; - -struct hash { - struct hlist_head *table; - dm_block_t hash_bits; - unsigned nr_buckets; -}; - -struct policy { - struct dm_cache_policy policy; - spinlock_t lock; - - struct list_head free; - struct list_head clean; - struct list_head clean_pending; - struct list_head dirty; - - /* - * We know exactly how many cblocks will be needed, - * so we can allocate them up front. - */ - dm_cblock_t cache_size, nr_cblocks_allocated; - struct wb_cache_entry *cblocks; - struct hash chash; -}; - -/*----------------------------------------------------------------------------*/ - -/* - * Low-level functions. 
- */ -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -static struct policy *to_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct policy, policy); -} - -static struct list_head *list_pop(struct list_head *q) -{ - struct list_head *r = q->next; - - list_del(r); - - return r; -} - -/*----------------------------------------------------------------------------*/ - -/* Allocate/free various resources. */ -static int alloc_hash(struct hash *hash, unsigned elts) -{ - hash->nr_buckets = next_power(elts >> 4, 16); - hash->hash_bits = __ffs(hash->nr_buckets); - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); - - return hash->table ? 0 : -ENOMEM; -} - -static void free_hash(struct hash *hash) -{ - vfree(hash->table); -} - -static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) -{ - int r = -ENOMEM; - - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); - if (p->cblocks) { - unsigned u = from_cblock(cache_size); - - while (u--) - list_add(&p->cblocks[u].list, &p->free); - - p->nr_cblocks_allocated = 0; - - /* Cache entries hash. */ - r = alloc_hash(&p->chash, from_cblock(cache_size)); - if (r) - vfree(p->cblocks); - } - - return r; -} - -static void free_cache_blocks_and_hash(struct policy *p) -{ - free_hash(&p->chash); - vfree(p->cblocks); -} - -static struct wb_cache_entry *alloc_cache_entry(struct policy *p) -{ - struct wb_cache_entry *e; - - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); - - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); - - return e; -} - -/*----------------------------------------------------------------------------*/ - -/* Hash functions (lookup, insert, remove). */ -static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) -{ - struct hash *hash = &p->chash; - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); - struct wb_cache_entry *cur; - struct hlist_head *bucket = &hash->table[h]; - - hlist_for_each_entry(cur, bucket, hlist) { - if (cur->oblock == oblock) { - /* Move upfront bucket for faster access. 
*/ - hlist_del(&cur->hlist); - hlist_add_head(&cur->hlist, bucket); - return cur; - } - } - - return NULL; -} - -static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); - - hlist_add_head(&e->hlist, &p->chash.table[h]); -} - -static void remove_cache_hash_entry(struct wb_cache_entry *e) -{ - hlist_del(&e->hlist); -} - -/* Public interface (see dm-cache-policy.h */ -static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - result->op = POLICY_MISS; - - if (can_block) - spin_lock_irqsave(&p->lock, flags); - - else if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - result->op = POLICY_HIT; - result->cblock = e->cblock; - - } - - spin_unlock_irqrestore(&p->lock, flags); - - return 0; -} - -static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - *cblock = e->cblock; - r = 0; - - } else - r = -ENOENT; - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - - e = lookup_cache_entry(p, oblock); - BUG_ON(!e); - - if (set) { - if (!e->dirty) { - e->dirty = true; - list_move(&e->list, &p->dirty); - } - - } else { - if (e->dirty) { - e->pending = false; - e->dirty = false; - list_move(&e->list, &p->clean); - } - } -} - -static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, true); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, false); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) -{ - insert_cache_hash_entry(p, e); - if (e->dirty) - list_add(&e->list, &p->dirty); - else - list_add(&e->list, &p->clean); -} - -static int wb_load_mapping(struct dm_cache_policy *pe, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e = alloc_cache_entry(p); - - if (e) { - e->cblock = cblock; - e->oblock = oblock; - e->dirty = false; /* blocks default to clean */ - add_cache_entry(p, e); - r = 0; - - } else - r = -ENOMEM; - - return r; -} - -static void wb_destroy(struct dm_cache_policy *pe) -{ - struct policy *p = to_policy(pe); - - free_cache_blocks_and_hash(p); - kfree(p); -} - -static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) -{ - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); - - BUG_ON(!r); - - remove_cache_hash_entry(r); - list_del(&r->list); - - return r; -} - -static void wb_remove_mapping(struct dm_cache_policy *pe, 
dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, oblock); - list_add_tail(&e->list, &p->free); - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_force_mapping(struct dm_cache_policy *pe, - dm_oblock_t current_oblock, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, current_oblock); - e->oblock = oblock; - add_cache_entry(p, e); - spin_unlock_irqrestore(&p->lock, flags); -} - -static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) -{ - struct list_head *l; - struct wb_cache_entry *r; - - if (list_empty(&p->dirty)) - return NULL; - - l = list_pop(&p->dirty); - r = container_of(l, struct wb_cache_entry, list); - list_add(l, &p->clean_pending); - - return r; -} - -static int wb_writeback_work(struct dm_cache_policy *pe, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - int r = -ENOENT; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - - e = get_next_dirty_entry(p); - if (e) { - *oblock = e->oblock; - *cblock = e->cblock; - r = 0; - } - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static dm_cblock_t wb_residency(struct dm_cache_policy *pe) -{ - return to_policy(pe)->nr_cblocks_allocated; -} - -/* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct policy *p) -{ - p->policy.destroy = wb_destroy; - p->policy.map = wb_map; - p->policy.lookup = wb_lookup; - p->policy.set_dirty = wb_set_dirty; - p->policy.clear_dirty = wb_clear_dirty; - p->policy.load_mapping = wb_load_mapping; - p->policy.get_hint = NULL; - p->policy.remove_mapping = wb_remove_mapping; - p->policy.writeback_work = wb_writeback_work; - p->policy.force_mapping = wb_force_mapping; - p->policy.residency = wb_residency; - p->policy.tick = NULL; -} - -static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - int r; - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); - - if (!p) - return NULL; - - init_policy_functions(p); - INIT_LIST_HEAD(&p->free); - INIT_LIST_HEAD(&p->clean); - INIT_LIST_HEAD(&p->clean_pending); - INIT_LIST_HEAD(&p->dirty); - - p->cache_size = cache_size; - spin_lock_init(&p->lock); - - /* Allocate cache entry structs and add them to free list. 
*/ - r = alloc_cache_blocks_with_hash(p, cache_size); - if (!r) - return &p->policy; - - kfree(p); - - return NULL; -} -/*----------------------------------------------------------------------------*/ - -static struct dm_cache_policy_type wb_policy_type = { - .name = "cleaner", - .version = {1, 0, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = wb_create -}; - -static int __init wb_init(void) -{ - int r = dm_cache_policy_register(&wb_policy_type); - - if (r < 0) - DMERR("register failed %d", r); - else - DMINFO("version %u.%u.%u loaded", - wb_policy_type.version[0], - wb_policy_type.version[1], - wb_policy_type.version[2]); - - return r; -} - -static void __exit wb_exit(void) -{ - dm_cache_policy_unregister(&wb_policy_type); -} - -module_init(wb_init); -module_exit(wb_exit); - -MODULE_AUTHOR("Heinz Mauelshagen "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("cleaner cache policy"); diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 808ee0e2b2c4..56f0a23f698c 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -12,40 +12,59 @@ /*----------------------------------------------------------------*/ -/* - * Little inline functions that simplify calling the policy methods. - */ -static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued) { - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); } -static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static inline int policy_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - BUG_ON(!p->lookup); - return p->lookup(p, oblock, cblock); + if (!p->lookup_with_work) { + *work = NULL; + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); + } + + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); } -static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_get_background_work(struct dm_cache_policy *p, + bool idle, struct policy_work **result) { - if (p->set_dirty) - p->set_dirty(p, oblock); + return p->get_background_work(p, idle, result); } -static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline void policy_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - if (p->clear_dirty) - p->clear_dirty(p, oblock); + return p->complete_background_work(p, work, success); +} + +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->set_dirty(p, cblock); +} + +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->clear_dirty(p, cblock); } static inline int policy_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + bool dirty, uint32_t hint, bool hint_valid) { - return p->load_mapping(p, oblock, cblock, hint, hint_valid); + return p->load_mapping(p, oblock, cblock, dirty, hint, 
hint_valid); +} + +static inline int policy_invalidate_mapping(struct dm_cache_policy *p, + dm_cblock_t cblock) +{ + return p->invalidate_mapping(p, cblock); } static inline uint32_t policy_get_hint(struct dm_cache_policy *p, @@ -54,30 +73,6 @@ static inline uint32_t policy_get_hint(struct dm_cache_policy *p, return p->get_hint ? p->get_hint(p, cblock) : 0; } -static inline int policy_writeback_work(struct dm_cache_policy *p, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; -} - -static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - p->remove_mapping(p, oblock); -} - -static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - return p->remove_cblock(p, cblock); -} - -static inline void policy_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - return p->force_mapping(p, current_oblock, new_oblock); -} - static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) { return p->residency(p); @@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p, return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; } +static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + return p->allow_migrations(p, allow); +} + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index f19c6930a67c..e0c40aec5e96 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -4,8 +4,9 @@ * This file is released under the GPL. */ -#include "dm-cache-policy.h" +#include "dm-cache-background-tracker.h" #include "dm-cache-policy-internal.h" +#include "dm-cache-policy.h" #include "dm.h" #include @@ -38,10 +39,11 @@ struct entry { unsigned hash_next:28; unsigned prev:28; unsigned next:28; - unsigned level:7; + unsigned level:6; bool dirty:1; bool allocated:1; bool sentinel:1; + bool pending_work:1; dm_oblock_t oblock; }; @@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q) */ static void q_push(struct queue *q, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; l_add_tail(q->es, q->qs + e->level, e); } +static void q_push_front(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_head(q->es, q->qs + e->level, e); +} + static void q_push_before(struct queue *q, struct entry *old, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; @@ -335,19 +351,6 @@ static struct entry *q_pop(struct queue *q) return e; } -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct entry *q_pop_old(struct queue *q, unsigned max_level) -{ - struct entry *e = q_peek(q, max_level, false); - - if (e) - q_del(q, e); - - return e; -} - /* * This function assumes there is a non-sentinel entry to pop. It's only * used by redistribute, so we know this is true. 
It also doesn't adjust @@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q) break; e->level = level + 1u; - l_add_head(q->es, l_above, e); + l_add_tail(q->es, l_above, e); } } } -static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) +static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, + struct entry *s1, struct entry *s2) { struct entry *de; - unsigned new_level; - - q_del(q, e); + unsigned sentinels_passed = 0; + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); + /* try and find an entry to swap with */ if (extra_levels && (e->level < q->nr_levels - 1u)) { - new_level = min(q->nr_levels - 1u, e->level + extra_levels); - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { - if (de->sentinel) - continue; + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) + sentinels_passed++; + if (de) { q_del(q, de); de->level = e->level; + if (s1) { + switch (sentinels_passed) { + case 0: + q_push_before(q, s1, de); + break; - if (dest) - q_push_before(q, dest, de); - else + case 1: + q_push_before(q, s2, de); + break; + + default: + q_push(q, de); + } + } else q_push(q, de); - break; } - - e->level = new_level; } + q_del(q, e); + e->level = new_level; q_push(q, e); } -static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) -{ - q_requeue_before(q, NULL, e, extra_levels); -} - /*----------------------------------------------------------------*/ #define FP_SHIFT 8 @@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s) /*----------------------------------------------------------------*/ -struct hash_table { +struct smq_hash_table { struct entry_space *es; unsigned long long hash_bits; unsigned *buckets; @@ -560,7 +567,7 @@ struct hash_table { * All cache entries are stored in a chained hash table. To save space we * use indexing again, and only store indexes to the next entry. 
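For illustration only (not part of the patch): a minimal userspace sketch of the index-chained hash table idea described above. The sizes, hash function and names here are invented; the real code uses the shared entry_space, hash_64() and 28-bit index fields terminated by INDEXER_NULL, but the space-saving trick is the same - bucket heads and next links hold array indexes rather than pointers.

#include <stdio.h>
#include <stdint.h>

#define NR_ENTRIES 16u
#define NR_BUCKETS 8u
#define INDEX_NULL 0xFFFFFFFFu		/* "no entry", like INDEXER_NULL */

struct entry {
	uint64_t oblock;		/* key */
	unsigned hash_next;		/* index of the next entry in the bucket */
};

static struct entry es[NR_ENTRIES];	/* the preallocated entry space */
static unsigned buckets[NR_BUCKETS];	/* bucket heads, stored as indexes */

static unsigned hash_oblock(uint64_t oblock)
{
	return (unsigned)(oblock * 2654435761u) % NR_BUCKETS;
}

static void h_insert_sketch(unsigned index)
{
	unsigned b = hash_oblock(es[index].oblock);

	es[index].hash_next = buckets[b];	/* push on the front of the chain */
	buckets[b] = index;
}

static struct entry *h_lookup_sketch(uint64_t oblock)
{
	unsigned i;

	for (i = buckets[hash_oblock(oblock)]; i != INDEX_NULL; i = es[i].hash_next)
		if (es[i].oblock == oblock)
			return es + i;

	return NULL;
}

int main(void)
{
	unsigned i;

	for (i = 0; i < NR_BUCKETS; i++)
		buckets[i] = INDEX_NULL;

	es[3].oblock = 1234;
	h_insert_sketch(3);

	printf("found entry %ld\n", (long)(h_lookup_sketch(1234) - es));	/* prints 3 */
	return 0;
}

The lookup walks the chain by index, much as __h_lookup() does above; only the index type and the null sentinel differ.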
*/ -static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) +static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) { unsigned i, nr_buckets; @@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent return 0; } -static void h_exit(struct hash_table *ht) +static void h_exit(struct smq_hash_table *ht) { vfree(ht->buckets); } -static struct entry *h_head(struct hash_table *ht, unsigned bucket) +static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) { return to_entry(ht->es, ht->buckets[bucket]); } -static struct entry *h_next(struct hash_table *ht, struct entry *e) +static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) { return to_entry(ht->es, e->hash_next); } -static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) +static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) { e->hash_next = ht->buckets[bucket]; ht->buckets[bucket] = to_index(ht->es, e); } -static void h_insert(struct hash_table *ht, struct entry *e) +static void h_insert(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); __h_insert(ht, h, e); } -static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, +static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, struct entry **prev) { struct entry *e; @@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o return NULL; } -static void __h_unlink(struct hash_table *ht, unsigned h, +static void __h_unlink(struct smq_hash_table *ht, unsigned h, struct entry *e, struct entry *prev) { if (prev) @@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h, /* * Also moves each entry to the front of the bucket. */ -static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) +static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) { struct entry *e, *prev; unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); @@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) return e; } -static void h_remove(struct hash_table *ht, struct entry *e) +static void h_remove(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); struct entry *prev; @@ -699,7 +706,10 @@ static void init_entry(struct entry *e) e->next = INDEXER_NULL; e->prev = INDEXER_NULL; e->level = 0u; + e->dirty = true; /* FIXME: audit */ e->allocated = true; + e->sentinel = false; + e->pending_work = false; } static struct entry *alloc_entry(struct entry_alloc *ea) @@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index) #define NR_HOTSPOT_LEVELS 64u #define NR_CACHE_LEVELS 64u -#define WRITEBACK_PERIOD (10 * HZ) -#define DEMOTE_PERIOD (60 * HZ) +#define WRITEBACK_PERIOD (10ul * HZ) +#define DEMOTE_PERIOD (60ul * HZ) #define HOTSPOT_UPDATE_PERIOD (HZ) -#define CACHE_UPDATE_PERIOD (10u * HZ) +#define CACHE_UPDATE_PERIOD (60ul * HZ) struct smq_policy { struct dm_cache_policy policy; @@ -814,8 +824,8 @@ struct smq_policy { * The hash tables allows us to quickly find an entry by origin * block. 
*/ - struct hash_table table; - struct hash_table hotspot_table; + struct smq_hash_table table; + struct smq_hash_table hotspot_table; bool current_writeback_sentinels; unsigned long next_writeback_period; @@ -828,6 +838,10 @@ struct smq_policy { unsigned long next_hotspot_period; unsigned long next_cache_period; + + struct background_tracker *bg_work; + + bool migrations_allowed; }; /*----------------------------------------------------------------*/ @@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq) static void update_sentinels(struct smq_policy *mq) { if (time_after(jiffies, mq->next_writeback_period)) { - __update_writeback_sentinels(mq); mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + __update_writeback_sentinels(mq); } if (time_after(jiffies, mq->next_demote_period)) { - __update_demote_sentinels(mq); mq->next_demote_period = jiffies + DEMOTE_PERIOD; mq->current_demote_sentinels = !mq->current_demote_sentinels; + __update_demote_sentinels(mq); } } @@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq) /*----------------------------------------------------------------*/ -/* - * These methods tie together the dirty queue, clean queue and hash table. - */ -static void push_new(struct smq_policy *mq, struct entry *e) +static void del_queue(struct smq_policy *mq, struct entry *e) { - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; - h_insert(&mq->table, e); - q_push(q, e); + q_del(e->dirty ? &mq->dirty : &mq->clean, e); } +static void push_queue(struct smq_policy *mq, struct entry *e) +{ + if (e->dirty) + q_push(&mq->dirty, e); + else + q_push(&mq->clean, e); +} + +// !h, !q, a -> h, q, a static void push(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; - h_insert(&mq->table, e); - - /* - * Punch this into the queue just in front of the sentinel, to - * ensure it's cleaned straight away. - */ - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_push_before(&mq->dirty, sentinel, e); - } else { - sentinel = demote_sentinel(mq, e->level); - q_push_before(&mq->clean, sentinel, e); - } + if (!e->pending_work) + push_queue(mq, e); } -/* - * Removes an entry from cache. Removes from the hash table. - */ -static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) +static void push_queue_front(struct smq_policy *mq, struct entry *e) { - q_del(q, e); - h_remove(&mq->table, e); + if (e->dirty) + q_push_front(&mq->dirty, e); + else + q_push_front(&mq->clean, e); } -static void del(struct smq_policy *mq, struct entry *e) +static void push_front(struct smq_policy *mq, struct entry *e) { - __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); -} - -static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) -{ - struct entry *e = q_pop_old(q, max_level); - if (e) - h_remove(&mq->table, e); - return e; + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue_front(mq, e); } static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) @@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) static void requeue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; + /* + * Pending work has temporarily been taken out of the queues. 
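As an aside, the requeue() path that follows only moves an entry up by a single level per observation window, using a per-cblock hit bitset. A hedged userspace sketch of that idea (names and sizes invented; in smq the bits are cleared when the cache period ends, which is simplified here to an explicit end_window() call):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_LEVELS 64u
#define NR_CBLOCKS 128u

static unsigned level[NR_CBLOCKS];
static bool hit_bits[NR_CBLOCKS];	/* cleared when the window ends */

static void record_hit(unsigned cblock)
{
	if (hit_bits[cblock])
		return;			/* already bumped in this window */

	hit_bits[cblock] = true;
	if (level[cblock] < NR_LEVELS - 1u)
		level[cblock]++;	/* the q_requeue(..., 1u, ...) step */
}

static void end_window(void)
{
	memset(hit_bits, 0, sizeof(hit_bits));
}

int main(void)
{
	record_hit(7);
	record_hit(7);			/* ignored: same window */
	end_window();
	record_hit(7);

	printf("cblock 7 is at level %u\n", level[7]);	/* 2 */
	return 0;
}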
+ */ + if (e->pending_work) + return; if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_requeue_before(&mq->dirty, sentinel, e, 1u); - } else { - sentinel = demote_sentinel(mq, e->level); - q_requeue_before(&mq->clean, sentinel, e, 1u); + if (!e->dirty) { + q_requeue(&mq->clean, e, 1u, NULL, NULL); + return; } + + q_requeue(&mq->dirty, e, 1u, + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); } } @@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq) unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); + /* * If the hotspot queue is performing badly then we have little * confidence that we know which blocks to promote. So we cut down @@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq) } mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); } /* @@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq) } } -static int demote_cblock(struct smq_policy *mq, - struct policy_locker *locker, - dm_oblock_t *oblock) +/*----------------------------------------------------------------*/ + +/* + * Targets are given as a percentage. + */ +#define CLEAN_TARGET 25u +#define FREE_TARGET 25u + +static unsigned percent_to_target(struct smq_policy *mq, unsigned p) { - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); - if (!demoted) - /* - * We could get a block from mq->dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock this block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_alloc, demoted); - - return 0; + return from_cblock(mq->cache_size) * p / 100u; } +static bool clean_target_met(struct smq_policy *mq, bool idle) +{ + /* + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); + + if (idle) + /* + * We'd like to clean everything. 
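A small standalone sketch (not from the patch; the numbers are invented) of how these percentage targets behave: work already queued with the background tracker counts towards the target so the policy does not over-queue, and idle mode wants everything clean. FREE_TARGET works the same way against the number of unallocated cache entries.

#include <stdbool.h>
#include <stdio.h>

#define CLEAN_TARGET 25u	/* percent; FREE_TARGET is handled the same way */

static unsigned percent_to_target(unsigned cache_size, unsigned p)
{
	return cache_size * p / 100u;
}

static bool clean_target_met(unsigned cache_size, unsigned nr_dirty,
			     unsigned writebacks_queued, bool idle)
{
	unsigned nr_clean = cache_size - nr_dirty;

	if (idle)
		return nr_dirty == 0u;	/* clean everything when idle */

	return nr_clean + writebacks_queued >=
	       percent_to_target(cache_size, CLEAN_TARGET);
}

int main(void)
{
	/* 1000-block cache, 900 dirty, 100 writebacks already queued */
	printf("busy: %d\n", clean_target_met(1000, 900, 100, false));	/* 0 */
	printf("idle: %d\n", clean_target_met(1000, 0, 0, true));	/* 1 */
	return 0;
}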
+ */ + return q_size(&mq->dirty) == 0u; + else + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= + percent_to_target(mq, CLEAN_TARGET); +} + +static bool free_target_met(struct smq_policy *mq, bool idle) +{ + unsigned nr_free = from_cblock(mq->cache_size) - + mq->cache_alloc.nr_allocated; + + if (idle) + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= + percent_to_target(mq, FREE_TARGET); + else + return true; +} + +/*----------------------------------------------------------------*/ + +static void mark_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(e->sentinel); + BUG_ON(!e->allocated); + BUG_ON(e->pending_work); + e->pending_work = true; +} + +static void clear_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(!e->pending_work); + e->pending_work = false; +} + +static void queue_writeback(struct smq_policy *mq) +{ + int r; + struct policy_work work; + struct entry *e; + + e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed); + if (e) { + mark_pending(mq, e); + q_del(&mq->dirty, e); + + work.op = POLICY_WRITEBACK; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + + r = btracker_queue(mq->bg_work, &work, NULL); + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. + } +} + +static void queue_demotion(struct smq_policy *mq) +{ + struct policy_work work; + struct entry *e; + + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + return; + + e = q_peek(&mq->clean, mq->clean.nr_levels, true); + if (!e) { + if (!clean_target_met(mq, false)) + queue_writeback(mq); + return; + } + + mark_pending(mq, e); + q_del(&mq->clean, e); + + work.op = POLICY_DEMOTE; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, NULL); +} + +static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, + struct policy_work **workp) +{ + struct entry *e; + struct policy_work work; + + if (!mq->migrations_allowed) + return; + + if (allocator_empty(&mq->cache_alloc)) { + if (!free_target_met(mq, false)) + queue_demotion(mq); + return; + } + + if (btracker_promotion_already_present(mq->bg_work, oblock)) + return; + + /* + * We allocate the entry now to reserve the cblock. If the + * background work is aborted we must remember to free it. + */ + e = alloc_entry(&mq->cache_alloc); + BUG_ON(!e); + e->pending_work = true; + work.op = POLICY_PROMOTE; + work.oblock = oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, workp); +} + +/*----------------------------------------------------------------*/ + enum promote_result { PROMOTE_NOT, PROMOTE_TEMPORARY, @@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote) return promote ? 
PROMOTE_PERMANENT : PROMOTE_NOT; } -static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, - bool fast_promote) +static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, + int data_dir, bool fast_promote) { - if (bio_data_dir(bio) == WRITE) { + if (data_dir == WRITE) { if (!allocator_empty(&mq->cache_alloc) && fast_promote) return PROMOTE_TEMPORARY; - else - return maybe_promote(hs_e->level >= mq->write_promote_level); + return maybe_promote(hs_e->level >= mq->write_promote_level); } else return maybe_promote(hs_e->level >= mq->read_promote_level); } -static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result, enum promote_result pr) -{ - int r; - struct entry *e; - - if (allocator_empty(&mq->cache_alloc)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return; - } - - } else - result->op = POLICY_NEW; - - e = alloc_entry(&mq->cache_alloc); - BUG_ON(!e); - e->oblock = oblock; - - if (pr == PROMOTE_TEMPORARY) - push(mq, e); - else - push_new(mq, e); - - result->cblock = infer_cblock(mq, e); -} - static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) { sector_t r = from_oblock(b); @@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) return to_oblock(r); } -static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) +static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) { unsigned hi; dm_oblock_t hb = to_hblock(mq, b); @@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, hi = get_index(&mq->hotspot_alloc, e); q_requeue(&mq->hotspot, e, test_and_set_bit(hi, mq->hotspot_hit_bits) ? - 0u : mq->hotspot_level_jump); + 0u : mq->hotspot_level_jump, + NULL, NULL); } else { stats_miss(&mq->hotspot_stats); @@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, return e; } -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. 
- */ -static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, - bool can_migrate, bool fast_promote, - struct policy_locker *locker, struct policy_result *result) -{ - struct entry *e, *hs_e; - enum promote_result pr; - - hs_e = update_hotspot_queue(mq, oblock, bio); - - e = h_lookup(&mq->table, oblock); - if (e) { - stats_level_accessed(&mq->cache_stats, e->level); - - requeue(mq, e); - result->op = POLICY_HIT; - result->cblock = infer_cblock(mq, e); - - } else { - stats_miss(&mq->cache_stats); - - pr = should_promote(mq, hs_e, bio, fast_promote); - if (pr == PROMOTE_NOT) - result->op = POLICY_MISS; - - else { - if (!can_migrate) { - result->op = POLICY_MISS; - return -EWOULDBLOCK; - } - - insert_in_cache(mq, oblock, locker, result, pr); - } - } - - return 0; -} - /*----------------------------------------------------------------*/ /* @@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p) { struct smq_policy *mq = to_smq_policy(p); + btracker_destroy(mq->bg_work); h_exit(&mq->hotspot_table); h_exit(&mq->table); free_bitset(mq->hotspot_hit_bits); @@ -1290,72 +1334,193 @@ static void smq_destroy(struct dm_cache_policy *p) kfree(mq); } -static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool fast_promote, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +/*----------------------------------------------------------------*/ + +static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work, bool *background_work) { - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); + struct entry *e, *hs_e; + enum promote_result pr; - result->op = POLICY_MISS; + *background_work = false; - spin_lock_irqsave(&mq->lock, flags); - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - struct entry *e; - - spin_lock_irqsave(&mq->lock, flags); e = h_lookup(&mq->table, oblock); if (e) { + stats_level_accessed(&mq->cache_stats, e->level); + + requeue(mq, e); *cblock = infer_cblock(mq, e); - r = 0; - } else - r = -ENOENT; + return 0; + + } else { + stats_miss(&mq->cache_stats); + + /* + * The hotspot queue only gets updated with misses. 
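The promotion decision itself is a threshold test against the hotspot entry's level. A self-contained approximation follows (thresholds and block numbers are invented; the real thresholds move up and down in update_promote_levels() according to cache performance):

#include <stdbool.h>
#include <stdio.h>

enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT,
};

#define DIR_READ  0
#define DIR_WRITE 1

static enum promote_result should_promote_sketch(unsigned hotspot_level,
						  unsigned read_promote_level,
						  unsigned write_promote_level,
						  int data_dir,
						  bool cache_has_room,
						  bool fast_promote)
{
	if (data_dir == DIR_WRITE) {
		if (cache_has_room && fast_promote)
			return PROMOTE_TEMPORARY;
		return hotspot_level >= write_promote_level ?
			PROMOTE_PERMANENT : PROMOTE_NOT;
	}

	return hotspot_level >= read_promote_level ?
		PROMOTE_PERMANENT : PROMOTE_NOT;
}

int main(void)
{
	printf("%d\n", should_promote_sketch(40, 32, 32, DIR_READ, true, false));	/* 2 */
	printf("%d\n", should_promote_sketch(10, 32, 32, DIR_READ, true, false));	/* 0 */
	return 0;
}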
+ */ + hs_e = update_hotspot_queue(mq, oblock); + + pr = should_promote(mq, hs_e, data_dir, fast_copy); + if (pr != PROMOTE_NOT) { + queue_promotion(mq, oblock, work); + *background_work = true; + } + + return -ENOENT; + } +} + +static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + bool *background_work) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, + data_dir, fast_copy, + NULL, background_work); spin_unlock_irqrestore(&mq->lock, flags); return r; } -static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) +static int smq_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - struct entry *e; + int r; + bool background_queued; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); + spin_unlock_irqrestore(&mq->lock, flags); - del(mq, e); - e->dirty = set; - push(mq, e); + return r; } -static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_get_background_work(struct dm_cache_policy *p, bool idle, + struct policy_work **result) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = btracker_issue(mq->bg_work, result); + if (r == -ENODATA) { + /* find some writeback work to do */ + if (mq->migrations_allowed && !free_target_met(mq, idle)) + queue_demotion(mq); + + else if (!clean_target_met(mq, idle)) + queue_writeback(mq); + + r = btracker_issue(mq->bg_work, result); + } + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +/* + * We need to clear any pending work flags that have been set, and in the + * case of promotion free the entry for the destination cblock. 
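The "// h, q, a" annotations in __complete_background_work() track whether an entry is hashed, queued and allocated. A userspace model of those transitions (state names invented; locking and the background tracker omitted):

#include <stdbool.h>
#include <stdio.h>

enum op { PROMOTE, DEMOTE, WRITEBACK };

struct entry_state {
	bool hashed;
	bool queued;
	bool allocated;
};

static void complete(struct entry_state *e, enum op op, bool success)
{
	switch (op) {
	case PROMOTE:		/* started as: !h, !q, a */
		if (success) {
			e->hashed = true;	/* push() hashes and queues it */
			e->queued = true;
		} else {
			e->allocated = false;	/* free the reserved cblock */
		}
		break;

	case DEMOTE:		/* started as: h, !q, a */
		if (success) {
			e->hashed = false;	/* mapping dropped entirely */
			e->allocated = false;
		} else {
			e->queued = true;	/* put it back on its queue */
		}
		break;

	case WRITEBACK:		/* started as: h, !q, a */
		e->queued = true;		/* always requeued */
		break;
	}
}

int main(void)
{
	struct entry_state e = { .allocated = true };

	complete(&e, PROMOTE, true);
	printf("h=%d q=%d a=%d\n", e.hashed, e.queued, e.allocated);	/* 1 1 1 */
	return 0;
}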
+ */ +static void __complete_background_work(struct smq_policy *mq, + struct policy_work *work, + bool success) +{ + struct entry *e = get_entry(&mq->cache_alloc, + from_cblock(work->cblock)); + + switch (work->op) { + case POLICY_PROMOTE: + // !h, !q, a + clear_pending(mq, e); + if (success) { + e->oblock = work->oblock; + push(mq, e); + // h, q, a + } else { + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } + break; + + case POLICY_DEMOTE: + // h, !q, a + if (success) { + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } else { + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + } + break; + + case POLICY_WRITEBACK: + // h, !q, a + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + break; + } + + btracker_complete(mq->bg_work, work); +} + +static void smq_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, true); + __complete_background_work(mq, work, success); spin_unlock_irqrestore(&mq->lock, flags); } -static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +// in_hash(oblock) -> in_hash(oblock) +static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) +{ + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (e->pending_work) + e->dirty = set; + else { + del_queue(mq, e); + e->dirty = set; + push_queue(mq, e); + } +} + +static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, true); + spin_unlock_irqrestore(&mq->lock, flags); +} + +static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); unsigned long flags; spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, false); + __smq_set_clear_dirty(mq, cblock, false); spin_unlock_irqrestore(&mq->lock, flags); } @@ -1366,17 +1531,38 @@ static unsigned random_level(dm_cblock_t cblock) static int smq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + bool dirty, uint32_t hint, bool hint_valid) { struct smq_policy *mq = to_smq_policy(p); struct entry *e; e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ + e->dirty = dirty; e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); - push(mq, e); + e->pending_work = false; + /* + * When we load mappings we push ahead of both sentinels in order to + * allow demotions and cleaning to occur immediately. + */ + push_front(mq, e); + + return 0; +} + +static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (!e->allocated) + return -ENODATA; + + // FIXME: what if this block has pending background work? 
+ del_queue(mq, e); + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); return 0; } @@ -1391,135 +1577,6 @@ static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) return e->level; } -static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; - - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); - - del(mq, e); - free_entry(&mq->cache_alloc, e); -} - -static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct smq_policy *mq = to_smq_policy(p); - unsigned long flags; - - spin_lock_irqsave(&mq->lock, flags); - __remove_mapping(mq, oblock); - spin_unlock_irqrestore(&mq->lock, flags); -} - -static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) -{ - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - - if (!e || !e->allocated) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_alloc, e); - - return 0; -} - -static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - r = __remove_cblock(mq, cblock); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - - -#define CLEAN_TARGET_CRITICAL 5u /* percent */ - -static bool clean_target_met(struct smq_policy *mq, bool critical) -{ - if (critical) { - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; - - return nr_clean >= target; - } else - return !q_size(&mq->dirty); -} - -static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - struct entry *e = NULL; - bool target_met = clean_target_met(mq, critical_only); - - if (critical_only) - /* - * Always try and keep the bottom level clean. - */ - e = pop_old(mq, &mq->dirty, target_met ? 
1u : mq->dirty.nr_levels); - - else - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); - - if (!e) - return -ENODATA; - - *oblock = e->oblock; - *cblock = infer_cblock(mq, e); - e->dirty = false; - push_new(mq, e); - - return 0; -} - -static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - r = __smq_writeback_work(mq, oblock, cblock, critical_only); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static void __force_mapping(struct smq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = h_lookup(&mq->table, current_oblock); - - if (e) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } -} - -static void smq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - __force_mapping(mq, current_oblock, new_oblock); - spin_unlock_irqrestore(&mq->lock, flags); -} - static dm_cblock_t smq_residency(struct dm_cache_policy *p) { dm_cblock_t r; @@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + struct smq_policy *mq = to_smq_policy(p); + mq->migrations_allowed = allow; +} + /* * smq has no config values, but the old mq policy did. To avoid breaking * software we continue to accept these configurables for the mq policy, @@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; - mq->policy.map = smq_map; mq->policy.lookup = smq_lookup; + mq->policy.lookup_with_work = smq_lookup_with_work; + mq->policy.get_background_work = smq_get_background_work; + mq->policy.complete_background_work = smq_complete_background_work; mq->policy.set_dirty = smq_set_dirty; mq->policy.clear_dirty = smq_clear_dirty; mq->policy.load_mapping = smq_load_mapping; + mq->policy.invalidate_mapping = smq_invalidate_mapping; mq->policy.get_hint = smq_get_hint; - mq->policy.remove_mapping = smq_remove_mapping; - mq->policy.remove_cblock = smq_remove_cblock; - mq->policy.writeback_work = smq_writeback_work; - mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + mq->policy.allow_migrations = smq_allow_migrations; if (mimic_mq) { mq->policy.set_config_value = mq_set_config_value; @@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size, static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, - bool mimic_mq) + bool mimic_mq, + bool migrations_allowed) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, } init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); - for (i = 0; i < nr_sentinels_per_queue; i++) + 
for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, @@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + if (!mq->bg_work) + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; + return &mq->policy; +bad_btracker: + h_exit(&mq->hotspot_table); bad_alloc_hotspot_table: h_exit(&mq->table); bad_alloc_table: @@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false); + return __smq_create(cache_size, origin_size, cache_block_size, false, true); } static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, true); + return __smq_create(cache_size, origin_size, cache_block_size, true, true); +} + +static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false, false); } /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create @@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = { static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, }; +static struct dm_cache_policy_type cleaner_policy_type = { + .name = "cleaner", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = cleaner_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1785,23 +1872,36 @@ static int __init smq_init(void) r = dm_cache_policy_register(&mq_policy_type); if (r) { DMERR("register failed (as mq) %d", r); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_mq; + } + + r = dm_cache_policy_register(&cleaner_policy_type); + if (r) { + DMERR("register failed (as cleaner) %d", r); + goto out_cleaner; } r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); - dm_cache_policy_unregister(&mq_policy_type); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_default; } return 0; + +out_default: + dm_cache_policy_unregister(&cleaner_policy_type); +out_cleaner: + dm_cache_policy_unregister(&mq_policy_type); +out_mq: + dm_cache_policy_unregister(&smq_policy_type); + + return -ENOMEM; } static void __exit smq_exit(void) { + dm_cache_policy_unregister(&cleaner_policy_type); dm_cache_policy_unregister(&smq_policy_type); dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); @@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); MODULE_ALIAS("dm-cache-mq"); +MODULE_ALIAS("dm-cache-cleaner"); diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h 
index aa10b1493f34..c05fc3436cef 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -13,147 +13,94 @@ /*----------------------------------------------------------------*/ -/* FIXME: make it clear which methods are optional. Get debug policy to - * double check this at start. - */ - /* * The cache policy makes the important decisions about which blocks get to * live on the faster cache device. - * - * When the core target has to remap a bio it calls the 'map' method of the - * policy. This returns an instruction telling the core target what to do. - * - * POLICY_HIT: - * That block is in the cache. Remap to the cache and carry on. - * - * POLICY_MISS: - * This block is on the origin device. Remap and carry on. - * - * POLICY_NEW: - * This block is currently on the origin device, but the policy wants to - * move it. The core should: - * - * - hold any further io to this origin block - * - copy the origin to the given cache block - * - release all the held blocks - * - remap the original block to the cache - * - * POLICY_REPLACE: - * This block is currently on the origin device. The policy wants to - * move it to the cache, with the added complication that the destination - * cache block needs a writeback first. The core should: - * - * - hold any further io to this origin block - * - hold any further io to the origin block that's being written back - * - writeback - * - copy new block to cache - * - release held blocks - * - remap bio to cache and reissue. - * - * Should the core run into trouble while processing a POLICY_NEW or - * POLICY_REPLACE instruction it will roll back the policies mapping using - * remove_mapping() or force_mapping(). These methods must not fail. This - * approach avoids having transactional semantics in the policy (ie, the - * core informing the policy when a migration is complete), and hence makes - * it easier to write new policies. - * - * In general policy methods should never block, except in the case of the - * map function when can_migrate is set. So be careful to implement using - * bounded, preallocated memory. */ enum policy_operation { - POLICY_HIT, - POLICY_MISS, - POLICY_NEW, - POLICY_REPLACE -}; - -/* - * When issuing a POLICY_REPLACE the policy needs to make a callback to - * lock the block being demoted. This doesn't need to occur during a - * writeback operation since the block remains in the cache. - */ -struct policy_locker; -typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); - -struct policy_locker { - policy_lock_fn fn; + POLICY_PROMOTE, + POLICY_DEMOTE, + POLICY_WRITEBACK }; /* * This is the instruction passed back to the core target. */ -struct policy_result { +struct policy_work { enum policy_operation op; - dm_oblock_t old_oblock; /* POLICY_REPLACE */ - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ + dm_oblock_t oblock; + dm_cblock_t cblock; }; /* - * The cache policy object. Just a bunch of methods. It is envisaged that - * this structure will be embedded in a bigger, policy specific structure - * (ie. use container_of()). + * The cache policy object. It is envisaged that this structure will be + * embedded in a bigger, policy specific structure (ie. use container_of()). */ struct dm_cache_policy { - - /* - * FIXME: make it clear which methods are optional, and which may - * block. - */ - /* * Destroys this object. */ void (*destroy)(struct dm_cache_policy *p); /* - * See large comment above. - * - * oblock - the origin block we're interested in. 
- * - * can_block - indicates whether the current thread is allowed to - * block. -EWOULDBLOCK returned if it can't and would. - * - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE - * instructions. If denied and the policy would have - * returned one of these instructions it should - * return -EWOULDBLOCK. - * - * discarded_oblock - indicates whether the whole origin block is - * in a discarded state (FIXME: better to tell the - * policy about this sooner, so it can recycle that - * cache block if it wants.) - * bio - the bio that triggered this call. - * result - gets filled in with the instruction. - * - * May only return 0, or -EWOULDBLOCK (if !can_migrate) - */ - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result); - - /* - * Sometimes we want to see if a block is in the cache, without - * triggering any update of stats. (ie. it's not a real hit). + * Find the location of a block. * * Must not block. * - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors - * (-EWOULDBLOCK would be typical). + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for + * other errors (-EWOULDBLOCK would be typical). data_dir should be + * READ or WRITE. fast_copy should be set if migrating this block would + * be 'cheap' somehow (eg, discarded data). background_queued will be set + * if a migration has just been queued. */ - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued); - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + /* + * Sometimes the core target can optimise a migration, eg, the + * block may be discarded, or the bio may cover an entire block. + * In order to optimise it needs the migration immediately though + * so it knows to do something different with the bio. + * + * This method is optional (policy-internal will fallback to using + * lookup). + */ + int (*lookup_with_work)(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work); + + /* + * Retrieves background work. Returns -ENODATA when there's no + * background work. + */ + int (*get_background_work)(struct dm_cache_policy *p, bool idle, + struct policy_work **result); + + /* + * You must pass in the same work pointer that you were given, not + * a copy. + */ + void (*complete_background_work)(struct dm_cache_policy *p, + struct policy_work *work, + bool success); + + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * Called when a cache target is first created. Used to load a * mapping from the metadata device into the policy. */ int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, - dm_cblock_t cblock, uint32_t hint, bool hint_valid); + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); + + /* + * Drops the mapping, irrespective of whether it's clean or dirty. + * Returns -ENODATA if cblock is not mapped. + */ + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * Gets the hint for a given cblock. 
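Taken together, the new interface is driven like this: lookup() may queue background work, the core target later pulls it with get_background_work() and must hand the same pointer back to complete_background_work() with a success flag. A hedged userspace model of that loop (the stub policy, block numbers and the use of -ENODATA as a plain return value are stand-ins, not the real implementation):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum policy_operation { POLICY_PROMOTE, POLICY_DEMOTE, POLICY_WRITEBACK };

struct policy_work {
	enum policy_operation op;
	unsigned long oblock;
	unsigned long cblock;
};

/* A one-shot stand-in for policy->get_background_work(). */
static int get_background_work(bool idle, struct policy_work **result)
{
	static struct policy_work promote = { POLICY_PROMOTE, 42, 7 };
	static bool issued;

	(void)idle;
	if (issued)
		return -ENODATA;	/* nothing left to do */

	issued = true;
	*result = &promote;
	return 0;
}

static void complete_background_work(struct policy_work *work, bool success)
{
	printf("op %d %s\n", work->op, success ? "completed" : "aborted");
}

static bool do_migration(struct policy_work *work)
{
	/* a real target would copy between oblock and cblock here */
	(void)work;
	return true;
}

int main(void)
{
	struct policy_work *work;

	while (!get_background_work(true, &work))
		complete_background_work(work, do_migration(work));

	return 0;
}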
Called in a single threaded @@ -161,36 +108,6 @@ struct dm_cache_policy { */ uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); - /* - * Override functions used on the error paths of the core target. - * They must succeed. - */ - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, - dm_oblock_t new_oblock); - - /* - * This is called via the invalidate_cblocks message. It is - * possible the particular cblock has already been removed due to a - * write io in passthrough mode. In which case this should return - * -ENODATA. - */ - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); - - /* - * Provide a dirty block to be written back by the core target. If - * critical_only is set then the policy should only provide work if - * it urgently needs it. - * - * Returns: - * - * 0 and @cblock,@oblock: block to write back provided - * - * -ENODATA: no dirty blocks available - */ - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, - bool critical_only); - /* * How full is the cache? */ @@ -202,6 +119,8 @@ struct dm_cache_policy { * queue merging has occurred). To stop the policy being fooled by * these, the core target sends regular tick() calls to the policy. * The policy should only count an entry as hit once per tick. + * + * This method is optional. */ void (*tick)(struct dm_cache_policy *p, bool can_block); @@ -213,6 +132,8 @@ struct dm_cache_policy { int (*set_config_value)(struct dm_cache_policy *p, const char *key, const char *value); + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); + /* * Book keeping ptr for the policy register, not for general use. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 975922c8f231..1db375f50a13 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -5,7 +5,7 @@ */ #include "dm.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -#define IOT_RESOLUTION 4 +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ struct io_tracker { spinlock_t lock; @@ -99,18 +111,177 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) /*----------------------------------------------------------------*/ /* - * Glossary: - * - * oblock: index of an origin block - * cblock: index of a cache block - * promotion: movement of a block from origin to cache - * demotion: movement of a block from cache to origin - * migration: movement of a block between the origin and cache device, - * either direction + * Represents a chunk of future work. 'input' allows continuations to pass + * values between themselves, typically error values. 
*/ +struct continuation { + struct work_struct ws; + int input; +}; + +static inline void init_continuation(struct continuation *k, + void (*fn)(struct work_struct *)) +{ + INIT_WORK(&k->ws, fn); + k->input = 0; +} + +static inline void queue_continuation(struct workqueue_struct *wq, + struct continuation *k) +{ + queue_work(wq, &k->ws); +} /*----------------------------------------------------------------*/ +/* + * The batcher collects together pieces of work that need a particular + * operation to occur before they can proceed (typically a commit). + */ +struct batcher { + /* + * The operation that everyone is waiting for. + */ + int (*commit_op)(void *context); + void *commit_context; + + /* + * This is how bios should be issued once the commit op is complete + * (accounted_request). + */ + void (*issue_op)(struct bio *bio, void *context); + void *issue_context; + + /* + * Queued work gets put on here after commit. + */ + struct workqueue_struct *wq; + + spinlock_t lock; + struct list_head work_items; + struct bio_list bios; + struct work_struct commit_work; + + bool commit_scheduled; +}; + +static void __commit(struct work_struct *_ws) +{ + struct batcher *b = container_of(_ws, struct batcher, commit_work); + + int r; + unsigned long flags; + struct list_head work_items; + struct work_struct *ws, *tmp; + struct continuation *k; + struct bio *bio; + struct bio_list bios; + + INIT_LIST_HEAD(&work_items); + bio_list_init(&bios); + + /* + * We have to grab these before the commit_op to avoid a race + * condition. + */ + spin_lock_irqsave(&b->lock, flags); + list_splice_init(&b->work_items, &work_items); + bio_list_merge(&bios, &b->bios); + bio_list_init(&b->bios); + b->commit_scheduled = false; + spin_unlock_irqrestore(&b->lock, flags); + + r = b->commit_op(b->commit_context); + + list_for_each_entry_safe(ws, tmp, &work_items, entry) { + k = container_of(ws, struct continuation, ws); + k->input = r; + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ + queue_work(b->wq, ws); + } + + while ((bio = bio_list_pop(&bios))) { + if (r) { + bio->bi_error = r; + bio_endio(bio); + } else + b->issue_op(bio, b->issue_context); + } +} + +static void batcher_init(struct batcher *b, + int (*commit_op)(void *), + void *commit_context, + void (*issue_op)(struct bio *bio, void *), + void *issue_context, + struct workqueue_struct *wq) +{ + b->commit_op = commit_op; + b->commit_context = commit_context; + b->issue_op = issue_op; + b->issue_context = issue_context; + b->wq = wq; + + spin_lock_init(&b->lock); + INIT_LIST_HEAD(&b->work_items); + bio_list_init(&b->bios); + INIT_WORK(&b->commit_work, __commit); + b->commit_scheduled = false; +} + +static void async_commit(struct batcher *b) +{ + queue_work(b->wq, &b->commit_work); +} + +static void continue_after_commit(struct batcher *b, struct continuation *k) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + list_add_tail(&k->ws.entry, &b->work_items); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Bios are errored if commit failed. 
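A simplified standalone model of the batcher pattern (no workqueues, locking or bios; function names invented): everything that queued up behind a commit sees that single commit's result, which is how failed commits end up erroring the waiting bios.

#include <stdio.h>

#define MAX_WAITERS 16

struct batcher_sketch {
	int (*commit_op)(void *context);
	void *commit_context;
	int nr_waiters;
	void (*waiters[MAX_WAITERS])(int commit_result);
};

static void continue_after_commit_sketch(struct batcher_sketch *b, void (*fn)(int))
{
	b->waiters[b->nr_waiters++] = fn;
}

static void commit_sketch(struct batcher_sketch *b)
{
	int r = b->commit_op(b->commit_context);
	int i;

	/* one commit services every queued waiter */
	for (i = 0; i < b->nr_waiters; i++)
		b->waiters[i](r);
	b->nr_waiters = 0;
}

static int fake_commit(void *context)
{
	(void)context;
	return 0;		/* non-zero would error the waiting work */
}

static void bio_done(int r)
{
	printf("bio finished with %d\n", r);
}

int main(void)
{
	struct batcher_sketch b = { .commit_op = fake_commit };

	continue_after_commit_sketch(&b, bio_done);
	continue_after_commit_sketch(&b, bio_done);
	commit_sketch(&b);	/* prints twice */
	return 0;
}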
+ */ +static void issue_after_commit(struct batcher *b, struct bio *bio) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + bio_list_add(&b->bios, bio); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Call this if some urgent work is waiting for the commit to complete. + */ +static void schedule_commit(struct batcher *b) +{ + bool immediate; + unsigned long flags; + + spin_lock_irqsave(&b->lock, flags); + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); + b->commit_scheduled = true; + spin_unlock_irqrestore(&b->lock, flags); + + if (immediate) + async_commit(b); +} + /* * There are a couple of places where we let a bio run, but want to do some * work before calling its endio function. We do this by temporarily @@ -189,31 +360,13 @@ struct cache_stats { atomic_t write_miss; atomic_t demotion; atomic_t promotion; + atomic_t writeback; atomic_t copies_avoided; atomic_t cache_cell_clash; atomic_t commit_count; atomic_t discard_count; }; -/* - * Defines a range of cblocks, begin to (end - 1) are in the range. end is - * the one-past-the-end value. - */ -struct cblock_range { - dm_cblock_t begin; - dm_cblock_t end; -}; - -struct invalidation_request { - struct list_head list; - struct cblock_range *cblocks; - - atomic_t complete; - int err; - - wait_queue_head_t result_wait; -}; - struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -255,11 +408,7 @@ struct cache { spinlock_t lock; struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_flush_bios; struct bio_list deferred_writethrough_bios; - struct list_head quiesced_migrations; - struct list_head completed_migrations; - struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -270,9 +419,7 @@ struct cache { */ atomic_t nr_io_migrations; - wait_queue_head_t quiescing_wait; - atomic_t quiescing; - atomic_t quiescing_ack; + struct rw_semaphore quiesce_lock; /* * cache_size entries, dirty if set @@ -296,13 +443,11 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; - struct work_struct worker; - + struct work_struct deferred_bio_worker; + struct work_struct deferred_writethrough_worker; + struct work_struct migration_worker; struct delayed_work waker; - unsigned long last_commit_jiffies; - - struct dm_bio_prison *prison; - struct dm_deferred_set *all_io_ds; + struct dm_bio_prison_v2 *prison; mempool_t *migration_pool; @@ -330,12 +475,17 @@ struct cache { struct list_head invalidation_requests; struct io_tracker origin_tracker; + + struct work_struct commit_ws; + struct batcher committer; + + struct rw_semaphore background_work_lock; }; struct per_bio_data { bool tick:1; unsigned req_nr:2; - struct dm_deferred_entry *all_io_entry; + struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; @@ -350,55 +500,64 @@ struct per_bio_data { }; struct dm_cache_migration { - struct list_head list; + struct continuation k; struct cache *cache; - unsigned long start_jiffies; - dm_oblock_t old_oblock; - dm_oblock_t new_oblock; - dm_cblock_t cblock; + struct policy_work *op; + struct bio *overwrite_bio; + struct dm_bio_prison_cell_v2 *cell; - bool err:1; - bool discard:1; - bool writeback:1; - bool demote:1; - bool promote:1; - bool requeue_holder:1; - bool invalidate:1; - - struct dm_bio_prison_cell *old_ocell; - 
struct dm_bio_prison_cell *new_ocell; + dm_cblock_t invalidate_cblock; + dm_oblock_t invalidate_oblock; }; -/* - * Processing a bio in the worker thread may require these memory - * allocations. We prealloc to avoid deadlocks (the same worker thread - * frees them back to the mempool). - */ -struct prealloc { - struct dm_cache_migration *mg; - struct dm_bio_prison_cell *cell1; - struct dm_bio_prison_cell *cell2; -}; +/*----------------------------------------------------------------*/ -static enum cache_metadata_mode get_cache_mode(struct cache *cache); - -static void wake_worker(struct cache *cache) +static bool writethrough_mode(struct cache_features *f) { - queue_work(cache->wq, &cache->worker); + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static inline bool passthrough_mode(struct cache_features *f) +{ + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ -static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) +static void wake_deferred_bio_worker(struct cache *cache) { - /* FIXME: change to use a local slab. */ - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); + queue_work(cache->wq, &cache->deferred_bio_worker); } -static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) +static void wake_deferred_writethrough_worker(struct cache *cache) { - dm_bio_prison_free_cell(cache->prison, cell); + queue_work(cache->wq, &cache->deferred_writethrough_worker); +} + +static void wake_migration_worker(struct cache *cache) +{ + if (passthrough_mode(&cache->features)) + return; + + queue_work(cache->wq, &cache->migration_worker); +} + +/*----------------------------------------------------------------*/ + +static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) +{ + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); +} + +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) +{ + dm_bio_prison_free_cell_v2(cache->prison, cell); } static struct dm_cache_migration *alloc_migration(struct cache *cache) @@ -424,91 +583,14 @@ static void free_migration(struct dm_cache_migration *mg) mempool_free(mg, cache->migration_pool); } -static int prealloc_data_structs(struct cache *cache, struct prealloc *p) -{ - if (!p->mg) { - p->mg = alloc_migration(cache); - if (!p->mg) - return -ENOMEM; - } - - if (!p->cell1) { - p->cell1 = alloc_prison_cell(cache); - if (!p->cell1) - return -ENOMEM; - } - - if (!p->cell2) { - p->cell2 = alloc_prison_cell(cache); - if (!p->cell2) - return -ENOMEM; - } - - return 0; -} - -static void prealloc_free_structs(struct cache *cache, struct prealloc *p) -{ - if (p->cell2) - free_prison_cell(cache, p->cell2); - - if (p->cell1) - free_prison_cell(cache, p->cell1); - - if (p->mg) - free_migration(p->mg); -} - -static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) -{ - struct dm_cache_migration *mg = p->mg; - - BUG_ON(!mg); - p->mg = NULL; - - return mg; -} - -/* - * You must have a cell within the prealloc struct to return. If not this - * function will BUG() rather than returning NULL. 
- */ -static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) -{ - struct dm_bio_prison_cell *r = NULL; - - if (p->cell1) { - r = p->cell1; - p->cell1 = NULL; - - } else if (p->cell2) { - r = p->cell2; - p->cell2 = NULL; - } else - BUG(); - - return r; -} - -/* - * You can't have more than two cells in a prealloc struct. BUG() will be - * called if you try and overfill. - */ -static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) -{ - if (!p->cell2) - p->cell2 = cell; - - else if (!p->cell1) - p->cell1 = cell; - - else - BUG(); -} - /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) +static inline dm_oblock_t oblock_succ(dm_oblock_t b) +{ + return to_oblock(from_oblock(b) + 1ull); +} + +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) { key->virtual = 0; key->dev = 0; @@ -517,53 +599,111 @@ static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *ke } /* - * The caller hands in a preallocated cell, and a free function for it. - * The cell will be freed if there's an error, or if it wasn't used because - * a cell with that key already exists. + * We have two lock levels. Level 0, which is used to prevent WRITEs, and + * level 1 which prevents *both* READs and WRITEs. */ -typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); +#define WRITE_LOCK_LEVEL 0 +#define READ_WRITE_LOCK_LEVEL 1 -static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +static unsigned lock_level(struct bio *bio) { - int r; - struct dm_cell_key key; - - build_key(oblock_begin, oblock_end, &key); - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); - if (r) - free_fn(free_context, cell_prealloc); - - return r; + return bio_data_dir(bio) == WRITE ? + WRITE_LOCK_LEVEL : + READ_WRITE_LOCK_LEVEL; } -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ + +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) + +static size_t get_per_bio_data_size(struct cache *cache) { + return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; +} + +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + BUG_ON(!pb); + return pb; +} + +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = get_per_bio_data(bio, data_size); + + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->cell = NULL; + pb->len = 0; + + return pb; +} + +/*----------------------------------------------------------------*/ + +static void defer_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_deferred_bio_worker(cache); +} + +static void defer_bios(struct cache *cache, struct bio_list *bios) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&cache->deferred_bios, bios); + bio_list_init(bios); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_deferred_bio_worker(cache); +} + +/*----------------------------------------------------------------*/ + +static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) +{ + bool r; + size_t pb_size; + struct per_bio_data *pb; + struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); - return bio_detain_range(cache, oblock, end, bio, - cell_prealloc, free_fn, free_context, cell_result); -} + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; -static int get_cell(struct cache *cache, - dm_oblock_t oblock, - struct prealloc *structs, - struct dm_bio_prison_cell **cell_result) -{ - int r; - struct dm_cell_key key; - struct dm_bio_prison_cell *cell_prealloc; + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ + if (!cell_prealloc) { + defer_bio(cache, bio); + return false; + } - cell_prealloc = prealloc_get_cell(structs); + build_key(oblock, end, &key); + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); + if (!r) { + /* + * Failed to get the lock. + */ + free_prison_cell(cache, cell_prealloc); + return r; + } - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); - if (r) - prealloc_put_cell(structs, cell_prealloc); + if (cell != cell_prealloc) + free_prison_cell(cache, cell_prealloc); + + pb_size = get_per_bio_data_size(cache); + pb = get_per_bio_data(bio, pb_size); + pb->cell = cell; return r; } @@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) return test_bit(from_cblock(b), cache->dirty_bitset); } -static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +static void set_dirty(struct cache *cache, dm_cblock_t cblock) { if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { atomic_inc(&cache->nr_dirty); - policy_set_dirty(cache->policy, oblock); + policy_set_dirty(cache->policy, cblock); } } -static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +/* + * These two are called when setting after migrations to force the policy + * and dirty bitset to be in sync. 
+ */ +static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); +} + +static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) { if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { - policy_clear_dirty(cache->policy, oblock); if (atomic_dec_return(&cache->nr_dirty) == 0) dm_table_event(cache->ti->table); } + + policy_clear_dirty(cache->policy, cblock); } /*----------------------------------------------------------------*/ @@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) oblocks_per_dblock(cache))); } -static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) -{ - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); -} - static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; @@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) return r; } -/*----------------------------------------------------------------*/ - -static void load_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - dm_cache_metadata_get_stats(cache->cmd, &stats); - atomic_set(&cache->stats.read_hit, stats.read_hits); - atomic_set(&cache->stats.read_miss, stats.read_misses); - atomic_set(&cache->stats.write_hit, stats.write_hits); - atomic_set(&cache->stats.write_miss, stats.write_misses); -} - -static void save_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - if (get_cache_mode(cache) >= CM_READ_ONLY) - return; - - stats.read_hits = atomic_read(&cache->stats.read_hit); - stats.read_misses = atomic_read(&cache->stats.read_miss); - stats.write_hits = atomic_read(&cache->stats.write_hit); - stats.write_misses = atomic_read(&cache->stats.write_miss); - - dm_cache_metadata_set_stats(cache->cmd, &stats); -} - -/*---------------------------------------------------------------- - * Per bio data - *--------------------------------------------------------------*/ - -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static bool writethrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITETHROUGH; -} - -static bool writeback_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITEBACK; -} - -static bool passthrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_PASSTHROUGH; -} - -static size_t get_per_bio_data_size(struct cache *cache) -{ - return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); - BUG_ON(!pb); - return pb; -} - -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = get_per_bio_data(bio, data_size); - - pb->tick = false; - pb->req_nr = dm_bio_get_target_bio_nr(bio); - pb->all_io_entry = NULL; - pb->len = 0; - - return pb; -} - /*---------------------------------------------------------------- * Remapping *--------------------------------------------------------------*/ @@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) } static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) + dm_oblock_t oblock) { + // FIXME: this is called way too much. check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) @@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { - set_dirty(cache, oblock, cblock); + set_dirty(cache, cblock); clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -/* - * You must increment the deferred set whilst the prison cell is held. To - * encourage this, we ask for 'cell' to be passed in. - */ -static void inc_ds(struct cache *cache, struct bio *bio, - struct dm_bio_prison_cell *cell) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(!cell); - BUG_ON(pb->all_io_entry); - - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -} - static bool accountable_bio(struct cache *cache, struct bio *bio) { return ((bio->bi_bdev == cache->origin_dev->bdev) && @@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio) generic_make_request(bio); } -static void issue(struct cache *cache, struct bio *bio) +static void issue_op(struct bio *bio, void *context) { - unsigned long flags; - - if (!op_is_flush(bio->bi_opf)) { - accounted_request(cache, bio); - return; - } - - /* - * Batch together any bios that trigger commits and then issue a - * single commit for them in do_worker(). - */ - spin_lock_irqsave(&cache->lock, flags); - cache->commit_requested = true; - bio_list_add(&cache->deferred_flush_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) -{ - inc_ds(cache, bio, cell); - issue(cache, bio); + struct cache *cache = context; + accounted_request(cache, bio); } static void defer_writethrough_bio(struct cache *cache, struct bio *bio) @@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) bio_list_add(&cache->deferred_writethrough_bios, bio); spin_unlock_irqrestore(&cache->lock, flags); - wake_worker(cache); + wake_deferred_writethrough_worker(cache); } static void writethrough_endio(struct bio *bio) @@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio) } /* + * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks * to both the cache and origin devices. 
In future we'd like to clone the * bio and send them in parallel, but for now we're doing them in @@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r set_cache_mode(cache, CM_READ_ONLY); } +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +static void update_stats(struct cache_stats *stats, enum policy_operation op) +{ + switch (op) { + case POLICY_PROMOTE: + atomic_inc(&stats->promotion); + break; + + case POLICY_DEMOTE: + atomic_inc(&stats->demotion); + break; + + case POLICY_WRITEBACK: + atomic_inc(&stats->writeback); + break; + } +} + /*---------------------------------------------------------------- * Migration processing * * Migration covers moving data from the origin device to the cache, or * vice versa. *--------------------------------------------------------------*/ + static void inc_io_migrations(struct cache *cache) { atomic_inc(&cache->nr_io_migrations); @@ -1067,264 +1150,6 @@ static bool discard_or_flush(struct bio *bio) return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } -static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - if (discard_or_flush(cell->holder)) { - /* - * We have to handle these bios individually. - */ - dm_cell_release(cache->prison, cell, &cache->deferred_bios); - free_prison_cell(cache, cell); - } else - list_add_tail(&cell->user_list, &cache->deferred_cells); -} - -static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) -{ - unsigned long flags; - - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { - /* - * There was no prisoner to promote to holder, the - * cell has been released. 
- */ - free_prison_cell(cache, cell); - return; - } - - spin_lock_irqsave(&cache->lock, flags); - __cell_defer(cache, cell); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) -{ - dm_cell_error(cache->prison, cell, err); - free_prison_cell(cache, cell); -} - -static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); -} - -static void free_io_migration(struct dm_cache_migration *mg) -{ - struct cache *cache = mg->cache; - - dec_io_migrations(cache); - free_migration(mg); - wake_worker(cache); -} - -static void migration_failure(struct dm_cache_migration *mg) -{ - struct cache *cache = mg->cache; - const char *dev_name = cache_device_name(cache); - - if (mg->writeback) { - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); - set_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - - } else if (mg->demote) { - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); - - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - } else { - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); - policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, true); - } - - free_io_migration(mg); -} - -static void migration_success_pre_commit(struct dm_cache_migration *mg) -{ - int r; - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - clear_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - free_io_migration(mg); - return; - - } else if (mg->demote) { - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); - if (r) { - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - policy_force_mapping(cache->policy, mg->new_oblock, - mg->old_oblock); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - free_io_migration(mg); - return; - } - } else { - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); - if (r) { - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - policy_remove_mapping(cache->policy, mg->new_oblock); - free_io_migration(mg); - return; - } - } - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->need_commit_migrations); - cache->commit_requested = true; - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void migration_success_post_commit(struct dm_cache_migration *mg) -{ - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", - cache_device_name(cache)); - return; - - } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); - - if (mg->promote) { - mg->demote = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->quiesced_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - } else { - if (mg->invalidate) - policy_remove_mapping(cache->policy, mg->old_oblock); - free_io_migration(mg); - } - - } else { - if (mg->requeue_holder) { - clear_dirty(cache, mg->new_oblock, mg->cblock); - cell_defer(cache, mg->new_ocell, true); - } else { - /* - * The block was promoted via an overwrite, so it's dirty. - */ - set_dirty(cache, mg->new_oblock, mg->cblock); - bio_endio(mg->new_ocell->holder); - cell_defer(cache, mg->new_ocell, false); - } - free_io_migration(mg); - } -} - -static void copy_complete(int read_err, unsigned long write_err, void *context) -{ - unsigned long flags; - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; - struct cache *cache = mg->cache; - - if (read_err || write_err) - mg->err = true; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void issue_copy(struct dm_cache_migration *mg) -{ - int r; - struct dm_io_region o_region, c_region; - struct cache *cache = mg->cache; - sector_t cblock = from_cblock(mg->cblock); - - o_region.bdev = cache->origin_dev->bdev; - o_region.count = cache->sectors_per_block; - - c_region.bdev = cache->cache_dev->bdev; - c_region.sector = cblock * cache->sectors_per_block; - c_region.count = cache->sectors_per_block; - - if (mg->writeback || mg->demote) { - /* demote */ - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); - } else { - /* promote */ - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); - } - - if (r < 0) { - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); - migration_failure(mg); - } -} - -static void overwrite_endio(struct bio *bio) -{ - struct dm_cache_migration *mg = bio->bi_private; - struct cache *cache = mg->cache; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - unsigned long flags; - - dm_unhook_bio(&pb->hook_info, bio); - - if (bio->bi_error) - mg->err = true; - - mg->requeue_holder = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) -{ - size_t pb_data_size = get_per_bio_data_size(mg->cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); - - /* - * No need to inc_ds() here, since the cell will be held for the - * duration of the io. 
- */ - accounted_request(mg->cache, bio); -} - -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) -{ - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); -} - -static void avoid_copy(struct dm_cache_migration *mg) -{ - atomic_inc(&mg->cache->stats.copies_avoided); - migration_success_pre_commit(mg); -} - static void calc_discard_block_range(struct cache *cache, struct bio *bio, dm_dblock_t *b, dm_dblock_t *e) { @@ -1339,311 +1164,572 @@ static void calc_discard_block_range(struct cache *cache, struct bio *bio, *e = to_dblock(block_div(se, cache->discard_block_size)); } -static void issue_discard(struct dm_cache_migration *mg) +/*----------------------------------------------------------------*/ + +static void prevent_background_work(struct cache *cache) { - dm_dblock_t b, e; - struct bio *bio = mg->new_ocell->holder; - struct cache *cache = mg->cache; - - calc_discard_block_range(cache, bio, &b, &e); - while (b != e) { - set_discard(cache, b); - b = to_dblock(from_dblock(b) + 1); - } - - bio_endio(bio); - cell_defer(cache, mg->new_ocell, false); - free_migration(mg); - wake_worker(cache); + lockdep_off(); + down_write(&cache->background_work_lock); + lockdep_on(); } -static void issue_copy_or_discard(struct dm_cache_migration *mg) +static void allow_background_work(struct cache *cache) { - bool avoid; - struct cache *cache = mg->cache; - - if (mg->discard) { - issue_discard(mg); - return; - } - - if (mg->writeback || mg->demote) - avoid = !is_dirty(cache, mg->cblock) || - is_discarded_oblock(cache, mg->old_oblock); - else { - struct bio *bio = mg->new_ocell->holder; - - avoid = is_discarded_oblock(cache, mg->new_oblock); - - if (writeback_mode(&cache->features) && - !avoid && bio_writes_complete_block(cache, bio)) { - issue_overwrite(mg, bio); - return; - } - } - - avoid ? 
avoid_copy(mg) : issue_copy(mg); + lockdep_off(); + up_write(&cache->background_work_lock); + lockdep_on(); } -static void complete_migration(struct dm_cache_migration *mg) +static bool background_work_begin(struct cache *cache) { - if (mg->err) - migration_failure(mg); + bool r; + + lockdep_off(); + r = down_read_trylock(&cache->background_work_lock); + lockdep_on(); + + return r; +} + +static void background_work_end(struct cache *cache) +{ + lockdep_off(); + up_read(&cache->background_work_lock); + lockdep_on(); +} + +/*----------------------------------------------------------------*/ + +static void quiesce(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + init_continuation(&mg->k, continuation); + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); +} + +static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) +{ + struct continuation *k = container_of(ws, struct continuation, ws); + return container_of(k, struct dm_cache_migration, k); +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); + + if (read_err || write_err) + mg->k.input = -EIO; + + queue_continuation(mg->cache->wq, &mg->k); +} + +static int copy(struct dm_cache_migration *mg, bool promote) +{ + int r; + struct dm_io_region o_region, c_region; + struct cache *cache = mg->cache; + + o_region.bdev = cache->origin_dev->bdev; + o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; + o_region.count = cache->sectors_per_block; + + c_region.bdev = cache->cache_dev->bdev; + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; + c_region.count = cache->sectors_per_block; + + if (promote) + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); else - migration_success_pre_commit(mg); + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); + + return r; } -static void process_migrations(struct cache *cache, struct list_head *head, - void (*fn)(struct dm_cache_migration *)) +static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) { - unsigned long flags; - struct list_head list; - struct dm_cache_migration *mg, *tmp; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - INIT_LIST_HEAD(&list); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(head, &list); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(mg, tmp, &list, list) - fn(mg); + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) + free_prison_cell(cache, pb->cell); + pb->cell = NULL; } -static void __queue_quiesced_migration(struct dm_cache_migration *mg) +static void overwrite_endio(struct bio *bio) { - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); -} - -static void queue_quiesced_migration(struct dm_cache_migration *mg) -{ - unsigned long flags; + struct dm_cache_migration *mg = bio->bi_private; struct cache *cache = mg->cache; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - spin_lock_irqsave(&cache->lock, flags); - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + dm_unhook_bio(&pb->hook_info, bio); - wake_worker(cache); + if (bio->bi_error) + mg->k.input = bio->bi_error; + + queue_continuation(mg->cache->wq, &mg->k); } -static void 
queue_quiesced_migrations(struct cache *cache, struct list_head *work) +static void overwrite(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) { - unsigned long flags; - struct dm_cache_migration *mg, *tmp; + struct bio *bio = mg->overwrite_bio; + size_t pb_data_size = get_per_bio_data_size(mg->cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - spin_lock_irqsave(&cache->lock, flags); - list_for_each_entry_safe(mg, tmp, work, list) - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - wake_worker(cache); -} + /* + * The overwrite bio is part of the copy operation, as such it does + * not set/clear discard or dirty flags. + */ + if (mg->op->op == POLICY_PROMOTE) + remap_to_cache(mg->cache, bio, mg->op->cblock); + else + remap_to_origin(mg->cache, bio); -static void check_for_quiesced_migrations(struct cache *cache, - struct per_bio_data *pb) -{ - struct list_head work; - - if (!pb->all_io_entry) - return; - - INIT_LIST_HEAD(&work); - dm_deferred_entry_dec(pb->all_io_entry, &work); - - if (!list_empty(&work)) - queue_quiesced_migrations(cache, &work); -} - -static void quiesce_migration(struct dm_cache_migration *mg) -{ - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) - queue_quiesced_migration(mg); -} - -static void promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = false; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->new_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void writeback(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = true; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void demote_then_promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t old_oblock, dm_oblock_t new_oblock, - dm_cblock_t cblock, - struct dm_bio_prison_cell *old_ocell, - struct dm_bio_prison_cell *new_ocell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = old_oblock; - mg->new_oblock = new_oblock; - mg->cblock = cblock; - mg->old_ocell = old_ocell; - mg->new_ocell = new_ocell; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); + init_continuation(&mg->k, continuation); + accounted_request(mg->cache, bio); } /* - * Invalidate a cache entry. No writeback occurs; any changes in the cache - * block are thrown away. 
+ * Migration steps: + * + * 1) exclusive lock preventing WRITEs + * 2) quiesce + * 3) copy or issue overwrite bio + * 4) upgrade to exclusive lock preventing READs and WRITEs + * 5) quiesce + * 6) update metadata and commit + * 7) unlock */ -static void invalidate(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +static void mg_complete(struct dm_cache_migration *mg, bool success) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct bio_list bios; + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + dm_cblock_t cblock = op->cblock; - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = true; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; + if (success) + update_stats(&cache->stats, op->op); - inc_io_migrations(cache); - quiesce_migration(mg); + switch (op->op) { + case POLICY_PROMOTE: + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); + policy_complete_background_work(cache->policy, op, success); + + if (mg->overwrite_bio) { + if (success) + force_set_dirty(cache, cblock); + else + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + bio_endio(mg->overwrite_bio); + } else { + if (success) + force_clear_dirty(cache, cblock); + dec_io_migrations(cache); + } + break; + + case POLICY_DEMOTE: + /* + * We clear dirty here to update the nr_dirty counter. + */ + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + + case POLICY_WRITEBACK: + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + } + + bio_list_init(&bios); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } + + free_migration(mg); + defer_bios(cache, &bios); + wake_migration_worker(cache); + + background_work_end(cache); } -static void discard(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *cell) +static void mg_success(struct work_struct *ws) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct dm_cache_migration *mg = ws_to_mg(ws); + mg_complete(mg, mg->k.input == 0); +} + +static void mg_update_metadata(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + + switch (op->op) { + case POLICY_PROMOTE: + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); + + mg_complete(mg, false); + return; + } + mg_complete(mg, true); + break; + + case POLICY_DEMOTE: + r = dm_cache_remove_mapping(cache->cmd, op->cblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + + mg_complete(mg, false); + return; + } + + /* + * It would be nice if we only had to commit when a REQ_FLUSH + * comes through. 
But there's one scenario that we have to + * look out for: + * + * - vblock x in a cache block + * - domotion occurs + * - cache block gets reallocated and over written + * - crash + * + * When we recover, because there was no commit the cache will + * rollback to having the data for vblock x in the cache block. + * But the cache block has since been overwritten, so it'll end + * up pointing to data that was never in 'x' during the history + * of the device. + * + * To avoid this issue we require a commit as part of the + * demotion operation. + */ + init_continuation(&mg->k, mg_success); + continue_after_commit(&cache->committer, &mg->k); + schedule_commit(&cache->committer); + break; + + case POLICY_WRITEBACK: + mg_complete(mg, true); + break; + } +} + +static void mg_update_metadata_after_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + else + mg_update_metadata(ws); +} + +static void mg_upgrade_lock(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + + else { + /* + * Now we want the lock to prevent both reads and writes. + */ + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, + READ_WRITE_LOCK_LEVEL); + if (r < 0) + mg_complete(mg, false); + + else if (r) + quiesce(mg, mg_update_metadata); + + else + mg_update_metadata(ws); + } +} + +static void mg_copy(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + + if (mg->overwrite_bio) { + /* + * It's safe to do this here, even though it's new data + * because all IO has been locked out of the block. + * + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL + * so _not_ using mg_upgrade_lock() as continutation. + */ + overwrite(mg, mg_update_metadata_after_copy); + + } else { + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); + + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } + + init_continuation(&mg->k, mg_upgrade_lock); + + r = copy(mg, is_policy_promote); + if (r) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = -EIO; + mg_complete(mg, false); + } + } +} + +static int mg_lock_writes(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); + mg_complete(mg, false); + return -ENOMEM; + } + + /* + * Prevent writes to the block, but allow reads to continue. + * Unless we're using an overwrite bio, in which case we lock + * everything. + */ + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + mg->overwrite_bio ? 
READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, + prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + mg_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r == 0) + mg_copy(&mg->k.ws); + else + quiesce(mg, mg_copy); + + return 0; +} + +static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) { + policy_complete_background_work(cache->policy, op, false); + return -EPERM; + } + + mg = alloc_migration(cache); + if (!mg) { + policy_complete_background_work(cache->policy, op, false); + background_work_end(cache); + return -ENOMEM; + } + + memset(mg, 0, sizeof(*mg)); - mg->err = false; - mg->discard = true; - mg->writeback = false; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = false; - mg->invalidate = false; mg->cache = cache; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; + mg->op = op; + mg->overwrite_bio = bio; - quiesce_migration(mg); + if (!bio) + inc_io_migrations(cache); + + return mg_lock_writes(mg); +} + +/*---------------------------------------------------------------- + * invalidation processing + *--------------------------------------------------------------*/ + +static void invalidate_complete(struct dm_cache_migration *mg, bool success) +{ + struct bio_list bios; + struct cache *cache = mg->cache; + + bio_list_init(&bios); + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + + if (!success && mg->overwrite_bio) + bio_io_error(mg->overwrite_bio); + + free_migration(mg); + defer_bios(cache, &bios); + + background_work_end(cache); +} + +static void invalidate_completed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + invalidate_complete(mg, !mg->k.input); +} + +static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) +{ + int r = policy_invalidate_mapping(cache->policy, cblock); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, cblock); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + } + + } else if (r == -ENODATA) { + /* + * Harmless, already unmapped. 
+ */ + r = 0; + + } else + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); + + return r; +} + +static void invalidate_remove(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + + r = invalidate_cblock(cache, mg->invalidate_cblock); + if (r) { + invalidate_complete(mg, false); + return; + } + + init_continuation(&mg->k, invalidate_completed); + continue_after_commit(&cache->committer, &mg->k); + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); + mg->overwrite_bio = NULL; + schedule_commit(&cache->committer); +} + +static int invalidate_lock(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + invalidate_complete(mg, false); + return -ENOMEM; + } + + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + invalidate_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r) + quiesce(mg, invalidate_remove); + + else { + /* + * We can't call invalidate_remove() directly here because we + * might still be in request context. + */ + init_continuation(&mg->k, invalidate_remove); + queue_work(cache->wq, &mg->k.ws); + } + + return 0; +} + +static int invalidate_start(struct cache *cache, dm_cblock_t cblock, + dm_oblock_t oblock, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) + return -EPERM; + + mg = alloc_migration(cache); + if (!mg) { + background_work_end(cache); + return -ENOMEM; + } + + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + mg->overwrite_bio = bio; + mg->invalidate_cblock = cblock; + mg->invalidate_oblock = oblock; + + return invalidate_lock(mg); } /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ -static void defer_bio(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void process_flush_bio(struct cache *cache, struct bio *bio) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(bio->bi_iter.bi_size); - if (!pb->req_nr) - remap_to_origin(cache, bio); - else - remap_to_cache(cache, bio, 0); - - /* - * REQ_PREFLUSH is not directed at any particular block so we don't - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH - * by dm-core. 
- */ - issue(cache, bio); -} - -static void process_discard_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) -{ - int r; - dm_dblock_t b, e; - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - - calc_discard_block_range(cache, bio, &b, &e); - if (b == e) { - bio_endio(bio); - return; - } - - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; - - discard(cache, structs, new_ocell); -} - -static bool spare_migration_bandwidth(struct cache *cache) + +enum busy { + IDLE, + MODERATE, + BUSY +}; + +static enum busy spare_migration_bandwidth(struct cache *cache) { + bool idle = iot_idle_for(&cache->origin_tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - return current_volume < cache->migration_threshold; + + if (current_volume <= cache->migration_threshold) + return idle ? IDLE : MODERATE; + else + return idle ? MODERATE : BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -struct inc_detail { - struct cache *cache; - struct bio_list bios_for_issue; - struct bio_list unhandled_bios; - bool any_writes; -}; - -static void inc_fn(void *context, struct dm_bio_prison_cell *cell) +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { - struct bio *bio; - struct inc_detail *detail = context; - struct cache *cache = detail->cache; + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} - inc_ds(cache, cell->holder, cell); - if (bio_data_dir(cell->holder) == WRITE) - detail->any_writes = true; +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) +{ + return writeback_mode(&cache->features) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); +} - while ((bio = bio_list_pop(&cell->bios))) { - if (discard_or_flush(bio)) { - bio_list_add(&detail->unhandled_bios, bio); - continue; +static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, + bool *commit_needed) +{ + int r, data_dir; + bool rb, background_queued; + dm_cblock_t cblock; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + *commit_needed = false; + + rb = bio_detain_shared(cache, block, bio); + if (!rb) { + /* + * An exclusive lock is held for this block, so we have to + * wait. We set the commit_needed flag so the current + * transaction will be committed asap, allowing this lock + * to be dropped. 
+ */ + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } + + data_dir = bio_data_dir(bio); + + if (optimisable_bio(cache, bio, block)) { + struct policy_work *op = NULL; + + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; } - if (bio_data_dir(bio) == WRITE) - detail->any_writes = true; - - bio_list_add(&detail->bios_for_issue, bio); - inc_ds(cache, bio, cell); - } -} - -// FIXME: refactor these two -static void remap_cell_to_origin_clear_discard(struct cache *cache, - struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, bool issue_holder) -{ - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_origin(cache, cell->holder); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) - clear_discard(cache, oblock_to_dblock(cache, oblock)); - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_origin(cache, bio); - issue(cache, bio); - } - - free_prison_cell(cache, cell); -} - -static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) -{ - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_cache(cache, cell->holder, cblock); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) { - set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock_to_dblock(cache, oblock)); - } - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_cache(cache, bio, cblock); - issue(cache, bio); - } - - free_prison_cell(cache, cell); -} - -/*----------------------------------------------------------------*/ - -struct old_oblock_lock { - struct policy_locker locker; - struct cache *cache; - struct prealloc *structs; - struct dm_bio_prison_cell *cell; -}; - -static int null_locker(struct policy_locker *locker, dm_oblock_t b) -{ - /* This should never be called */ - BUG(); - return 0; -} - -static int cell_locker(struct policy_locker *locker, dm_oblock_t b) -{ - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); - - return bio_detain(l->cache, b, NULL, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - l->structs, &l->cell); -} - -static void process_cell(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *new_ocell) -{ - int r; - bool release_cell = true; - struct bio *bio = new_ocell->holder; - dm_oblock_t block = get_bio_block(cache, bio); - struct 
policy_result lookup_result; - bool passthrough = passthrough_mode(&cache->features); - bool fast_promotion, can_migrate; - struct old_oblock_lock ool; - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); - - ool.locker.fn = cell_locker; - ool.cache = cache; - ool.structs = structs; - ool.cell = NULL; - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - - if (r == -EWOULDBLOCK) - /* migration has been denied */ - lookup_result.op = POLICY_MISS; - - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough) { - inc_miss_counter(cache, bio); - - /* - * Passthrough always maps to the origin, - * invalidating any cache blocks that are written - * to. - */ - - if (bio_data_dir(bio) == WRITE) { - atomic_inc(&cache->stats.demotion); - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - - } else { - /* FIXME: factor out issue_origin() */ - remap_to_origin_clear_discard(cache, bio, block); - inc_and_issue(cache, bio, new_ocell); - } - } else { - inc_hit_counter(cache, bio); - - if (bio_data_dir(bio) == WRITE && - writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - inc_and_issue(cache, bio, new_ocell); - - } else { - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); - release_cell = false; - } + if (r == -ENOENT && op) { + bio_drop_shared_lock(cache, bio); + BUG_ON(op->op != POLICY_PROMOTE); + mg_start(cache, op, bio); + return DM_MAPIO_SUBMITTED; + } + } else { + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; } - break; + if (background_queued) + wake_migration_worker(cache); + } - case POLICY_MISS: + if (r == -ENOENT) { + /* + * Miss. + */ inc_miss_counter(cache, bio); - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); - release_cell = false; - break; + if (pb->req_nr == 0) { + accounted_begin(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); - case POLICY_NEW: - atomic_inc(&cache->stats.promotion); - promote(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - break; + } else { + /* + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. + */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + } else { + /* + * Hit. + */ + inc_hit_counter(cache, bio); - case POLICY_REPLACE: - atomic_inc(&cache->stats.demotion); - atomic_inc(&cache->stats.promotion); - demote_then_promote(cache, structs, lookup_result.old_oblock, - block, lookup_result.cblock, - ool.cell, new_ocell); - release_cell = false; - break; + /* + * Passthrough always maps to the origin, invalidating any + * cache blocks that are written to. 
+ */ + if (passthrough_mode(&cache->features)) { + if (bio_data_dir(bio) == WRITE) { + bio_drop_shared_lock(cache, bio); + atomic_inc(&cache->stats.demotion); + invalidate_start(cache, cblock, block, bio); + } else + remap_to_origin_clear_discard(cache, bio, block); - default: - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - bio_io_error(bio); + } else { + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, cblock)) { + remap_to_origin_then_cache(cache, bio, block, cblock); + accounted_begin(cache, bio); + } else + remap_to_cache_dirty(cache, bio, block, cblock); + } } - if (release_cell) - cell_defer(cache, new_ocell, false); -} - -static void process_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) -{ - int r; - dm_oblock_t block = get_bio_block(cache, bio); - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - /* - * Check to see if that block is currently migrating. + * dm core turns FUA requests into a separate payload and FLUSH req. */ - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain(cache, block, bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + if (bio->bi_opf & REQ_FUA) { + /* + * issue_after_commit will call accounted_begin a second time. So + * we call accounted_complete() to avoid double accounting. + */ + accounted_complete(cache, bio); + issue_after_commit(&cache->committer, bio); + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } - process_cell(cache, structs, new_ocell); + return DM_MAPIO_REMAPPED; } -static int need_commit_due_to_time(struct cache *cache) +static bool process_bio(struct cache *cache, struct bio *bio) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + bool commit_needed; + + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) + generic_make_request(bio); + + return commit_needed; } /* @@ -1929,29 +1903,62 @@ static int commit(struct cache *cache, bool clean_shutdown) return r; } -static int commit_if_needed(struct cache *cache) +/* + * Used by the batcher. + */ +static int commit_op(void *context) { - int r = 0; + struct cache *cache = context; - if ((cache->commit_requested || need_commit_due_to_time(cache)) && - dm_cache_changed_this_transaction(cache->cmd)) { - r = commit(cache, false); - cache->commit_requested = false; - cache->last_commit_jiffies = jiffies; - } + if (dm_cache_changed_this_transaction(cache->cmd)) + return commit(cache, false); - return r; + return 0; } -static void process_deferred_bios(struct cache *cache) +/*----------------------------------------------------------------*/ + +static bool process_flush_bio(struct cache *cache, struct bio *bio) { - bool prealloc_used = false; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); + + issue_after_commit(&cache->committer, bio); + return true; +} + +static bool process_discard_bio(struct cache *cache, struct bio *bio) +{ + dm_dblock_t b, e; + + // FIXME: do we need to lock the region? Or can we just assume the + // user wont be so foolish as to issue discard concurrently with + // other IO? 
+ calc_discard_block_range(cache, bio, &b, &e); + while (b != e) { + set_discard(cache, b); + b = to_dblock(from_dblock(b) + 1); + } + + bio_endio(bio); + + return false; +} + +static void process_deferred_bios(struct work_struct *ws) +{ + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); + unsigned long flags; + bool commit_needed = false; struct bio_list bios; struct bio *bio; - struct prealloc structs; - memset(&structs, 0, sizeof(structs)); bio_list_init(&bios); spin_lock_irqsave(&cache->lock, flags); @@ -1959,93 +1966,25 @@ static void process_deferred_bios(struct cache *cache) bio_list_init(&cache->deferred_bios); spin_unlock_irqrestore(&cache->lock, flags); - while (!bio_list_empty(&bios)) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&cache->deferred_bios, &bios); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } - - bio = bio_list_pop(&bios); - + while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) - process_flush_bio(cache, bio); + commit_needed = process_flush_bio(cache, bio) || commit_needed; + else if (bio_op(bio) == REQ_OP_DISCARD) - process_discard_bio(cache, &structs, bio); + commit_needed = process_discard_bio(cache, bio) || commit_needed; + else - process_bio(cache, &structs, bio); + commit_needed = process_bio(cache, bio) || commit_needed; } - if (prealloc_used) - prealloc_free_structs(cache, &structs); + if (commit_needed) + schedule_commit(&cache->committer); } -static void process_deferred_cells(struct cache *cache) +static void process_deferred_writethrough_bios(struct work_struct *ws) { - bool prealloc_used = false; - unsigned long flags; - struct dm_bio_prison_cell *cell, *tmp; - struct list_head cells; - struct prealloc structs; + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); - memset(&structs, 0, sizeof(structs)); - - INIT_LIST_HEAD(&cells); - - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - list_splice(&cells, &cache->deferred_cells); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } - - process_cell(cache, &structs, cell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) -{ - unsigned long flags; - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_flush_bios); - bio_list_init(&cache->deferred_flush_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - /* - * These bios have already been through inc_ds() - */ - while ((bio = bio_list_pop(&bios))) - submit_bios ? 
accounted_request(cache, bio) : bio_io_error(bio); -} - -static void process_deferred_writethrough_bios(struct cache *cache) -{ unsigned long flags; struct bio_list bios; struct bio *bio; @@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache) spin_unlock_irqrestore(&cache->lock, flags); /* - * These bios have already been through inc_ds() + * These bios have already been through accounted_begin() */ while ((bio = bio_list_pop(&bios))) - accounted_request(cache, bio); -} - -static void writeback_some_dirty_blocks(struct cache *cache) -{ - bool prealloc_used = false; - dm_oblock_t oblock; - dm_cblock_t cblock; - struct prealloc structs; - struct dm_bio_prison_cell *old_ocell; - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); - - memset(&structs, 0, sizeof(structs)); - - while (spare_migration_bandwidth(cache)) { - if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) - break; /* no work to do */ - - prealloc_used = true; - if (prealloc_data_structs(cache, &structs) || - get_cell(cache, oblock, &structs, &old_ocell)) { - policy_set_dirty(cache->policy, oblock); - break; - } - - writeback(cache, &structs, oblock, cblock, old_ocell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -/*---------------------------------------------------------------- - * Invalidations. - * Dropping something from the cache *without* writing back. - *--------------------------------------------------------------*/ - -static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) -{ - int r = 0; - uint64_t begin = from_cblock(req->cblocks->begin); - uint64_t end = from_cblock(req->cblocks->end); - - while (begin != end) { - r = policy_remove_cblock(cache->policy, to_cblock(begin)); - if (!r) { - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); - if (r) { - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - break; - } - - } else if (r == -ENODATA) { - /* harmless, already unmapped */ - r = 0; - - } else { - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); - break; - } - - begin++; - } - - cache->commit_requested = true; - - req->err = r; - atomic_set(&req->complete, 1); - - wake_up(&req->result_wait); -} - -static void process_invalidation_requests(struct cache *cache) -{ - struct list_head list; - struct invalidation_request *req, *tmp; - - INIT_LIST_HEAD(&list); - spin_lock(&cache->invalidation_lock); - list_splice_init(&cache->invalidation_requests, &list); - spin_unlock(&cache->invalidation_lock); - - list_for_each_entry_safe (req, tmp, &list, list) - process_invalidation_request(cache, req); + generic_make_request(bio); } /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static bool is_quiescing(struct cache *cache) -{ - return atomic_read(&cache->quiescing); -} - -static void ack_quiescing(struct cache *cache) -{ - if (is_quiescing(cache)) { - atomic_inc(&cache->quiescing_ack); - wake_up(&cache->quiescing_wait); - } -} - -static void wait_for_quiescing_ack(struct cache *cache) -{ - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); -} - -static void start_quiescing(struct cache *cache) -{ - atomic_inc(&cache->quiescing); - wait_for_quiescing_ack(cache); -} - -static void stop_quiescing(struct cache *cache) -{ - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); -} - -static void wait_for_migrations(struct cache 
*cache) -{ - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); -} - -static void stop_worker(struct cache *cache) -{ - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); -} - -static void requeue_deferred_cells(struct cache *cache) -{ - unsigned long flags; - struct list_head cells; - struct dm_bio_prison_cell *cell, *tmp; - - INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) - cell_requeue(cache, cell); -} static void requeue_deferred_bios(struct cache *cache) { @@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache) } } -static int more_work(struct cache *cache) -{ - if (is_quiescing(cache)) - return !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); - else - return !bio_list_empty(&cache->deferred_bios) || - !list_empty(&cache->deferred_cells) || - !bio_list_empty(&cache->deferred_flush_bios) || - !bio_list_empty(&cache->deferred_writethrough_bios) || - !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations) || - cache->invalidate; -} - -static void do_worker(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, worker); - - do { - if (!is_quiescing(cache)) { - writeback_some_dirty_blocks(cache); - process_deferred_writethrough_bios(cache); - process_deferred_bios(cache); - process_deferred_cells(cache); - process_invalidation_requests(cache); - } - - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); - process_migrations(cache, &cache->completed_migrations, complete_migration); - - if (commit_if_needed(cache)) { - process_deferred_flush_bios(cache, false); - process_migrations(cache, &cache->need_commit_migrations, migration_failure); - } else { - process_deferred_flush_bios(cache, true); - process_migrations(cache, &cache->need_commit_migrations, - migration_success_post_commit); - } - - ack_quiescing(cache); - - } while (more_work(cache)); -} - /* * We want to commit periodically so that not too much * unwritten metadata builds up. 
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy, true); - wake_worker(cache); + wake_migration_worker(cache); + schedule_commit(&cache->committer); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } -/*----------------------------------------------------------------*/ - -static int is_congested(struct dm_dev *dev, int bdi_bits) +static void check_migrations(struct work_struct *ws) { - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} + int r; + struct policy_work *op; + struct cache *cache = container_of(ws, struct cache, migration_worker); + enum busy b; -static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct cache *cache = container_of(cb, struct cache, callbacks); + for (;;) { + b = spare_migration_bandwidth(cache); + if (b == BUSY) + break; - return is_congested(cache->origin_dev, bdi_bits) || - is_congested(cache->cache_dev, bdi_bits); + r = policy_get_background_work(cache->policy, b == IDLE, &op); + if (r == -ENODATA) + break; + + if (r) { + DMERR_LIMIT("%s: policy_background_work failed", + cache_device_name(cache)); + break; + } + + r = mg_start(cache, op, NULL); + if (r) + break; + } } /*---------------------------------------------------------------- @@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache) mempool_destroy(cache->migration_pool); - if (cache->all_io_ds) - dm_deferred_set_destroy(cache->all_io_ds); - if (cache->prison) - dm_bio_prison_destroy(cache->prison); + dm_bio_prison_destroy_v2(cache->prison); if (cache->wq) destroy_workqueue(cache->wq); @@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return PTR_ERR(p); } cache->policy = p; + BUG_ON(!cache->policy); return 0; } @@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) cache->cache_size = size; } +static int is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(q->backing_dev_info, bdi_bits); +} + +static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct cache *cache = container_of(cb, struct cache, callbacks); + + return is_congested(cache->origin_dev, bdi_bits) || + is_congested(cache->cache_dev, bdi_bits); +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2787,7 +2567,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; - /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); @@ -2853,24 +2632,18 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = -EINVAL; goto bad; } + + policy_allow_migrations(cache->policy, false); } spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_flush_bios); bio_list_init(&cache->deferred_writethrough_bios); - INIT_LIST_HEAD(&cache->quiesced_migrations); - INIT_LIST_HEAD(&cache->completed_migrations); - INIT_LIST_HEAD(&cache->need_commit_migrations); atomic_set(&cache->nr_allocated_migrations, 0); 
atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); - init_waitqueue_head(&cache->quiescing_wait); - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); - r = -ENOMEM; atomic_set(&cache->nr_dirty, 0); cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); @@ -2899,27 +2672,23 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!cache->wq) { *error = "could not create workqueue for metadata object"; goto bad; } - INIT_WORK(&cache->worker, do_worker); + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); + INIT_WORK(&cache->deferred_writethrough_worker, + process_deferred_writethrough_bios); + INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); - cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(); + cache->prison = dm_bio_prison_create_v2(cache->wq); if (!cache->prison) { *error = "could not create bio prison"; goto bad; } - cache->all_io_ds = dm_deferred_set_create(); - if (!cache->all_io_ds) { - *error = "could not create all_io deferred set"; - goto bad; - } - cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, migration_cache); if (!cache->migration_pool) { @@ -2946,11 +2715,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->invalidation_lock); INIT_LIST_HEAD(&cache->invalidation_requests); + batcher_init(&cache->committer, commit_op, cache, + issue_op, cache, cache->wq); iot_init(&cache->origin_tracker); + init_rwsem(&cache->background_work_lock); + prevent_background_work(cache); + *result = cache; return 0; - bad: destroy(cache); return r; @@ -3008,7 +2781,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = cache; - out: destroy_cache_args(ca); return r; @@ -3021,17 +2793,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) struct cache *cache = ti->private; int r; - struct dm_bio_prison_cell *cell = NULL; + bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); - bool can_migrate = false; - bool fast_promotion; - struct policy_result lookup_result; - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); - struct old_oblock_lock ool; - - ool.locker.fn = null_locker; + init_per_bio_data(bio, pb_data_size); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -3048,101 +2814,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - /* - * Check to see if that block is currently migrating. 
- */ - cell = alloc_prison_cell(cache); - if (!cell) { - defer_bio(cache, bio); - return DM_MAPIO_SUBMITTED; - } - - r = bio_detain(cache, block, bio, cell, - (cell_free_fn) free_prison_cell, - cache, &cell); - if (r) { - if (r < 0) - defer_bio(cache, bio); - - return DM_MAPIO_SUBMITTED; - } - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); - return DM_MAPIO_SUBMITTED; - - } else if (r) { - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", - cache_device_name(cache), r); - cell_defer(cache, cell, false); - bio_io_error(bio); - return DM_MAPIO_SUBMITTED; - } - - r = DM_MAPIO_REMAPPED; - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough_mode(&cache->features)) { - if (bio_data_dir(bio) == WRITE) { - /* - * We need to invalidate this block, so - * defer for the worker thread. - */ - cell_defer(cache, cell, true); - r = DM_MAPIO_SUBMITTED; - - } else { - inc_miss_counter(cache, bio); - remap_to_origin_clear_discard(cache, bio, block); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - // FIXME: we want to remap hits or misses straight - // away rather than passing over to the worker. - cell_defer(cache, cell, false); - } - - } else { - inc_hit_counter(cache, bio); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - cell_defer(cache, cell, false); - - } else - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); - } - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio); - // FIXME: remap everything as a miss - cell_defer(cache, cell, false); - r = DM_MAPIO_SUBMITTED; - - } else - remap_cell_to_origin_clear_discard(cache, cell, block, false); - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - cell_defer(cache, cell, false); - bio_io_error(bio); - r = DM_MAPIO_SUBMITTED; - } + r = map_bio(cache, bio, block, &commit_needed); + if (commit_needed) + schedule_commit(&cache->committer); return r; } @@ -3162,7 +2836,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&cache->lock, flags); } - check_for_quiesced_migrations(cache, pb); + bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); return 0; @@ -3262,12 +2936,18 @@ static void cache_postsuspend(struct dm_target *ti) { struct cache *cache = ti->private; - start_quiescing(cache); - wait_for_migrations(cache); - stop_worker(cache); + prevent_background_work(cache); + BUG_ON(atomic_read(&cache->nr_io_migrations)); + + cancel_delayed_work(&cache->waker); + flush_workqueue(cache->wq); + WARN_ON(cache->origin_tracker.in_flight); + + /* + * If it's a flush suspend there won't be any deferred bios, so this + * call is harmless. 
+ */ requeue_deferred_bios(cache); - requeue_deferred_cells(cache); - stop_quiescing(cache); if (get_cache_mode(cache) == CM_WRITE) (void) sync_metadata(cache); @@ -3279,15 +2959,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, int r; struct cache *cache = context; - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); + if (dirty) { + set_bit(from_cblock(cblock), cache->dirty_bitset); + atomic_inc(&cache->nr_dirty); + } else + clear_bit(from_cblock(cblock), cache->dirty_bitset); + + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); if (r) return r; - if (dirty) - set_dirty(cache, oblock, cblock); - else - clear_dirty(cache, oblock, cblock); - return 0; } @@ -3486,6 +3167,7 @@ static void cache_resume(struct dm_target *ti) struct cache *cache = ti->private; cache->need_tick_bio = true; + allow_background_work(cache); do_waker(&cache->waker.work); } @@ -3619,11 +3301,20 @@ err: DMEMIT("Error"); } +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + /* * A cache block range can take two forms: * * i) A single cblock, eg. '3456' - * ii) A begin and end cblock with dots between, eg. 123-234 + * ii) A begin and end cblock with a dash between, eg. 123-234 */ static int parse_cblock_range(struct cache *cache, const char *str, struct cblock_range *result) @@ -3689,23 +3380,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range return 0; } +static inline dm_cblock_t cblock_succ(dm_cblock_t b) +{ + return to_cblock(from_cblock(b) + 1); +} + static int request_invalidation(struct cache *cache, struct cblock_range *range) { - struct invalidation_request req; + int r = 0; - INIT_LIST_HEAD(&req.list); - req.cblocks = range; - atomic_set(&req.complete, 0); - req.err = 0; - init_waitqueue_head(&req.result_wait); + /* + * We don't need to do any locking here because we know we're in + * passthrough mode. There is potential for a race between an + * invalidation triggered by an io and an invalidation message. This + * is harmless, we must not worry if the policy call fails. + */ + while (range->begin != range->end) { + r = invalidate_cblock(cache, range->begin); + if (r) + return r; - spin_lock(&cache->invalidation_lock); - list_add(&req.list, &cache->invalidation_requests); - spin_unlock(&cache->invalidation_lock); - wake_worker(cache); + range->begin = cblock_succ(range->begin); + } - wait_event(req.result_wait, atomic_read(&req.complete)); - return req.err; + cache->commit_requested = true; + return r; } static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, @@ -3815,7 +3514,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 10, 0}, + .version = {2, 0, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index fea5bd52ada8..97db4d11c05a 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -47,7 +47,7 @@ struct mapped_device { struct request_queue *queue; int numa_node_id; - unsigned type; + enum dm_queue_mode type; /* Protect queue and type against concurrent access.
*/ struct mutex type_lock; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ef1d836bd81b..ebf9e72d479b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,8 +1,8 @@ /* * Copyright (C) 2003 Jana Saout * Copyright (C) 2004 Clemens Fruhwirth - * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved. - * Copyright (C) 2013 Milan Broz + * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013-2017 Milan Broz * * This file is released under the GPL. */ @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include /* for struct rtattr and RTA macros only */ #include #include @@ -48,7 +51,11 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; - struct skcipher_request *req; + union { + struct skcipher_request *req; + struct aead_request *req_aead; + } r; + }; /* @@ -57,6 +64,8 @@ struct convert_context { struct dm_crypt_io { struct crypt_config *cc; struct bio *base_bio; + u8 *integrity_metadata; + bool integrity_metadata_from_pool; struct work_struct work; struct convert_context ctx; @@ -70,8 +79,8 @@ struct dm_crypt_io { struct dm_crypt_request { struct convert_context *ctx; - struct scatterlist sg_in; - struct scatterlist sg_out; + struct scatterlist sg_in[4]; + struct scatterlist sg_out[4]; sector_t iv_sector; }; @@ -118,6 +127,11 @@ struct iv_tcw_private { enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; +enum cipher_flags { + CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ + CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ +}; + /* * The fields in here must be read only after initialization. */ @@ -126,11 +140,14 @@ struct crypt_config { sector_t start; /* - * pool for per bio private data, crypto requests and - * encryption requeusts/buffer pages + * pool for per bio private data, crypto requests, + * encryption requeusts/buffer pages and integrity tags */ mempool_t *req_pool; mempool_t *page_pool; + mempool_t *tag_pool; + unsigned tag_pool_max_sectors; + struct bio_set *bs; struct mutex bio_alloc_lock; @@ -143,6 +160,7 @@ struct crypt_config { char *cipher; char *cipher_string; + char *cipher_auth; char *key_string; const struct crypt_iv_operations *iv_gen_ops; @@ -154,11 +172,17 @@ struct crypt_config { } iv_gen_private; sector_t iv_offset; unsigned int iv_size; + unsigned short int sector_size; + unsigned char sector_shift; /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; - struct crypto_skcipher **tfms; + union { + struct crypto_skcipher **tfms; + struct crypto_aead **tfms_aead; + } cipher_tfm; unsigned tfms_count; + unsigned long cipher_flags; /* * Layout of each crypto request: @@ -181,21 +205,36 @@ struct crypt_config { unsigned int key_size; unsigned int key_parts; /* independent parts in key buffer */ unsigned int key_extra_size; /* additional keys length */ + unsigned int key_mac_size; /* MAC key size for authenc(...) 
*/ + + unsigned int integrity_tag_size; + unsigned int integrity_iv_size; + unsigned int on_disk_tag_size; + + u8 *authenc_key; /* space for keys in authenc() format (if used) */ u8 key[0]; }; -#define MIN_IOS 64 +#define MIN_IOS 64 +#define MAX_TAG_SIZE 480 +#define POOL_ENTRY_SIZE 512 static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); -static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg); /* - * Use this to access cipher attributes that are the same for each CPU. + * Use this to access cipher attributes that are independent of the key. */ static struct crypto_skcipher *any_tfm(struct crypt_config *cc) { - return cc->tfms[0]; + return cc->cipher_tfm.tfms[0]; +} + +static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) +{ + return cc->cipher_tfm.tfms_aead[0]; } /* @@ -310,10 +349,11 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) return err; } -/* Set up per cpu cipher state */ -static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, - struct dm_target *ti, - u8 *salt, unsigned saltsize) +/* Allocate the cipher for ESSIV */ +static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, + struct dm_target *ti, + const u8 *salt, + unsigned int saltsize) { struct crypto_cipher *essiv_tfm; int err; @@ -325,8 +365,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, return essiv_tfm; } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_skcipher_ivsize(any_tfm(cc))) { + if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -393,8 +432,8 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.salt = salt; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; - essiv_tfm = setup_essiv_cpu(cc, ti, salt, - crypto_ahash_digestsize(hash_tfm)); + essiv_tfm = alloc_essiv_cipher(cc, ti, salt, + crypto_ahash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -488,6 +527,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for LMK"; + return -EINVAL; + } + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(lmk->hash_tfm)) { ti->error = "Error initializing LMK hash"; @@ -585,12 +629,14 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *src; int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in)); - r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); kunmap_atomic(src); } else memset(iv, 0, cc->iv_size); @@ -601,18 +647,20 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *dst; int r; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; - dst = kmap_atomic(sg_page(&dmreq->sg_out)); - r = 
crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); /* Tweak the first block of plaintext sector */ if (!r) - crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_atomic(dst); return r; @@ -637,6 +685,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for TCW"; + return -EINVAL; + } + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { ti->error = "Wrong key size for TCW"; return -EINVAL; @@ -724,6 +777,7 @@ out: static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; @@ -731,8 +785,9 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_atomic(src); } @@ -748,6 +803,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *dst; int r; @@ -755,13 +811,22 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, return 0; /* Apply whitening on ciphertext */ - dst = kmap_atomic(sg_page(&dmreq->sg_out)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_atomic(dst); return r; } +static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + /* Used only for writes, there must be an additional space to store IV */ + get_random_bytes(iv, cc->iv_size); + return 0; +} + static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -806,6 +871,108 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = { .post = crypt_iv_tcw_post }; +static struct crypt_iv_operations crypt_iv_random_ops = { + .generator = crypt_iv_random_gen +}; + +/* + * Integrity extensions + */ +static bool crypt_integrity_aead(struct crypt_config *cc) +{ + return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); +} + +static bool crypt_integrity_hmac(struct crypt_config *cc) +{ + return crypt_integrity_aead(cc) && cc->key_mac_size; +} + +/* Get sg containing data */ +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg) +{ + if (unlikely(crypt_integrity_aead(cc))) + return &sg[2]; + + return sg; +} + +static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) +{ + struct bio_integrity_payload *bip; + unsigned int tag_len; + int ret; + + if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) + return 0; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + tag_len = io->cc->on_disk_tag_size * bio_sectors(bio); + + bip->bip_iter.bi_size = 
tag_len; + bip->bip_iter.bi_sector = io->cc->start + io->sector; + + /* We own the metadata, do not let bio_free to release it */ + bip->bip_flags &= ~BIP_BLOCK_INTEGRITY; + + ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), + tag_len, offset_in_page(io->integrity_metadata)); + if (unlikely(ret != tag_len)) + return -ENOMEM; + + return 0; +} + +static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + + /* From now we require underlying device with our integrity profile */ + if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + ti->error = "Integrity profile not supported."; + return -EINVAL; + } + + if (bi->tag_size != cc->on_disk_tag_size || + bi->tuple_size != cc->on_disk_tag_size) { + ti->error = "Integrity profile tag size mismatch."; + return -EINVAL; + } + if (1 << bi->interval_exp != cc->sector_size) { + ti->error = "Integrity profile sector size mismatch."; + return -EINVAL; + } + + if (crypt_integrity_aead(cc)) { + cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; + DMINFO("Integrity AEAD, tag size %u, IV size %u.", + cc->integrity_tag_size, cc->integrity_iv_size); + + if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { + ti->error = "Integrity AEAD auth tag size is not supported."; + return -EINVAL; + } + } else if (cc->integrity_iv_size) + DMINFO("Additional per-sector space %u bytes for IV.", + cc->integrity_iv_size); + + if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { + ti->error = "Not enough space for integrity tag in the profile."; + return -EINVAL; + } + + return 0; +#else + ti->error = "Integrity profile not supported."; + return -EINVAL; +#endif +} + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -822,58 +989,217 @@ static void crypt_convert_init(struct crypt_config *cc, } static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, - struct skcipher_request *req) + void *req) { return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); } -static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, - struct dm_crypt_request *dmreq) +static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); + return (void *)((char *)dmreq - cc->dmreq_start); } static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_skcipher_alignmask(any_tfm(cc)) + 1); + if (crypt_integrity_aead(cc)) + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_aead_alignmask(any_tfm_aead(cc)) + 1); + else + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_skcipher_alignmask(any_tfm(cc)) + 1); } -static int crypt_convert_block(struct crypt_config *cc, - struct convert_context *ctx, - struct skcipher_request *req) +static u8 *org_iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return iv_of_dmreq(cc, dmreq) + cc->iv_size; +} + +static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; + return (uint64_t*) ptr; +} + +static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size 
+ + cc->iv_size + sizeof(uint64_t); + return (unsigned int*)ptr; +} + +static void *tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + struct convert_context *ctx = dmreq->ctx; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + + return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * + cc->on_disk_tag_size]; +} + +static void *iv_tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size; +} + +static int crypt_convert_block_aead(struct crypt_config *cc, + struct convert_context *ctx, + struct aead_request *req, + unsigned int tag_offset) { struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; - u8 *iv; - int r; + u8 *iv, *org_iv, *tag_iv, *tag; + uint64_t *sector; + int r = 0; + + BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); + + /* Reject unexpected unaligned bio. */ + if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + return -EIO; dmreq = dmreq_of_req(cc, req); - iv = iv_of_dmreq(cc, dmreq); - dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + dmreq->iv_sector >>= cc->sector_shift; dmreq->ctx = ctx; - sg_init_table(&dmreq->sg_in, 1); - sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, - bv_in.bv_offset); - sg_init_table(&dmreq->sg_out, 1); - sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, - bv_out.bv_offset); + *org_tag_of_dmreq(cc, dmreq) = tag_offset; - bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); - bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag = tag_from_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + /* AEAD request: + * |----- AAD -------|------ DATA -------|-- AUTH TAG --| + * | (authenticated) | (auth+encryption) | | + * | sector_LE | IV | sector in/out | tag in/out | + */ + sg_init_table(dmreq->sg_in, 4); + sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size); + sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset); + sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size); + + sg_init_table(dmreq->sg_out, 4); + sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size); + sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset); + sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size); if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, dmreq); - if (r < 0) - return r; + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, cc->iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); } - skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, - 1 << SECTOR_SHIFT, iv); + aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size); + if (bio_data_dir(ctx->bio_in) == WRITE) { + 
aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + cc->sector_size, iv); + r = crypto_aead_encrypt(req); + if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) + memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, + cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); + } else { + aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + cc->sector_size + cc->integrity_tag_size, iv); + r = crypto_aead_decrypt(req); + } + + if (r == -EBADMSG) + DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + (unsigned long long)le64_to_cpu(*sector)); + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); + + return r; +} + +static int crypt_convert_block_skcipher(struct crypt_config *cc, + struct convert_context *ctx, + struct skcipher_request *req, + unsigned int tag_offset) +{ + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); + struct scatterlist *sg_in, *sg_out; + struct dm_crypt_request *dmreq; + u8 *iv, *org_iv, *tag_iv; + uint64_t *sector; + int r = 0; + + /* Reject unexpected unaligned bio. */ + if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + return -EIO; + + dmreq = dmreq_of_req(cc, req); + dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + dmreq->iv_sector >>= cc->sector_shift; + dmreq->ctx = ctx; + + *org_tag_of_dmreq(cc, dmreq) = tag_offset; + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + /* For skcipher we use only the first sg item */ + sg_in = &dmreq->sg_in[0]; + sg_out = &dmreq->sg_out[0]; + + sg_init_table(sg_in, 1); + sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset); + + sg_init_table(sg_out, 1); + sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset); + + if (cc->iv_gen_ops) { + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->integrity_iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, cc->integrity_iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); + } + + skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv); if (bio_data_dir(ctx->bio_in) == WRITE) r = crypto_skcipher_encrypt(req); @@ -881,7 +1207,10 @@ static int crypt_convert_block(struct crypt_config *cc, r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, iv, dmreq); + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); return r; } @@ -889,27 +1218,53 @@ static int crypt_convert_block(struct crypt_config *cc, static void kcryptd_async_done(struct crypto_async_request *async_req, int error); -static void crypt_alloc_req(struct crypt_config *cc, - struct convert_context *ctx) +static void crypt_alloc_req_skcipher(struct crypt_config *cc, + struct 
convert_context *ctx) { unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!ctx->req) - ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); + if (!ctx->r.req) + ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO); - skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); /* * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs * requests if driver request queue is full. */ - skcipher_request_set_callback(ctx->req, + skcipher_request_set_callback(ctx->r.req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, ctx->req)); + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); } -static void crypt_free_req(struct crypt_config *cc, - struct skcipher_request *req, struct bio *base_bio) +static void crypt_alloc_req_aead(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (!ctx->r.req_aead) + ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO); + + aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); + + /* + * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs + * requests if driver request queue is full. + */ + aead_request_set_callback(ctx->r.req_aead, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); +} + +static void crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (crypt_integrity_aead(cc)) + crypt_alloc_req_aead(cc, ctx); + else + crypt_alloc_req_skcipher(cc, ctx); +} + +static void crypt_free_req_skcipher(struct crypt_config *cc, + struct skcipher_request *req, struct bio *base_bio) { struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); @@ -917,12 +1272,31 @@ static void crypt_free_req(struct crypt_config *cc, mempool_free(req, cc->req_pool); } +static void crypt_free_req_aead(struct crypt_config *cc, + struct aead_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct aead_request *)(io + 1) != req) + mempool_free(req, cc->req_pool); +} + +static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) +{ + if (crypt_integrity_aead(cc)) + crypt_free_req_aead(cc, req, base_bio); + else + crypt_free_req_skcipher(cc, req, base_bio); +} + /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + unsigned int tag_offset = 0; + unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; int r; atomic_set(&ctx->cc_pending, 1); @@ -930,10 +1304,12 @@ static int crypt_convert(struct crypt_config *cc, while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { crypt_alloc_req(cc, ctx); - atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, ctx->req); + if (crypt_integrity_aead(cc)) + r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); + else + r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); switch (r) { /* @@ -949,22 +1325,31 @@ static int crypt_convert(struct crypt_config *cc, * completion function kcryptd_async_done() will be called. */ case -EINPROGRESS: - ctx->req = NULL; - ctx->cc_sector++; + ctx->r.req = NULL; + ctx->cc_sector += sector_step; + tag_offset++; continue; /* * The request was already processed (synchronously). 
*/ case 0: atomic_dec(&ctx->cc_pending); - ctx->cc_sector++; + ctx->cc_sector += sector_step; + tag_offset++; cond_resched(); continue; - - /* There was an error while processing the request. */ + /* + * There was a data integrity error. + */ + case -EBADMSG: + atomic_dec(&ctx->cc_pending); + return -EILSEQ; + /* + * There was an error while processing the request. + */ default: atomic_dec(&ctx->cc_pending); - return r; + return -EIO; } } @@ -1005,7 +1390,7 @@ retry: clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) - goto return_clone; + goto out; clone_init(io, clone); @@ -1027,7 +1412,13 @@ retry: remaining_size -= len; } -return_clone: + /* Allocate space for integrity tags */ + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + clone = NULL; + } +out: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); @@ -1053,7 +1444,9 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->base_bio = bio; io->sector = sector; io->error = 0; - io->ctx.req = NULL; + io->ctx.r.req = NULL; + io->integrity_metadata = NULL; + io->integrity_metadata_from_pool = false; atomic_set(&io->io_pending, 0); } @@ -1075,8 +1468,13 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; - if (io->ctx.req) - crypt_free_req(cc, io->ctx.req, base_bio); + if (io->ctx.r.req) + crypt_free_req(cc, io->ctx.r.req, base_bio); + + if (unlikely(io->integrity_metadata_from_pool)) + mempool_free(io->integrity_metadata, io->cc->tag_pool); + else + kfree(io->integrity_metadata); base_bio->bi_error = error; bio_endio(base_bio); @@ -1156,6 +1554,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) clone_init(io, clone); clone->bi_iter.bi_sector = cc->start + io->sector; + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_dec_pending(io); + bio_put(clone); + return 1; + } + generic_make_request(clone); return 0; } @@ -1314,8 +1718,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); - if (r) - io->error = -EIO; + if (r < 0) + io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); /* Encryption was already finished, submit io now */ @@ -1345,7 +1749,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) r = crypt_convert(cc, &io->ctx); if (r < 0) - io->error = -EIO; + io->error = r; if (atomic_dec_and_test(&io->ctx.cc_pending)) kcryptd_crypt_read_done(io); @@ -1372,9 +1776,13 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); - if (error < 0) + if (error == -EBADMSG) { + DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); + io->error = -EILSEQ; + } else if (error < 0) io->error = -EIO; crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); @@ -1406,61 +1814,59 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) queue_work(cc->crypt_queue, &io->work); } -/* - * Decode key from its hex representation - */ -static int crypt_decode_key(u8 *key, char *hex, unsigned int size) +static void crypt_free_tfms_aead(struct crypt_config *cc) { - char buffer[3]; - unsigned int i; + if (!cc->cipher_tfm.tfms_aead) + return; - buffer[2] = '\0'; - - for (i = 0; i < size; 
i++) { - buffer[0] = *hex++; - buffer[1] = *hex++; - - if (kstrtou8(buffer, 16, &key[i])) - return -EINVAL; + if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + crypto_free_aead(cc->cipher_tfm.tfms_aead[0]); + cc->cipher_tfm.tfms_aead[0] = NULL; } - if (*hex != '\0') - return -EINVAL; + kfree(cc->cipher_tfm.tfms_aead); + cc->cipher_tfm.tfms_aead = NULL; +} - return 0; +static void crypt_free_tfms_skcipher(struct crypt_config *cc) +{ + unsigned i; + + if (!cc->cipher_tfm.tfms) + return; + + for (i = 0; i < cc->tfms_count; i++) + if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) { + crypto_free_skcipher(cc->cipher_tfm.tfms[i]); + cc->cipher_tfm.tfms[i] = NULL; + } + + kfree(cc->cipher_tfm.tfms); + cc->cipher_tfm.tfms = NULL; } static void crypt_free_tfms(struct crypt_config *cc) { - unsigned i; - - if (!cc->tfms) - return; - - for (i = 0; i < cc->tfms_count; i++) - if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { - crypto_free_skcipher(cc->tfms[i]); - cc->tfms[i] = NULL; - } - - kfree(cc->tfms); - cc->tfms = NULL; + if (crypt_integrity_aead(cc)) + crypt_free_tfms_aead(cc); + else + crypt_free_tfms_skcipher(cc); } -static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) +static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) { unsigned i; int err; - cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), - GFP_KERNEL); - if (!cc->tfms) + cc->cipher_tfm.tfms = kzalloc(cc->tfms_count * + sizeof(struct crypto_skcipher *), GFP_KERNEL); + if (!cc->cipher_tfm.tfms) return -ENOMEM; for (i = 0; i < cc->tfms_count; i++) { - cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); - if (IS_ERR(cc->tfms[i])) { - err = PTR_ERR(cc->tfms[i]); + cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); + if (IS_ERR(cc->cipher_tfm.tfms[i])) { + err = PTR_ERR(cc->cipher_tfm.tfms[i]); crypt_free_tfms(cc); return err; } @@ -1469,22 +1875,95 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) return 0; } +static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) +{ + int err; + + cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL); + if (!cc->cipher_tfm.tfms) + return -ENOMEM; + + cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0); + if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]); + crypt_free_tfms(cc); + return err; + } + + return 0; +} + +static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) +{ + if (crypt_integrity_aead(cc)) + return crypt_alloc_tfms_aead(cc, ciphermode); + else + return crypt_alloc_tfms_skcipher(cc, ciphermode); +} + +static unsigned crypt_subkey_size(struct crypt_config *cc) +{ + return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); +} + +static unsigned crypt_authenckey_size(struct crypt_config *cc) +{ + return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param)); +} + +/* + * If AEAD is composed like authenc(hmac(sha256),xts(aes)), + * the key must be for some reason in special format. + * This funcion converts cc->key to this special format. 
+ */ +static void crypt_copy_authenckey(char *p, const void *key, + unsigned enckeylen, unsigned authkeylen) +{ + struct crypto_authenc_key_param *param; + struct rtattr *rta; + + rta = (struct rtattr *)p; + param = RTA_DATA(rta); + param->enckeylen = cpu_to_be32(enckeylen); + rta->rta_len = RTA_LENGTH(sizeof(*param)); + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + p += RTA_SPACE(sizeof(*param)); + memcpy(p, key + enckeylen, authkeylen); + p += authkeylen; + memcpy(p, key, enckeylen); +} + static int crypt_setkey(struct crypt_config *cc) { unsigned subkey_size; int err = 0, i, r; /* Ignore extra keys (which are used for IV etc) */ - subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); + subkey_size = crypt_subkey_size(cc); + if (crypt_integrity_hmac(cc)) + crypt_copy_authenckey(cc->authenc_key, cc->key, + subkey_size - cc->key_mac_size, + cc->key_mac_size); for (i = 0; i < cc->tfms_count; i++) { - r = crypto_skcipher_setkey(cc->tfms[i], - cc->key + (i * subkey_size), - subkey_size); + if (crypt_integrity_hmac(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->authenc_key, crypt_authenckey_size(cc)); + else if (crypt_integrity_aead(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->key + (i * subkey_size), + subkey_size); + else + r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i], + cc->key + (i * subkey_size), + subkey_size); if (r) err = r; } + if (crypt_integrity_hmac(cc)) + memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc)); + return err; } @@ -1633,7 +2112,8 @@ static int crypt_set_key(struct crypt_config *cc, char *key) kzfree(cc->key_string); cc->key_string = NULL; - if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + /* Decode key from its hex representation. */ + if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0) goto out; r = crypt_setkey(cc); @@ -1649,12 +2129,16 @@ out: static int crypt_wipe_key(struct crypt_config *cc) { + int r; + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); - memset(&cc->key, 0, cc->key_size * sizeof(u8)); + get_random_bytes(&cc->key, cc->key_size); kzfree(cc->key_string); cc->key_string = NULL; + r = crypt_setkey(cc); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypt_setkey(cc); + return r; } static void crypt_dtr(struct dm_target *ti) @@ -1681,6 +2165,7 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->page_pool); mempool_destroy(cc->req_pool); + mempool_destroy(cc->tag_pool); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -1691,93 +2176,22 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher); kzfree(cc->cipher_string); kzfree(cc->key_string); + kzfree(cc->cipher_auth); + kzfree(cc->authenc_key); /* Must zero key material before freeing */ kzfree(cc); } -static int crypt_ctr_cipher(struct dm_target *ti, - char *cipher_in, char *key) +static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) { struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; - char *cipher_api = NULL; - int ret = -EINVAL; - char dummy; - /* Convert to crypto api definition? 
*/ - if (strchr(cipher_in, '(')) { - ti->error = "Bad cipher specification"; - return -EINVAL; - } + if (crypt_integrity_aead(cc)) + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); - cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); - if (!cc->cipher_string) - goto bad_mem; - - /* - * Legacy dm-crypt cipher specification - * cipher[:keycount]-mode-iv:ivopts - */ - tmp = cipher_in; - keycount = strsep(&tmp, "-"); - cipher = strsep(&keycount, ":"); - - if (!keycount) - cc->tfms_count = 1; - else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || - !is_power_of_2(cc->tfms_count)) { - ti->error = "Bad cipher key count specification"; - return -EINVAL; - } - cc->key_parts = cc->tfms_count; - cc->key_extra_size = 0; - - cc->cipher = kstrdup(cipher, GFP_KERNEL); - if (!cc->cipher) - goto bad_mem; - - chainmode = strsep(&tmp, "-"); - ivopts = strsep(&tmp, "-"); - ivmode = strsep(&ivopts, ":"); - - if (tmp) - DMWARN("Ignoring unexpected additional cipher options"); - - /* - * For compatibility with the original dm-crypt mapping format, if - * only the cipher name is supplied, use cbc-plain. - */ - if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - chainmode = "cbc"; - ivmode = "plain"; - } - - if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "IV mechanism required"; - return -EINVAL; - } - - cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); - if (!cipher_api) - goto bad_mem; - - ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, - "%s(%s)", chainmode, cipher); - if (ret < 0) { - kfree(cipher_api); - goto bad_mem; - } - - /* Allocate cipher */ - ret = crypt_alloc_tfms(cc, cipher_api); - if (ret < 0) { - ti->error = "Error allocating crypto tfm"; - goto bad; - } - - /* Initialize IV */ - cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1816,17 +2230,263 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_tcw_ops; cc->key_parts += 2; /* IV + whitening */ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; + } else if (strcmp(ivmode, "random") == 0) { + cc->iv_gen_ops = &crypt_iv_random_ops; + /* Need storage space in integrity fields. */ + cc->integrity_iv_size = cc->iv_size; } else { - ret = -EINVAL; ti->error = "Invalid IV mode"; - goto bad; + return -EINVAL; } + return 0; +} + +/* + * Workaround to parse cipher algorithm from crypto API spec. + * The cc->cipher is currently used only in ESSIV. + * This should be probably done by crypto-api calls (once available...) + */ +static int crypt_ctr_blkdev_cipher(struct crypt_config *cc) +{ + const char *alg_name = NULL; + char *start, *end; + + if (crypt_integrity_aead(cc)) { + alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc))); + if (!alg_name) + return -EINVAL; + if (crypt_integrity_hmac(cc)) { + alg_name = strchr(alg_name, ','); + if (!alg_name) + return -EINVAL; + } + alg_name++; + } else { + alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc))); + if (!alg_name) + return -EINVAL; + } + + start = strchr(alg_name, '('); + end = strchr(alg_name, ')'); + + if (!start && !end) { + cc->cipher = kstrdup(alg_name, GFP_KERNEL); + return cc->cipher ? 
0 : -ENOMEM; + } + + if (!start || !end || ++start >= end) + return -EINVAL; + + cc->cipher = kzalloc(end - start + 1, GFP_KERNEL); + if (!cc->cipher) + return -ENOMEM; + + strncpy(cc->cipher, start, end - start); + + return 0; +} + +/* + * Workaround to parse HMAC algorithm from AEAD crypto API spec. + * The HMAC is needed to calculate tag size (HMAC digest size). + * This should be probably done by crypto-api calls (once available...) + */ +static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) +{ + char *start, *end, *mac_alg = NULL; + struct crypto_ahash *mac; + + if (!strstarts(cipher_api, "authenc(")) + return 0; + + start = strchr(cipher_api, '('); + end = strchr(cipher_api, ','); + if (!start || !end || ++start > end) + return -EINVAL; + + mac_alg = kzalloc(end - start + 1, GFP_KERNEL); + if (!mac_alg) + return -ENOMEM; + strncpy(mac_alg, start, end - start); + + mac = crypto_alloc_ahash(mac_alg, 0, 0); + kfree(mac_alg); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + cc->key_mac_size = crypto_ahash_digestsize(mac); + crypto_free_ahash(mac); + + cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); + if (!cc->authenc_key) + return -ENOMEM; + + return 0; +} + +static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher_api; + int ret = -EINVAL; + + cc->tfms_count = 1; + + /* + * New format (capi: prefix) + * capi:cipher_api_spec-iv:ivopts + */ + tmp = &cipher_in[strlen("capi:")]; + cipher_api = strsep(&tmp, "-"); + *ivmode = strsep(&tmp, ":"); + *ivopts = tmp; + + if (*ivmode && !strcmp(*ivmode, "lmk")) + cc->tfms_count = 64; + + cc->key_parts = cc->tfms_count; + + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + return ret; + } + + /* Alloc AEAD, can be used only in new format. 
*/ + if (crypt_integrity_aead(cc)) { + ret = crypt_ctr_auth_cipher(cc, cipher_api); + if (ret < 0) { + ti->error = "Invalid AEAD cipher spec"; + return -ENOMEM; + } + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + } else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + ret = crypt_ctr_blkdev_cipher(cc); + if (ret < 0) { + ti->error = "Cannot allocate cipher string"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *keycount; + char *cipher_api = NULL; + int ret = -EINVAL; + char dummy; + + if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) { + ti->error = "Bad cipher specification"; + return -EINVAL; + } + + /* + * Legacy dm-crypt cipher specification + * cipher[:keycount]-mode-iv:ivopts + */ + tmp = cipher_in; + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; + + chainmode = strsep(&tmp, "-"); + *ivopts = strsep(&tmp, "-"); + *ivmode = strsep(&*ivopts, ":"); + + if (tmp) + DMWARN("Ignoring unexpected additional cipher options"); + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ + if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) { + chainmode = "cbc"; + *ivmode = "plain"; + } + + if (strcmp(chainmode, "ecb") && !*ivmode) { + ti->error = "IV mechanism required"; + return -EINVAL; + } + + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; + } + + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + kfree(cipher_api); + return ret; + } + + return 0; +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *ivmode = NULL, *ivopts = NULL; + int ret; + + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) { + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; + } + + if (strstarts(cipher_in, "capi:")) + ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts); + else + ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts); + if (ret) + return ret; + + /* Initialize IV */ + ret = crypt_ctr_ivmode(ti, ivmode); + if (ret < 0) + return ret; + /* Initialize and set key */ ret = crypt_set_key(cc, key); if (ret < 0) { ti->error = "Error decoding and setting key"; - goto bad; + return ret; } /* Allocate IV */ @@ -1834,7 +2494,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); if (ret < 0) { ti->error = "Error creating IV"; - goto bad; + return ret; } } @@ -1843,18 +2503,82 @@ static int crypt_ctr_cipher(struct dm_target *ti, ret = cc->iv_gen_ops->init(cc); if (ret < 0) { ti->error = "Error initialising IV"; - goto bad; + return ret; } } - ret = 0; -bad: - 
kfree(cipher_api); return ret; +} -bad_mem: - ti->error = "Cannot allocate cipher strings"; - return -ENOMEM; +static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc = ti->private; + struct dm_arg_set as; + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + }; + unsigned int opt_params, val; + const char *opt_string, *sval; + char dummy; + int ret; + + /* Optional parameters */ + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + return ret; + + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + return -EINVAL; + } + + if (!strcasecmp(opt_string, "allow_discards")) + ti->num_discard_bios = 1; + + else if (!strcasecmp(opt_string, "same_cpu_crypt")) + set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + + else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) + set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { + if (val == 0 || val > MAX_TAG_SIZE) { + ti->error = "Invalid integrity arguments"; + return -EINVAL; + } + cc->on_disk_tag_size = val; + sval = strchr(opt_string + strlen("integrity:"), ':') + 1; + if (!strcasecmp(sval, "aead")) { + set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); + } else if (strcasecmp(sval, "none")) { + ti->error = "Unknown integrity profile"; + return -EINVAL; + } + + cc->cipher_auth = kstrdup(sval, GFP_KERNEL); + if (!cc->cipher_auth) + return -ENOMEM; + } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) { + if (cc->sector_size < (1 << SECTOR_SHIFT) || + cc->sector_size > 4096 || + (cc->sector_size & (cc->sector_size - 1))) { + ti->error = "Invalid feature value for sector_size"; + return -EINVAL; + } + cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; + } else if (!strcasecmp(opt_string, "iv_large_sectors")) + set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); + else { + ti->error = "Invalid feature arguments"; + return -EINVAL; + } + } + + return 0; } /* @@ -1865,18 +2589,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; int key_size; - unsigned int opt_params; + unsigned int align_mask; unsigned long long tmpll; int ret; - size_t iv_size_padding; - struct dm_arg_set as; - const char *opt_string; + size_t iv_size_padding, additional_req_size; char dummy; - static struct dm_arg _args[] = { - {0, 3, "Invalid number of feature args"}, - }; - if (argc < 5) { ti->error = "Not enough arguments"; return -EINVAL; @@ -1894,40 +2612,63 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } cc->key_size = key_size; + cc->sector_size = (1 << SECTOR_SHIFT); + cc->sector_shift = 0; ti->private = cc; + + /* Optional parameters need to be read before cipher constructor */ + if (argc > 5) { + ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); + if (ret) + goto bad; + } + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); if (ret < 0) goto bad; - cc->dmreq_start = sizeof(struct skcipher_request); - cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); + if (crypt_integrity_aead(cc)) { + cc->dmreq_start = sizeof(struct aead_request); + cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc)); + align_mask = crypto_aead_alignmask(any_tfm_aead(cc)); + } else { + cc->dmreq_start = sizeof(struct skcipher_request); + cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); + 
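
For reference, here is a minimal user-space sketch (not part of the patch) of how crypt_ctr_optional() above picks apart the "integrity:<bytes>:<type>" and "sector_size:<bytes>" feature strings; the option values and the printing are purely illustrative:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

int main(void)
{
	const char *opt = "integrity:32:aead";
	unsigned tag_size;
	unsigned short sector_size;
	char dummy;

	if (sscanf(opt, "integrity:%u:", &tag_size) == 1) {
		/* the profile name follows the second ':' */
		const char *type = strchr(opt + strlen("integrity:"), ':') + 1;
		printf("tag size %u, profile %s\n", tag_size, type);
	}

	opt = "sector_size:4096";
	if (sscanf(opt, "sector_size:%hu%c", &sector_size, &dummy) == 1) {
		/* must be a power of two in the 512..4096 range */
		bool pow2 = sector_size && !(sector_size & (sector_size - 1));
		printf("sector size %u, power of two: %d\n", sector_size, pow2);
	}
	return 0;
}
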
align_mask = crypto_skcipher_alignmask(any_tfm(cc)); + } cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); - if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + if (align_mask < CRYPTO_MINALIGN) { /* Allocate the padding exactly */ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) - & crypto_skcipher_alignmask(any_tfm(cc)); + & align_mask; } else { /* * If the cipher requires greater alignment than kmalloc * alignment, we don't know the exact position of the * initialization vector. We must assume worst case. */ - iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); + iv_size_padding = align_mask; } ret = -ENOMEM; - cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + - sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); + + /* ...| IV + padding | original IV | original sec. number | bio tag offset | */ + additional_req_size = sizeof(struct dm_crypt_request) + + iv_size_padding + cc->iv_size + + cc->iv_size + + sizeof(uint64_t) + + sizeof(unsigned int); + + cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; goto bad; } cc->per_bio_data_size = ti->per_io_data_size = - ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + - sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, + ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); @@ -1945,7 +2686,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) mutex_init(&cc->bio_alloc_lock); ret = -EINVAL; - if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { + if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) || + (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) { ti->error = "Invalid iv_offset sector"; goto bad; } @@ -1964,53 +2706,37 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; - argv += 5; - argc -= 5; - - /* Optional parameters */ - if (argc) { - as.argc = argc; - as.argv = argv; - - ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { + ret = crypt_integrity_ctr(cc, ti); if (ret) goto bad; - ret = -EINVAL; - while (opt_params--) { - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - goto bad; - } + cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; + if (!cc->tag_pool_max_sectors) + cc->tag_pool_max_sectors = 1; - if (!strcasecmp(opt_string, "allow_discards")) - ti->num_discard_bios = 1; - - else if (!strcasecmp(opt_string, "same_cpu_crypt")) - set_bit(DM_CRYPT_SAME_CPU, &cc->flags); - - else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) - set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); - - else { - ti->error = "Invalid feature arguments"; - goto bad; - } + cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS, + cc->tag_pool_max_sectors * cc->on_disk_tag_size); + if (!cc->tag_pool) { + ti->error = "Cannot allocate integrity tags mempool"; + goto bad; } + + cc->tag_pool_max_sectors <<= cc->sector_shift; } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, 
&cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); else - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; @@ -2061,12 +2787,39 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * Check if bio is too large, split as needed. */ if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && - bio_data_dir(bio) == WRITE) + (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + /* + * Ensure that bio is a multiple of internal sector encryption size + * and is aligned to this size as defined in IO hints. + */ + if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) + return -EIO; + + if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) + return -EIO; + io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - io->ctx.req = (struct skcipher_request *)(io + 1); + + if (cc->on_disk_tag_size) { + unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); + + if (unlikely(tag_len > KMALLOC_MAX_SIZE) || + unlikely(!(io->integrity_metadata = kmalloc(tag_len, + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { + if (bio_sectors(bio) > cc->tag_pool_max_sectors) + dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); + io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); + io->integrity_metadata_from_pool = true; + } + } + + if (crypt_integrity_aead(cc)) + io->ctx.r.req_aead = (struct aead_request *)(io + 1); + else + io->ctx.r.req = (struct skcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) @@ -2107,6 +2860,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); + num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); + if (cc->on_disk_tag_size) + num_feature_args++; if (num_feature_args) { DMEMIT(" %d", num_feature_args); if (ti->num_discard_bios) @@ -2115,6 +2872,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" same_cpu_crypt"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); + if (cc->on_disk_tag_size) + DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); + if (cc->sector_size != (1 << SECTOR_SHIFT)) + DMEMIT(" sector_size:%d", cc->sector_size); + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + DMEMIT(" iv_large_sectors"); } break; @@ -2204,6 +2967,8 @@ static int crypt_iterate_devices(struct dm_target *ti, static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { + struct crypt_config *cc = ti->private; + /* * Unfortunate constraint that is required to avoid the potential * for exceeding underlying device's max_segments limits -- due to @@ -2211,11 +2976,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) * 
bio that are not as physically contiguous as the original bio. */ limits->max_segment_size = PAGE_SIZE; + + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + limits->logical_block_size = cc->sector_size; + limits->physical_block_size = cc->sector_size; + blk_limits_io_min(limits, cc->sector_size); + } } static struct target_type crypt_target = { .name = "crypt", - .version = {1, 15, 0}, + .version = {1, 17, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc70871a6d29..ae3158795d26 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -340,6 +340,7 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 2, 1}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 9fab33b113c4..e7ba89f98d8d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = { * Low level metadata handling *--------------------------------------------------------------*/ #define DM_ERA_METADATA_BLOCK_SIZE 4096 -#define DM_ERA_METADATA_CACHE_SIZE 64 #define ERA_MAX_CONCURRENT_LOCKS 5 struct era_metadata { @@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md, int r; md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, - DM_ERA_METADATA_CACHE_SIZE, ERA_MAX_CONCURRENT_LOCKS); if (IS_ERR(md->bm)) { DMERR("could not create block manager"); @@ -961,18 +959,18 @@ static int metadata_commit(struct era_metadata *md) } } - r = save_sm_root(md); - if (r) { - DMERR("%s: save_sm_root failed", __func__); - return r; - } - r = dm_tm_pre_commit(md->tm); if (r) { DMERR("%s: pre commit failed", __func__); return r; } + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + r = superblock_lock(md, &sblock); if (r) { DMERR("%s: superblock lock failed", __func__); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000000..c7f7c8d76576 --- /dev/null +++ b/drivers/md/dm-integrity.c @@ -0,0 +1,3238 @@ +/* + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2016-2017 Milan Broz + * Copyright (C) 2016-2017 Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "integrity" + +#define DEFAULT_INTERLEAVE_SECTORS 32768 +#define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_BUFFER_SECTORS 128 +#define DEFAULT_JOURNAL_WATERMARK 50 +#define DEFAULT_SYNC_MSEC 10000 +#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define MIN_LOG2_INTERLEAVE_SECTORS 3 +#define MAX_LOG2_INTERLEAVE_SECTORS 31 +#define METADATA_WORKQUEUE_MAX_ACTIVE 16 + +/* + * Warning - DEBUG_PRINT prints security-sensitive data to the log, + * so it should not be enabled in the official kernel + */ +//#define DEBUG_PRINT +//#define INTERNAL_VERIFY + +/* + * On disk structures + */ + +#define SB_MAGIC "integrt" +#define SB_VERSION 1 +#define SB_SECTORS 8 +#define MAX_SECTORS_PER_BLOCK 8 + +struct superblock { + __u8 magic[8]; + __u8 version; + __u8 log2_interleave_sectors; + __u16 integrity_tag_size; + __u32 journal_sections; + __u64 provided_data_sectors; /* userspace uses this value */ + __u32 flags; + __u8 log2_sectors_per_block; +}; + +#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 + +#define JOURNAL_ENTRY_ROUNDUP 8 + +typedef __u64 commit_id_t; +#define JOURNAL_MAC_PER_SECTOR 8 + +struct journal_entry { + union { + struct { + __u32 sector_lo; + __u32 sector_hi; + } s; + __u64 sector; + } u; + commit_id_t last_bytes[0]; + /* __u8 tag[0]; */ +}; + +#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) + +#if BITS_PER_LONG == 64 +#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#elif defined(CONFIG_LBDAF) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#else +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#endif +#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) +#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) +#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) +#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) + +#define JOURNAL_BLOCK_SECTORS 8 +#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) +#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) + +struct journal_sector { + __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; + __u8 mac[JOURNAL_MAC_PER_SECTOR]; + commit_id_t commit_id; +}; + +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) + +#define METADATA_PADDING_SECTORS 8 + +#define N_COMMIT_IDS 4 + +static unsigned char prev_commit_seq(unsigned char seq) +{ + return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; +} + +static unsigned char next_commit_seq(unsigned char seq) +{ + return (seq + 1) % N_COMMIT_IDS; +} + +/* + * In-memory structures + */ + +struct journal_node { + struct rb_node node; + sector_t sector; +}; + +struct alg_spec { + char *alg_string; + char *key_string; + __u8 *key; + unsigned key_size; +}; + +struct 
dm_integrity_c { + struct dm_dev *dev; + unsigned tag_size; + __s8 log2_tag_size; + sector_t start; + mempool_t *journal_io_mempool; + struct dm_io_client *io; + struct dm_bufio_client *bufio; + struct workqueue_struct *metadata_wq; + struct superblock *sb; + unsigned journal_pages; + struct page_list *journal; + struct page_list *journal_io; + struct page_list *journal_xor; + + struct crypto_skcipher *journal_crypt; + struct scatterlist **journal_scatterlist; + struct scatterlist **journal_io_scatterlist; + struct skcipher_request **sk_requests; + + struct crypto_shash *journal_mac; + + struct journal_node *journal_tree; + struct rb_root journal_tree_root; + + sector_t provided_data_sectors; + + unsigned short journal_entry_size; + unsigned char journal_entries_per_sector; + unsigned char journal_section_entries; + unsigned short journal_section_sectors; + unsigned journal_sections; + unsigned journal_entries; + sector_t device_sectors; + unsigned initial_sectors; + unsigned metadata_run; + __s8 log2_metadata_run; + __u8 log2_buffer_sectors; + __u8 sectors_per_block; + + unsigned char mode; + bool suspending; + + int failed; + + struct crypto_shash *internal_hash; + + /* these variables are locked with endio_wait.lock */ + struct rb_root in_progress; + wait_queue_head_t endio_wait; + struct workqueue_struct *wait_wq; + + unsigned char commit_seq; + commit_id_t commit_ids[N_COMMIT_IDS]; + + unsigned committed_section; + unsigned n_committed_sections; + + unsigned uncommitted_section; + unsigned n_uncommitted_sections; + + unsigned free_section; + unsigned char free_section_entry; + unsigned free_sectors; + + unsigned free_sectors_threshold; + + struct workqueue_struct *commit_wq; + struct work_struct commit_work; + + struct workqueue_struct *writer_wq; + struct work_struct writer_work; + + struct bio_list flush_bio_list; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + unsigned autocommit_msec; + + wait_queue_head_t copy_to_journal_wait; + + struct completion crypto_backoff; + + bool journal_uptodate; + bool just_formatted; + + struct alg_spec internal_hash_alg; + struct alg_spec journal_crypt_alg; + struct alg_spec journal_mac_alg; +}; + +struct dm_integrity_range { + sector_t logical_sector; + unsigned n_sectors; + struct rb_node node; +}; + +struct dm_integrity_io { + struct work_struct work; + + struct dm_integrity_c *ic; + bool write; + bool fua; + + struct dm_integrity_range range; + + sector_t metadata_block; + unsigned metadata_offset; + + atomic_t in_flight; + int bi_error; + + struct completion *completion; + + struct block_device *orig_bi_bdev; + bio_end_io_t *orig_bi_end_io; + struct bio_integrity_payload *orig_bi_integrity; + struct bvec_iter orig_bi_iter; +}; + +struct journal_completion { + struct dm_integrity_c *ic; + atomic_t in_flight; + struct completion comp; +}; + +struct journal_io { + struct dm_integrity_range range; + struct journal_completion *comp; +}; + +static struct kmem_cache *journal_io_cache; + +#define JOURNAL_IO_MEMPOOL 32 + +#ifdef DEBUG_PRINT +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + vprintk(msg, args); + va_end(args); + if (len) + pr_cont(":"); + while (len) { + pr_cont(" %02x", *bytes); + bytes++; + len--; + } + pr_cont("\n"); +} +#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#else +#define DEBUG_print(x, ...) 
do { } while (0) +#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) +#endif + +/* + * DM Integrity profile, protection is performed layer above (dm-crypt) + */ +static struct blk_integrity_profile dm_integrity_profile = { + .name = "DM-DIF-EXT-TAG", + .generate_fn = NULL, + .verify_fn = NULL, +}; + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static void integrity_bio_wait(struct work_struct *w); +static void dm_integrity_dtr(struct dm_target *ti); + +static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) +{ + if (!cmpxchg(&ic->failed, 0, err)) + DMERR("Error on %s: %d", msg, err); +} + +static int dm_integrity_failed(struct dm_integrity_c *ic) +{ + return ACCESS_ONCE(ic->failed); +} + +static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, + unsigned j, unsigned char seq) +{ + /* + * Xor the number with section and sector, so that if a piece of + * journal is written at wrong place, it is detected. + */ + return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); +} + +static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, + sector_t *area, sector_t *offset) +{ + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); +} + +#define sector_to_block(ic, n) \ +do { \ + BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \ + (n) >>= (ic)->sb->log2_sectors_per_block; \ +} while (0) + +static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, + sector_t offset, unsigned *metadata_offset) +{ + __u64 ms; + unsigned mo; + + ms = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + ms += area << ic->log2_metadata_run; + else + ms += area * ic->metadata_run; + ms >>= ic->log2_buffer_sectors; + + sector_to_block(ic, offset); + + if (likely(ic->log2_tag_size >= 0)) { + ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); + mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } else { + ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); + mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } + *metadata_offset = mo; + return ms; +} + +static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) +{ + sector_t result; + + result = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + result += (area + 1) << ic->log2_metadata_run; + else + result += (area + 1) * ic->metadata_run; + + result += (sector_t)ic->initial_sectors + offset; + return result; +} + +static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) +{ + if (unlikely(*sec_ptr >= ic->journal_sections)) + *sec_ptr -= ic->journal_sections; +} + +static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = ic->sb; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start; + io_loc.count = SB_SECTORS; + + return dm_io(&io_req, 1, &io_loc, NULL); +} + +static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, + bool e, const char *function) 
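
As a rough, stand-alone illustration of the interleaved layout arithmetic in get_area_and_offset() and get_data_sector() above, the sketch below uses invented parameters and only the power-of-two metadata-run branch; a real device takes these values from the superblock:

#include <stdio.h>
#include <stdint.h>

#define LOG2_INTERLEAVE_SECTORS	15	/* illustrative: 32768 sectors per data area */
#define LOG2_METADATA_RUN	5	/* illustrative: metadata run of 32 sectors */
#define INITIAL_SECTORS		72	/* illustrative: reserved + superblock + journal */

/* Split a logical data sector into (area, offset within area). */
static void toy_area_and_offset(uint64_t data_sector, uint64_t *area, uint64_t *offset)
{
	*area = data_sector >> LOG2_INTERLEAVE_SECTORS;
	*offset = data_sector & ((1ULL << LOG2_INTERLEAVE_SECTORS) - 1);
}

/* Each data area is preceded by one metadata run, hence the (area + 1) factor. */
static uint64_t toy_data_sector(uint64_t area, uint64_t offset)
{
	uint64_t result = area << LOG2_INTERLEAVE_SECTORS;

	result += (area + 1) << LOG2_METADATA_RUN;
	result += INITIAL_SECTORS + offset;
	return result;
}

int main(void)
{
	uint64_t area, offset;

	toy_area_and_offset(100000, &area, &offset);
	printf("logical sector 100000 -> area %llu, offset %llu, device sector %llu\n",
	       (unsigned long long)area, (unsigned long long)offset,
	       (unsigned long long)toy_data_sector(area, offset));
	return 0;
}
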
+{ +#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) + unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors; + + if (unlikely(section >= ic->journal_sections) || + unlikely(offset >= limit)) { + printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", + function, section, offset, ic->journal_sections, limit); + BUG(); + } +#endif +} + +static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned *pl_index, unsigned *pl_offset) +{ + unsigned sector; + + access_journal_check(ic, section, offset, false, "page_list_location"); + + sector = section * ic->journal_section_sectors + offset; + + *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); +} + +static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, + unsigned section, unsigned offset, unsigned *n_sectors) +{ + unsigned pl_index, pl_offset; + char *va; + + page_list_location(ic, section, offset, &pl_index, &pl_offset); + + if (n_sectors) + *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; + + va = lowmem_page_address(pl[pl_index].page); + + return (struct journal_sector *)(va + pl_offset); +} + +static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset) +{ + return access_page_list(ic, ic->journal, section, offset, NULL); +} + +static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + unsigned rel_sector, offset; + struct journal_sector *js; + + access_journal_check(ic, section, n, true, "access_journal_entry"); + + rel_sector = n % JOURNAL_BLOCK_SECTORS; + offset = n / JOURNAL_BLOCK_SECTORS; + + js = access_journal(ic, section, rel_sector); + return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); +} + +static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + n <<= ic->sb->log2_sectors_per_block; + + n += JOURNAL_BLOCK_SECTORS; + + access_journal_check(ic, section, n, false, "access_journal_data"); + + return access_journal(ic, section, n); +} + +static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned j, size; + + desc->tfm = ic->journal_mac; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto err; + } + + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, section, j); + r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + size = crypto_shash_digestsize(ic->journal_mac); + + if (likely(size <= JOURNAL_MAC_SIZE)) { + r = crypto_shash_final(desc, result); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memset(result + size, 0, JOURNAL_MAC_SIZE - size); + } else { + __u8 digest[size]; + r = crypto_shash_final(desc, digest); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memcpy(result, digest, JOURNAL_MAC_SIZE); + } + + return; +err: + memset(result, 0, JOURNAL_MAC_SIZE); +} + +static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr) +{ + __u8 result[JOURNAL_MAC_SIZE]; + unsigned j; 
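
page_list_location() above reduces a journal sector offset to a page index and byte offset within the journal page list; the core of that conversion, assuming 4 KiB pages, is plain shift-and-mask arithmetic:

#include <stdio.h>

#define TOY_SECTOR_SHIFT	9	/* 512-byte sectors */
#define TOY_PAGE_SHIFT		12	/* assume 4 KiB pages */
#define TOY_PAGE_SIZE		(1UL << TOY_PAGE_SHIFT)

/* Map a sector offset inside the in-memory journal to (page index, byte offset). */
static void toy_page_list_location(unsigned sector, unsigned *pl_index, unsigned *pl_offset)
{
	*pl_index = sector >> (TOY_PAGE_SHIFT - TOY_SECTOR_SHIFT);	/* 8 sectors per page */
	*pl_offset = (sector << TOY_SECTOR_SHIFT) & (TOY_PAGE_SIZE - 1);
}

int main(void)
{
	unsigned idx, off;

	toy_page_list_location(21, &idx, &off);
	printf("journal sector 21 -> page %u, byte offset %u\n", idx, off);	/* page 2, offset 2560 */
	return 0;
}
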
+ + if (!ic->journal_mac) + return; + + section_mac(ic, section, result); + + for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { + struct journal_sector *js = access_journal(ic, section, j); + + if (likely(wr)) + memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); + else { + if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) + dm_integrity_io_error(ic, "journal mac", -EILSEQ); + } + } +} + +static void complete_journal_op(void *context) +{ + struct journal_completion *comp = context; + BUG_ON(!atomic_read(&comp->in_flight)); + if (likely(atomic_dec_and_test(&comp->in_flight))) + complete(&comp->comp); +} + +static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct async_submit_ctl submit; + size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; + unsigned pl_index, pl_offset, section_index; + struct page_list *source_pl, *target_pl; + + if (likely(encrypt)) { + source_pl = ic->journal; + target_pl = ic->journal_io; + } else { + source_pl = ic->journal_io; + target_pl = ic->journal; + } + + page_list_location(ic, section, 0, &pl_index, &pl_offset); + + atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); + + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); + + section_index = pl_index; + + do { + size_t this_step; + struct page *src_pages[2]; + struct page *dst_page; + + while (unlikely(pl_index == section_index)) { + unsigned dummy; + if (likely(encrypt)) + rw_section_mac(ic, section, true); + section++; + n_sections--; + if (!n_sections) + break; + page_list_location(ic, section, 0, §ion_index, &dummy); + } + + this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); + dst_page = target_pl[pl_index].page; + src_pages[0] = source_pl[pl_index].page; + src_pages[1] = ic->journal_xor[pl_index].page; + + async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); + + pl_index++; + pl_offset = 0; + n_bytes -= this_step; + } while (n_bytes); + + BUG_ON(n_sections); + + async_tx_issue_pending_all(); +} + +static void complete_journal_encrypt(struct crypto_async_request *req, int err) +{ + struct journal_completion *comp = req->data; + if (unlikely(err)) { + if (likely(err == -EINPROGRESS)) { + complete(&comp->ic->crypto_backoff); + return; + } + dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); + } + complete_journal_op(comp); +} + +static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) +{ + int r; + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + complete_journal_encrypt, comp); + if (likely(encrypt)) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + if (likely(!r)) + return false; + if (likely(r == -EINPROGRESS)) + return true; + if (likely(r == -EBUSY)) { + wait_for_completion(&comp->ic->crypto_backoff); + reinit_completion(&comp->ic->crypto_backoff); + return true; + } + dm_integrity_io_error(comp->ic, "encrypt", r); + return false; +} + +static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct scatterlist **source_sg; + struct scatterlist **target_sg; + + atomic_add(2, &comp->in_flight); + + if (likely(encrypt)) { + source_sg = ic->journal_scatterlist; + target_sg = ic->journal_io_scatterlist; + } else { + 
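
The XOR flavour of journal protection above (xor_journal()) works both ways with the same operation: XORing against the keystream pages produces the on-disk form on the write path and recovers the plaintext journal again on replay. A trivial stand-alone demonstration with a made-up keystream:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TOY_LEN 16

static void toy_xor(uint8_t *dst, const uint8_t *src, const uint8_t *keystream, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[i] = src[i] ^ keystream[i];
}

int main(void)
{
	uint8_t journal[TOY_LEN] = "journal payload";	/* stand-in journal data */
	uint8_t keystream[TOY_LEN], on_disk[TOY_LEN], replayed[TOY_LEN];
	size_t i;

	for (i = 0; i < TOY_LEN; i++)
		keystream[i] = (uint8_t)(0xA5 ^ i);	/* made-up keystream page */

	toy_xor(on_disk, journal, keystream, TOY_LEN);		/* write path */
	toy_xor(replayed, on_disk, keystream, TOY_LEN);		/* journal replay */

	printf("round trip intact: %d\n", memcmp(journal, replayed, TOY_LEN) == 0);
	return 0;
}
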
source_sg = ic->journal_io_scatterlist; + target_sg = ic->journal_scatterlist; + } + + do { + struct skcipher_request *req; + unsigned ivsize; + char *iv; + + if (likely(encrypt)) + rw_section_mac(ic, section, true); + + req = ic->sk_requests[section]; + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + iv = req->iv; + + memcpy(iv, iv + ivsize, ivsize); + + req->src = source_sg[section]; + req->dst = target_sg[section]; + + if (unlikely(do_crypt(encrypt, req, comp))) + atomic_inc(&comp->in_flight); + + section++; + n_sections--; + } while (n_sections); + + atomic_dec(&comp->in_flight); + complete_journal_op(comp); +} + +static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + if (ic->journal_xor) + return xor_journal(ic, encrypt, section, n_sections, comp); + else + return crypt_journal(ic, encrypt, section, n_sections, comp); +} + +static void complete_journal_io(unsigned long error, void *context) +{ + struct journal_completion *comp = context; + if (unlikely(error != 0)) + dm_integrity_io_error(comp->ic, "writing journal", -EIO); + complete_journal_op(comp); +} + +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + unsigned sector, n_sectors, pl_index, pl_offset; + int r; + + if (unlikely(dm_integrity_failed(ic))) { + if (comp) + complete_journal_io(-1UL, comp); + return; + } + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_PAGE_LIST; + if (ic->journal_io) + io_req.mem.ptr.pl = &ic->journal_io[pl_index]; + else + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + if (likely(comp != NULL)) { + io_req.notify.fn = complete_journal_io; + io_req.notify.context = comp; + } else { + io_req.notify.fn = NULL; + } + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + SB_SECTORS + sector; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, op == REQ_OP_READ ? 
"reading journal" : "writing journal", r); + if (comp) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + complete_journal_io(-1UL, comp); + } + } +} + +static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) +{ + struct journal_completion io_comp; + struct journal_completion crypt_comp_1; + struct journal_completion crypt_comp_2; + unsigned i; + + io_comp.ic = ic; + io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); + + if (commit_start + commit_sections <= ic->journal_sections) { + io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + for (i = 0; i < commit_sections; i++) + rw_section_mac(ic, commit_start + i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp); + } else { + unsigned to_end; + io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); + to_end = ic->journal_sections - commit_start; + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); + if (try_wait_for_completion(&crypt_comp_1.comp)) { + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + crypt_comp_2.ic = ic; + crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); + crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); + wait_for_completion_io(&crypt_comp_1.comp); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + wait_for_completion_io(&crypt_comp_2.comp); + } + } else { + for (i = 0; i < to_end; i++) + rw_section_mac(ic, commit_start + i, true); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + for (i = 0; i < commit_sections - to_end; i++) + rw_section_mac(ic, i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); + } + + wait_for_completion_io(&io_comp.comp); +} + +static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned n_sectors, sector_t target, io_notify_fn fn, void *data) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + int r; + unsigned sector, pl_index, pl_offset; + + BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1)); + + if (unlikely(dm_integrity_failed(ic))) { + fn(-1UL, data); + return; + } + + sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = REQ_OP_WRITE; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_PAGE_LIST; + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + io_req.notify.fn = fn; + io_req.notify.context = data; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + target; + 
io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + fn(-1UL, data); + } +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + struct rb_node **n = &ic->in_progress.rb_node; + struct rb_node *parent; + + BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); + + parent = NULL; + + while (*n) { + struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); + + parent = *n; + if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { + n = &range->node.rb_left; + } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { + n = &range->node.rb_right; + } else { + return false; + } + } + + rb_link_node(&new_range->node, parent, n); + rb_insert_color(&new_range->node, &ic->in_progress); + + return true; +} + +static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + rb_erase(&range->node, &ic->in_progress); + wake_up_locked(&ic->endio_wait); +} + +static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + remove_range_unlocked(ic, range); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); +} + +static void init_journal_node(struct journal_node *node) +{ + RB_CLEAR_NODE(&node->node); + node->sector = (sector_t)-1; +} + +static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) +{ + struct rb_node **link; + struct rb_node *parent; + + node->sector = sector; + BUG_ON(!RB_EMPTY_NODE(&node->node)); + + link = &ic->journal_tree_root.rb_node; + parent = NULL; + + while (*link) { + struct journal_node *j; + parent = *link; + j = container_of(parent, struct journal_node, node); + if (sector < j->sector) + link = &j->node.rb_left; + else + link = &j->node.rb_right; + } + + rb_link_node(&node->node, parent, link); + rb_insert_color(&node->node, &ic->journal_tree_root); +} + +static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + BUG_ON(RB_EMPTY_NODE(&node->node)); + rb_erase(&node->node, &ic->journal_tree_root); + init_journal_node(node); +} + +#define NOT_FOUND (-1U) + +static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) +{ + struct rb_node *n = ic->journal_tree_root.rb_node; + unsigned found = NOT_FOUND; + *next_sector = (sector_t)-1; + while (n) { + struct journal_node *j = container_of(n, struct journal_node, node); + if (sector == j->sector) { + found = j - ic->journal_tree; + } + if (sector < j->sector) { + *next_sector = j->sector; + n = j->node.rb_left; + } else { + n = j->node.rb_right; + } + } + + return found; +} + +static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector) +{ + struct journal_node *node, *next_node; + struct rb_node *next; + + if (unlikely(pos >= ic->journal_entries)) + return false; + node = &ic->journal_tree[pos]; + if (unlikely(RB_EMPTY_NODE(&node->node))) + return false; + if (unlikely(node->sector != sector)) + return false; + + next = rb_next(&node->node); + if (unlikely(!next)) + return true; + + next_node = container_of(next, struct journal_node, node); + return next_node->sector != sector; +} + +static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + struct rb_node *next; + 
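
add_new_range() above admits a new I/O range only if it does not overlap anything already in flight; while walking the rb-tree, the conflict test is the usual interval comparison, sketched here without the tree:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct toy_range {
	uint64_t logical_sector;
	unsigned n_sectors;
};

/* Two ranges conflict unless one ends at or before the start of the other. */
static bool toy_ranges_conflict(const struct toy_range *a, const struct toy_range *b)
{
	if (a->logical_sector + a->n_sectors <= b->logical_sector)
		return false;
	if (b->logical_sector + b->n_sectors <= a->logical_sector)
		return false;
	return true;
}

int main(void)
{
	struct toy_range in_flight = { .logical_sector = 1024, .n_sectors = 64 };
	struct toy_range incoming  = { .logical_sector = 1080, .n_sectors = 16 };

	printf("conflict: %s\n", toy_ranges_conflict(&in_flight, &incoming) ? "yes" : "no");
	return 0;
}
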
struct journal_node *next_node; + unsigned next_section; + + BUG_ON(RB_EMPTY_NODE(&node->node)); + + next = rb_next(&node->node); + if (unlikely(!next)) + return false; + + next_node = container_of(next, struct journal_node, node); + + if (next_node->sector != node->sector) + return false; + + next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries; + if (next_section >= ic->committed_section && + next_section < ic->committed_section + ic->n_committed_sections) + return true; + if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) + return true; + + return false; +} + +#define TAG_READ 0 +#define TAG_WRITE 1 +#define TAG_CMP 2 + +static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, + unsigned *metadata_offset, unsigned total_size, int op) +{ + do { + unsigned char *data, *dp; + struct dm_buffer *b; + unsigned to_copy; + int r; + + r = dm_integrity_failed(ic); + if (unlikely(r)) + return r; + + data = dm_bufio_read(ic->bufio, *metadata_block, &b); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); + dp = data + *metadata_offset; + if (op == TAG_READ) { + memcpy(tag, dp, to_copy); + } else if (op == TAG_WRITE) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_buffer_dirty(b); + } else { + /* e.g.: op == TAG_CMP */ + if (unlikely(memcmp(dp, tag, to_copy))) { + unsigned i; + + for (i = 0; i < to_copy; i++) { + if (dp[i] != tag[i]) + break; + total_size--; + } + dm_bufio_release(b); + return total_size; + } + } + dm_bufio_release(b); + + tag += to_copy; + *metadata_offset += to_copy; + if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { + (*metadata_block)++; + *metadata_offset = 0; + } + total_size -= to_copy; + } while (unlikely(total_size)); + + return 0; +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +{ + int r; + r = dm_bufio_write_dirty_buffers(ic->bufio); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing tags", r); +} + +static void sleep_on_endio_wait(struct dm_integrity_c *ic) +{ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&ic->endio_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + __remove_wait_queue(&ic->endio_wait, &wait); +} + +static void autocommit_fn(unsigned long data) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + + if (likely(!dm_integrity_failed(ic))) + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void schedule_autocommit(struct dm_integrity_c *ic) +{ + if (!timer_pending(&ic->autocommit_timer)) + mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); +} + +static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio; + spin_lock_irq(&ic->endio_wait.lock); + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + bio_list_add(&ic->flush_bio_list, bio); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void do_endio(struct dm_integrity_c *ic, struct bio *bio) +{ + int r = dm_integrity_failed(ic); + if (unlikely(r) && !bio->bi_error) + bio->bi_error = r; + bio_endio(bio); +} + +static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct 
dm_integrity_io)); + + if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + submit_flush_bio(ic, dio); + else + do_endio(ic, bio); +} + +static void dec_in_flight(struct dm_integrity_io *dio) +{ + if (atomic_dec_and_test(&dio->in_flight)) { + struct dm_integrity_c *ic = dio->ic; + struct bio *bio; + + remove_range(ic, &dio->range); + + if (unlikely(dio->write)) + schedule_autocommit(ic); + + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->bi_error) && !bio->bi_error) + bio->bi_error = dio->bi_error; + if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + dio->range.logical_sector += dio->range.n_sectors; + bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } + do_endio_flush(ic, dio); + } +} + +static void integrity_end_io(struct bio *bio) +{ + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + bio->bi_iter = dio->orig_bi_iter; + bio->bi_bdev = dio->orig_bi_bdev; + if (dio->orig_bi_integrity) { + bio->bi_integrity = dio->orig_bi_integrity; + bio->bi_opf |= REQ_INTEGRITY; + } + bio->bi_end_io = dio->orig_bi_end_io; + + if (dio->completion) + complete(dio->completion); + + dec_in_flight(dio); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, + const char *data, char *result) +{ + __u64 sector_le = cpu_to_le64(sector); + SHASH_DESC_ON_STACK(req, ic->internal_hash); + int r; + unsigned digest_size; + + req->tfm = ic->internal_hash; + req->flags = 0; + + r = crypto_shash_init(req); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto failed; + } + + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_final(req, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto failed; + } + + digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_metadata(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + + int r; + + if (ic->internal_hash) { + struct bvec_iter iter; + struct bio_vec bv; + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + char *checksums; + unsigned extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; + char checksums_onstack[ic->tag_size + extra_space]; + unsigned sectors_to_process = dio->range.n_sectors; + sector_t sector = dio->range.logical_sector; + + if (unlikely(ic->mode == 'R')) + goto skip_io; + + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (!checksums) + checksums = checksums_onstack; + + __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { + unsigned pos; + char *mem, *checksums_ptr; + +again: + mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset; + pos = 0; + checksums_ptr = checksums; + do { + integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + checksums_ptr += ic->tag_size; + sectors_to_process -= ic->sectors_per_block; + pos += ic->sectors_per_block << SECTOR_SHIFT; + sector += ic->sectors_per_block; + } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); + kunmap_atomic(mem); + + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); + if (unlikely(r)) { + if (r > 0) { + DMERR("Checksum failed at sector 0x%llx", + (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + r = -EILSEQ; + } + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + if (!sectors_to_process) + break; + + if (unlikely(pos < bv.bv_len)) { + bv.bv_offset += pos; + bv.bv_len -= pos; + goto again; + } + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + } else { + struct bio_integrity_payload *bip = dio->orig_bi_integrity; + + if (bip) { + struct bio_vec biv; + struct bvec_iter iter; + unsigned data_to_process = dio->range.n_sectors; + sector_to_block(ic, data_to_process); + data_to_process *= ic->tag_size; + + bip_for_each_vec(biv, bip, iter) { + unsigned char *tag; + unsigned this_len; + + BUG_ON(PageHighMem(biv.bv_page)); + tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + this_len = min(biv.bv_len, data_to_process); + r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, + this_len, !dio->write ? TAG_READ : TAG_WRITE); + if (unlikely(r)) + goto error; + data_to_process -= this_len; + if (!data_to_process) + break; + } + } + } +skip_io: + dec_in_flight(dio); + return; +error: + dio->bi_error = r; + dec_in_flight(dio); +} + +static int dm_integrity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + struct bio_integrity_payload *bip; + + sector_t area, offset; + + dio->ic = ic; + dio->bi_error = 0; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + submit_flush_bio(ic, dio); + return DM_MAPIO_SUBMITTED; + } + + dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + dio->write = bio_op(bio) == REQ_OP_WRITE; + dio->fua = dio->write && bio->bi_opf & REQ_FUA; + if (unlikely(dio->fua)) { + /* + * Don't pass down the FUA flag because we have to flush + * disk cache anyway. 
+ */ + bio->bi_opf &= ~REQ_FUA; + } + if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + (unsigned long long)dio->range.logical_sector, bio_sectors(bio), + (unsigned long long)ic->provided_data_sectors); + return -EIO; + } + if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { + DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", + ic->sectors_per_block, + (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); + return -EIO; + } + + if (ic->sectors_per_block > 1) { + struct bvec_iter iter; + struct bio_vec bv; + bio_for_each_segment(bv, bio, iter) { + if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", + bv.bv_offset, bv.bv_len, ic->sectors_per_block); + return -EIO; + } + } + } + + bip = bio_integrity(bio); + if (!ic->internal_hash) { + if (bip) { + unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block; + if (ic->log2_tag_size >= 0) + wanted_tag_size <<= ic->log2_tag_size; + else + wanted_tag_size *= ic->tag_size; + if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { + DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + return -EIO; + } + } + } else { + if (unlikely(bip != NULL)) { + DMERR("Unexpected integrity data when using internal hash"); + return -EIO; + } + } + + if (unlikely(ic->mode == 'R') && unlikely(dio->write)) + return -EIO; + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); + + dm_integrity_map_continue(dio, true); + return DM_MAPIO_SUBMITTED; +} + +static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, + unsigned journal_section, unsigned journal_entry) +{ + struct dm_integrity_c *ic = dio->ic; + sector_t logical_sector; + unsigned n_sectors; + + logical_sector = dio->range.logical_sector; + n_sectors = dio->range.n_sectors; + do { + struct bio_vec bv = bio_iovec(bio); + char *mem; + + if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors)) + bv.bv_len = n_sectors << SECTOR_SHIFT; + n_sectors -= bv.bv_len >> SECTOR_SHIFT; + bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); +retry_kmap: + mem = kmap_atomic(bv.bv_page); + if (likely(dio->write)) + flush_dcache_page(bv.bv_page); + + do { + struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); + + if (unlikely(!dio->write)) { + struct journal_sector *js; + char *mem_ptr; + unsigned s; + + if (unlikely(journal_entry_is_inprogress(je))) { + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + + __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + goto retry_kmap; + } + smp_rmb(); + BUG_ON(journal_entry_get_sector(je) != logical_sector); + js = access_journal_data(ic, journal_section, journal_entry); + mem_ptr = mem + bv.bv_offset; + s = 0; + do { + memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA); + *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s]; + js++; + mem_ptr += 1 << SECTOR_SHIFT; + } while (++s < ic->sectors_per_block); +#ifdef INTERNAL_VERIFY + if (ic->internal_hash) { + char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + + integrity_sector_checksum(ic, logical_sector, mem + 
bv.bv_offset, checksums_onstack); + if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + DMERR("Checksum failed when reading from journal, at sector 0x%llx", + (unsigned long long)logical_sector); + } + } +#endif + } + + if (!ic->internal_hash) { + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned tag_todo = ic->tag_size; + char *tag_ptr = journal_entry_tag(ic, je); + + if (bip) do { + struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + unsigned tag_now = min(biv.bv_len, tag_todo); + char *tag_addr; + BUG_ON(PageHighMem(biv.bv_page)); + tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + if (likely(dio->write)) + memcpy(tag_ptr, tag_addr, tag_now); + else + memcpy(tag_addr, tag_ptr, tag_now); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now); + tag_ptr += tag_now; + tag_todo -= tag_now; + } while (unlikely(tag_todo)); else { + if (likely(dio->write)) + memset(tag_ptr, 0, tag_todo); + } + } + + if (likely(dio->write)) { + struct journal_sector *js; + unsigned s; + + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT); + + s = 0; + do { + je->last_bytes[s] = js[s].commit_id; + } while (++s < ic->sectors_per_block); + + if (ic->internal_hash) { + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size > ic->tag_size)) { + char checksums_onstack[digest_size]; + integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); + } else + integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + } + + journal_entry_set_sector(je, logical_sector); + } + logical_sector += ic->sectors_per_block; + + journal_entry++; + if (unlikely(journal_entry == ic->journal_section_entries)) { + journal_entry = 0; + journal_section++; + wraparound_section(ic, &journal_section); + } + + bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; + } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); + + if (unlikely(!dio->write)) + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + } while (n_sectors); + + if (likely(dio->write)) { + smp_mb(); + if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) + wake_up(&ic->copy_to_journal_wait); + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + queue_work(ic->commit_wq, &ic->commit_work); + } else { + schedule_autocommit(ic); + } + } else { + remove_range(ic, &dio->range); + } + + if (unlikely(bio->bi_iter.bi_size)) { + sector_t area, offset; + + dio->range.logical_sector = logical_sector; + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + return true; + } + + return false; +} + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned journal_section, journal_entry; + unsigned journal_read_pos; + struct completion read_comp; + bool need_sync_io = ic->internal_hash && !dio->write; + + if (need_sync_io && from_map) { + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->metadata_wq, &dio->work); + return; + } + +lock_retry: + spin_lock_irq(&ic->endio_wait.lock); +retry: + if (unlikely(dm_integrity_failed(ic))) { + 
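
The internal-hash path used by integrity_metadata() and __journal_read_write() above always feeds the little-endian sector number into the hash ahead of the sector data, so a sector that is silently relocated fails verification even if its payload is intact. A stand-alone sketch of that framing, with a toy fold standing in for the real crypto_shash:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TOY_SECTOR_SIZE	512
#define TOY_TAG_SIZE	4

/* Toy 32-bit fold; NOT a real integrity hash, only a stand-in for the sketch. */
static uint32_t toy_fold(const uint8_t *data, size_t len, uint32_t acc)
{
	size_t i;

	for (i = 0; i < len; i++)
		acc = (acc * 33) ^ data[i];
	return acc;
}

/* tag = H(le64(sector) || data), truncated to the tag size. */
static void toy_sector_checksum(uint64_t sector, const uint8_t *data, uint8_t *tag)
{
	uint64_t sector_le = sector;	/* assume a little-endian host for the sketch */
	uint32_t h;

	h = toy_fold((const uint8_t *)&sector_le, sizeof(sector_le), 0x811c9dc5u);
	h = toy_fold(data, TOY_SECTOR_SIZE, h);
	memcpy(tag, &h, TOY_TAG_SIZE);
}

int main(void)
{
	uint8_t data[TOY_SECTOR_SIZE] = { 0 };
	uint8_t tag_at_10[TOY_TAG_SIZE], tag_at_11[TOY_TAG_SIZE];

	toy_sector_checksum(10, data, tag_at_10);
	toy_sector_checksum(11, data, tag_at_11);	/* same payload, different sector */

	printf("tags differ: %d\n", memcmp(tag_at_10, tag_at_11, TOY_TAG_SIZE) != 0);
	return 0;
}
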
spin_unlock_irq(&ic->endio_wait.lock); + do_endio(ic, bio); + return; + } + dio->range.n_sectors = bio_sectors(bio); + journal_read_pos = NOT_FOUND; + if (likely(ic->mode == 'J')) { + if (dio->write) { + unsigned next_entry, i, pos; + unsigned ws, we; + + dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + if (unlikely(!dio->range.n_sectors)) + goto sleep; + ic->free_sectors -= dio->range.n_sectors; + journal_section = ic->free_section; + journal_entry = ic->free_section_entry; + + next_entry = ic->free_section_entry + dio->range.n_sectors; + ic->free_section_entry = next_entry % ic->journal_section_entries; + ic->free_section += next_entry / ic->journal_section_entries; + ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; + wraparound_section(ic, &ic->free_section); + + pos = journal_section * ic->journal_section_entries + journal_entry; + ws = journal_section; + we = journal_entry; + i = 0; + do { + struct journal_entry *je; + + add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); + pos++; + if (unlikely(pos >= ic->journal_entries)) + pos = 0; + + je = access_journal_entry(ic, ws, we); + BUG_ON(!journal_entry_is_unused(je)); + journal_entry_set_inprogress(je); + we++; + if (unlikely(we == ic->journal_section_entries)) { + we = 0; + ws++; + wraparound_section(ic, &ws); + } + } while ((i += ic->sectors_per_block) < dio->range.n_sectors); + + spin_unlock_irq(&ic->endio_wait.lock); + goto journal_read_write; + } else { + sector_t next_sector; + journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (likely(journal_read_pos == NOT_FOUND)) { + if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector)) + dio->range.n_sectors = next_sector - dio->range.logical_sector; + } else { + unsigned i; + unsigned jp = journal_read_pos + 1; + for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) { + if (!test_journal_node(ic, jp, dio->range.logical_sector + i)) + break; + } + dio->range.n_sectors = i; + } + } + } + if (unlikely(!add_new_range(ic, &dio->range))) { + /* + * We must not sleep in the request routine because it could + * stall bios on current->bio_list. + * So, we offload the bio to a workqueue if we have to sleep. 
+ */ +sleep: + if (from_map) { + spin_unlock_irq(&ic->endio_wait.lock); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } else { + sleep_on_endio_wait(ic); + goto retry; + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(journal_read_pos != NOT_FOUND)) { + journal_section = journal_read_pos / ic->journal_section_entries; + journal_entry = journal_read_pos % ic->journal_section_entries; + goto journal_read_write; + } + + dio->in_flight = (atomic_t)ATOMIC_INIT(2); + + if (need_sync_io) { + read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp); + dio->completion = &read_comp; + } else + dio->completion = NULL; + + dio->orig_bi_iter = bio->bi_iter; + + dio->orig_bi_bdev = bio->bi_bdev; + bio->bi_bdev = ic->dev->bdev; + + dio->orig_bi_integrity = bio_integrity(bio); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; + + dio->orig_bi_end_io = bio->bi_end_io; + bio->bi_end_io = integrity_end_io; + + bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + bio->bi_iter.bi_sector += ic->start; + generic_make_request(bio); + + if (need_sync_io) { + wait_for_completion_io(&read_comp); + integrity_metadata(&dio->work); + } else { + INIT_WORK(&dio->work, integrity_metadata); + queue_work(ic->metadata_wq, &dio->work); + } + + return; + +journal_read_write: + if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry))) + goto lock_retry; + + do_endio_flush(ic, dio); +} + + +static void integrity_bio_wait(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + + dm_integrity_map_continue(dio, false); +} + +static void pad_uncommitted(struct dm_integrity_c *ic) +{ + if (ic->free_section_entry) { + ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry; + ic->free_section_entry = 0; + ic->free_section++; + wraparound_section(ic, &ic->free_section); + ic->n_uncommitted_sections++; + } +} + +static void integrity_commit(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work); + unsigned commit_start, commit_sections; + unsigned i, j, n; + struct bio *flushes; + + del_timer(&ic->autocommit_timer); + + spin_lock_irq(&ic->endio_wait.lock); + flushes = bio_list_get(&ic->flush_bio_list); + if (unlikely(ic->mode != 'J')) { + spin_unlock_irq(&ic->endio_wait.lock); + dm_integrity_flush_buffers(ic); + goto release_flush_bios; + } + + pad_uncommitted(ic); + commit_start = ic->uncommitted_section; + commit_sections = ic->n_uncommitted_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!commit_sections) + goto release_flush_bios; + + i = commit_start; + for (n = 0; n < commit_sections; n++) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je; + je = access_journal_entry(ic, i, j); + io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + } + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js; + js = access_journal(ic, i, j); + js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq); + } + i++; + if (unlikely(i >= ic->journal_sections)) + ic->commit_seq = next_commit_seq(ic->commit_seq); + wraparound_section(ic, &i); + } + smp_rmb(); + + write_journal(ic, commit_start, commit_sections); + + spin_lock_irq(&ic->endio_wait.lock); + ic->uncommitted_section += commit_sections; + wraparound_section(ic, &ic->uncommitted_section); + ic->n_uncommitted_sections -= commit_sections; + ic->n_committed_sections += 
commit_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + queue_work(ic->writer_wq, &ic->writer_work); + +release_flush_bios: + while (flushes) { + struct bio *next = flushes->bi_next; + flushes->bi_next = NULL; + do_endio(ic, flushes); + flushes = next; + } +} + +static void complete_copy_from_journal(unsigned long error, void *context) +{ + struct journal_io *io = context; + struct journal_completion *comp = io->comp; + struct dm_integrity_c *ic = comp->ic; + remove_range(ic, &io->range); + mempool_free(io, ic->journal_io_mempool); + if (unlikely(error != 0)) + dm_integrity_io_error(ic, "copying from journal", -EIO); + complete_journal_op(comp); +} + +static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js, + struct journal_entry *je) +{ + unsigned s = 0; + do { + js->commit_id = je->last_bytes[s]; + js++; + } while (++s < ic->sectors_per_block); +} + +static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, + unsigned write_sections, bool from_replay) +{ + unsigned i, j, n; + struct journal_completion comp; + + comp.ic = ic; + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + + i = write_start; + for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { +#ifndef INTERNAL_VERIFY + if (unlikely(from_replay)) +#endif + rw_section_mac(ic, i, false); + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + sector_t sec, area, offset; + unsigned k, l, next_loop; + sector_t metadata_block; + unsigned metadata_offset; + struct journal_io *io; + + if (journal_entry_is_unused(je)) + continue; + BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); + sec = journal_entry_get_sector(je); + if (unlikely(from_replay)) { + if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) { + dm_integrity_io_error(ic, "invalid sector in journal", -EIO); + sec &= ~(sector_t)(ic->sectors_per_block - 1); + } + } + get_area_and_offset(ic, sec, &area, &offset); + restore_last_bytes(ic, access_journal_data(ic, i, j), je); + for (k = j + 1; k < ic->journal_section_entries; k++) { + struct journal_entry *je2 = access_journal_entry(ic, i, k); + sector_t sec2, area2, offset2; + if (journal_entry_is_unused(je2)) + break; + BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); + sec2 = journal_entry_get_sector(je2); + get_area_and_offset(ic, sec2, &area2, &offset2); + if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) + break; + restore_last_bytes(ic, access_journal_data(ic, i, k), je2); + } + next_loop = k - 1; + + io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO); + io->comp = &comp; + io->range.logical_sector = sec; + io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; + + spin_lock_irq(&ic->endio_wait.lock); + while (unlikely(!add_new_range(ic, &io->range))) + sleep_on_endio_wait(ic); + + if (likely(!from_replay)) { + struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; + + /* don't write if there is newer committed sector */ + while (j < k && find_newer_committed_node(ic, &section_node[j])) { + struct journal_entry *je2 = access_journal_entry(ic, i, j); + + journal_entry_set_unused(je2); + remove_journal_node(ic, &section_node[j]); + j++; + sec += ic->sectors_per_block; + offset += ic->sectors_per_block; + } + while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) { + 
struct journal_entry *je2 = access_journal_entry(ic, i, k - 1); + + journal_entry_set_unused(je2); + remove_journal_node(ic, &section_node[k - 1]); + k--; + } + if (j == k) { + remove_range_unlocked(ic, &io->range); + spin_unlock_irq(&ic->endio_wait.lock); + mempool_free(io, ic->journal_io_mempool); + goto skip_io; + } + for (l = j; l < k; l++) { + remove_journal_node(ic, &section_node[l]); + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + for (l = j; l < k; l++) { + int r; + struct journal_entry *je2 = access_journal_entry(ic, i, l); + + if ( +#ifndef INTERNAL_VERIFY + unlikely(from_replay) && +#endif + ic->internal_hash) { + char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + + integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), + (char *)access_journal_data(ic, i, l), test_tag); + if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) + dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); + } + + journal_entry_set_unused(je2); + r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset, + ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading tags", r); + } + } + + atomic_inc(&comp.in_flight); + copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block, + (k - j) << ic->sb->log2_sectors_per_block, + get_data_sector(ic, area, offset), + complete_copy_from_journal, io); +skip_io: + j = next_loop; + } + } + + dm_bufio_write_dirty_buffers_async(ic->bufio); + + complete_journal_op(&comp); + wait_for_completion_io(&comp.comp); + + dm_integrity_flush_buffers(ic); +} + +static void integrity_writer(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work); + unsigned write_start, write_sections; + + unsigned prev_free_sectors; + + /* the following test is not needed, but it tests the replay code */ + if (ACCESS_ONCE(ic->suspending)) + return; + + spin_lock_irq(&ic->endio_wait.lock); + write_start = ic->committed_section; + write_sections = ic->n_committed_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!write_sections) + return; + + do_journal_write(ic, write_start, write_sections, false); + + spin_lock_irq(&ic->endio_wait.lock); + + ic->committed_section += write_sections; + wraparound_section(ic, &ic->committed_section); + ic->n_committed_sections -= write_sections; + + prev_free_sectors = ic->free_sectors; + ic->free_sectors += write_sections * ic->journal_section_entries; + if (unlikely(!prev_free_sectors)) + wake_up_locked(&ic->endio_wait); + + spin_unlock_irq(&ic->endio_wait.lock); +} + +static void init_journal(struct dm_integrity_c *ic, unsigned start_section, + unsigned n_sections, unsigned char commit_seq) +{ + unsigned i, j, n; + + if (!n_sections) + return; + + for (n = 0; n < n_sections; n++) { + i = start_section + n; + wraparound_section(ic, &i); + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + memset(&js->entries, 0, JOURNAL_SECTOR_DATA); + js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq); + } + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + journal_entry_set_unused(je); + } + } + + write_journal(ic, start_section, n_sections); +} + +static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id) +{ + unsigned 
char k; + for (k = 0; k < N_COMMIT_IDS; k++) { + if (dm_integrity_commit_id(ic, i, j, k) == id) + return k; + } + dm_integrity_io_error(ic, "journal commit id", -EIO); + return -EIO; +} + +static void replay_journal(struct dm_integrity_c *ic) +{ + unsigned i, j; + bool used_commit_ids[N_COMMIT_IDS]; + unsigned max_commit_id_sections[N_COMMIT_IDS]; + unsigned write_start, write_sections; + unsigned continue_section; + bool journal_empty; + unsigned char unused, last_used, want_commit_seq; + + if (ic->mode == 'R') + return; + + if (ic->journal_uptodate) + return; + + last_used = 0; + write_start = 0; + + if (!ic->just_formatted) { + DEBUG_print("reading journal\n"); + rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL); + if (ic->journal_io) + DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); + if (ic->journal_io) { + struct journal_completion crypt_comp; + crypt_comp.ic = ic; + crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp); + crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); + wait_for_completion(&crypt_comp.comp); + } + DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal"); + } + + if (dm_integrity_failed(ic)) + goto clear_journal; + + journal_empty = true; + memset(used_commit_ids, 0, sizeof used_commit_ids); + memset(max_commit_id_sections, 0, sizeof max_commit_id_sections); + for (i = 0; i < ic->journal_sections; i++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + int k; + struct journal_sector *js = access_journal(ic, i, j); + k = find_commit_seq(ic, i, j, js->commit_id); + if (k < 0) + goto clear_journal; + used_commit_ids[k] = true; + max_commit_id_sections[k] = i; + } + if (journal_empty) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + if (!journal_entry_is_unused(je)) { + journal_empty = false; + break; + } + } + } + } + + if (!used_commit_ids[N_COMMIT_IDS - 1]) { + unused = N_COMMIT_IDS - 1; + while (unused && !used_commit_ids[unused - 1]) + unused--; + } else { + for (unused = 0; unused < N_COMMIT_IDS; unused++) + if (!used_commit_ids[unused]) + break; + if (unused == N_COMMIT_IDS) { + dm_integrity_io_error(ic, "journal commit ids", -EIO); + goto clear_journal; + } + } + DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n", + unused, used_commit_ids[0], used_commit_ids[1], + used_commit_ids[2], used_commit_ids[3]); + + last_used = prev_commit_seq(unused); + want_commit_seq = prev_commit_seq(last_used); + + if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)]) + journal_empty = true; + + write_start = max_commit_id_sections[last_used] + 1; + if (unlikely(write_start >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &write_start); + + i = write_start; + for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + + if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) { + /* + * This could be caused by crash during writing. + * We won't replay the inconsistent part of the + * journal. 
+ */ + DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n", + i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq); + goto brk; + } + } + i++; + if (unlikely(i >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &i); + } +brk: + + if (!journal_empty) { + DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n", + write_sections, write_start, want_commit_seq); + do_journal_write(ic, write_start, write_sections, true); + } + + if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) { + continue_section = write_start; + ic->commit_seq = want_commit_seq; + DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq); + } else { + unsigned s; + unsigned char erase_seq; +clear_journal: + DEBUG_print("clearing journal\n"); + + erase_seq = prev_commit_seq(prev_commit_seq(last_used)); + s = write_start; + init_journal(ic, s, 1, erase_seq); + s++; + wraparound_section(ic, &s); + if (ic->journal_sections >= 2) { + init_journal(ic, s, ic->journal_sections - 2, erase_seq); + s += ic->journal_sections - 2; + wraparound_section(ic, &s); + init_journal(ic, s, 1, erase_seq); + } + + continue_section = 0; + ic->commit_seq = next_commit_seq(erase_seq); + } + + ic->committed_section = continue_section; + ic->n_committed_sections = 0; + + ic->uncommitted_section = continue_section; + ic->n_uncommitted_sections = 0; + + ic->free_section = continue_section; + ic->free_section_entry = 0; + ic->free_sectors = ic->journal_entries; + + ic->journal_tree_root = RB_ROOT; + for (i = 0; i < ic->journal_entries; i++) + init_journal_node(&ic->journal_tree[i]); +} + +static void dm_integrity_postsuspend(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + del_timer_sync(&ic->autocommit_timer); + + ic->suspending = true; + + queue_work(ic->commit_wq, &ic->commit_work); + drain_workqueue(ic->commit_wq); + + if (ic->mode == 'J') { + drain_workqueue(ic->writer_wq); + dm_integrity_flush_buffers(ic); + } + + ic->suspending = false; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + ic->journal_uptodate = true; +} + +static void dm_integrity_resume(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + replay_journal(ic); +} + +static void dm_integrity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + unsigned arg_count; + size_t sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + arg_count = 5; + arg_count += ic->sectors_per_block != 1; + arg_count += !!ic->internal_hash_alg.alg_string; + arg_count += !!ic->journal_crypt_alg.alg_string; + arg_count += !!ic->journal_mac_alg.alg_string; + DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, + ic->tag_size, ic->mode, arg_count); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + if 
(ic->sectors_per_block != 1) + DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); + +#define EMIT_ALG(a, n) \ + do { \ + if (ic->a.alg_string) { \ + DMEMIT(" %s:%s", n, ic->a.alg_string); \ + if (ic->a.key_string) \ + DMEMIT(":%s", ic->a.key_string);\ + } \ + } while (0) + EMIT_ALG(internal_hash_alg, "internal_hash"); + EMIT_ALG(journal_crypt_alg, "journal_crypt"); + EMIT_ALG(journal_mac_alg, "journal_mac"); + break; + } + } +} + +static int dm_integrity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_integrity_c *ic = ti->private; + + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); +} + +static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_integrity_c *ic = ti->private; + + if (ic->sectors_per_block > 1) { + limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); + } +} + +static void calculate_journal_section_size(struct dm_integrity_c *ic) +{ + unsigned sector_space = JOURNAL_SECTOR_DATA; + + ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); + ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size, + JOURNAL_ENTRY_ROUNDUP); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) + sector_space -= JOURNAL_MAC_PER_SECTOR; + ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; + ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; + ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS; + ic->journal_entries = ic->journal_section_entries * ic->journal_sections; +} + +static int calculate_device_limits(struct dm_integrity_c *ic) +{ + __u64 initial_sectors; + sector_t last_sector, last_area, last_offset; + + calculate_journal_section_size(ic); + initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) + return -EINVAL; + ic->initial_sectors = initial_sectors; + + ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; + + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + + if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) + return -EINVAL; + + return 0; +} + +static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) +{ + unsigned journal_sections; + int test_bit; + + memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); + memcpy(ic->sb->magic, SB_MAGIC, 8); + ic->sb->version = SB_VERSION; + ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); + ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); + if (ic->journal_mac_alg.alg_string) + ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); + + calculate_journal_section_size(ic); + journal_sections = journal_sectors / ic->journal_section_sectors; + if 
(!journal_sections) + journal_sections = 1; + ic->sb->journal_sections = cpu_to_le32(journal_sections); + + if (!interleave_sectors) + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + + if (!ic->provided_data_sectors) + return -EINVAL; + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + + return 0; +} + +static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) +{ + struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); + struct blk_integrity bi; + + memset(&bi, 0, sizeof(bi)); + bi.profile = &dm_integrity_profile; + bi.tuple_size = ic->tag_size; + bi.tag_size = bi.tuple_size; + bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, UINT_MAX); +} + +/* FIXME: use new kvmalloc */ +static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp) +{ + void *ptr = NULL; + + if (size <= PAGE_SIZE) + ptr = kmalloc(size, GFP_KERNEL | gfp); + if (!ptr && size <= KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp); + if (!ptr) + ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL); + + return ptr; +} + +static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +{ + unsigned i; + + if (!pl) + return; + for (i = 0; i < ic->journal_pages; i++) + if (pl[i].page) + __free_page(pl[i].page); + kvfree(pl); +} + +static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +{ + size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); + struct page_list *pl; + unsigned i; + + pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO); + if (!pl) + return NULL; + + for (i = 0; i < ic->journal_pages; i++) { + pl[i].page = alloc_page(GFP_KERNEL); + if (!pl[i].page) { + dm_integrity_free_page_list(ic, pl); + return NULL; + } + if (i) + pl[i - 1].next = &pl[i]; + } + + return pl; +} + +static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl) +{ + unsigned i; + for (i = 0; i < ic->journal_sections; i++) + kvfree(sl[i]); + kfree(sl); +} + +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +{ + struct scatterlist **sl; + unsigned i; + + sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO); + if (!sl) + return NULL; + + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist *s; + unsigned start_index, start_offset; + unsigned end_index, end_offset; + unsigned n_pages; + unsigned idx; + + page_list_location(ic, i, 0, &start_index, &start_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + + n_pages = (end_index - start_index + 1); + + s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0); + if (!s) { + dm_integrity_free_journal_scatterlist(ic, sl); + return NULL; + } + + 
sg_init_table(s, n_pages); + for (idx = start_index; idx <= end_index; idx++) { + char *va = lowmem_page_address(pl[idx].page); + unsigned start = 0, end = PAGE_SIZE; + if (idx == start_index) + start = start_offset; + if (idx == end_index) + end = end_offset + (1 << SECTOR_SHIFT); + sg_set_buf(&s[idx - start_index], va + start, end - start); + } + + sl[i] = s; + } + + return sl; +} + +static void free_alg(struct alg_spec *a) +{ + kzfree(a->alg_string); + kzfree(a->key); + memset(a, 0, sizeof *a); +} + +static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval) +{ + char *k; + + free_alg(a); + + a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL); + if (!a->alg_string) + goto nomem; + + k = strchr(a->alg_string, ':'); + if (k) { + *k = 0; + a->key_string = k + 1; + if (strlen(a->key_string) & 1) + goto inval; + + a->key_size = strlen(a->key_string) / 2; + a->key = kmalloc(a->key_size, GFP_KERNEL); + if (!a->key) + goto nomem; + if (hex2bin(a->key, a->key_string, a->key_size)) + goto inval; + } + + return 0; +inval: + *error = error_inval; + return -EINVAL; +nomem: + *error = "Out of memory for an argument"; + return -ENOMEM; +} + +static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, + char *error_alg, char *error_key) +{ + int r; + + if (a->alg_string) { + *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*hash)) { + *error = error_alg; + r = PTR_ERR(*hash); + *hash = NULL; + return r; + } + + if (a->key) { + r = crypto_shash_setkey(*hash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } + } + + return 0; +} + +static int create_journal(struct dm_integrity_c *ic, char **error) +{ + int r = 0; + unsigned i; + __u64 journal_pages, journal_desc_size, journal_tree_size; + unsigned char *crypt_data = NULL; + + ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); + ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); + ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); + ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); + + journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, + PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); + journal_desc_size = journal_pages * sizeof(struct page_list); + if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_pages = journal_pages; + + ic->journal = dm_integrity_alloc_page_list(ic); + if (!ic->journal) { + *error = "Could not allocate memory for journal"; + r = -ENOMEM; + goto bad; + } + if (ic->journal_crypt_alg.alg_string) { + unsigned ivsize, blocksize; + struct journal_completion comp; + + comp.ic = ic; + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); + if (IS_ERR(ic->journal_crypt)) { + *error = "Invalid journal cipher"; + r = PTR_ERR(ic->journal_crypt); + ic->journal_crypt = NULL; + goto bad; + } + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + blocksize = crypto_skcipher_blocksize(ic->journal_crypt); + + if (ic->journal_crypt_alg.key) { + r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, + ic->journal_crypt_alg.key_size); + if (r) { + *error = "Error setting encryption key"; + goto bad; + } + } + DEBUG_print("cipher %s, block size %u iv size %u\n", + ic->journal_crypt_alg.alg_string, blocksize, ivsize); + + ic->journal_io = dm_integrity_alloc_page_list(ic); + if (!ic->journal_io) { + 
*error = "Could not allocate memory for journal io"; + r = -ENOMEM; + goto bad; + } + + if (blocksize == 1) { + struct scatterlist *sg; + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_xor = dm_integrity_alloc_page_list(ic); + if (!ic->journal_xor) { + *error = "Could not allocate memory for journal xor"; + r = -ENOMEM; + goto bad; + } + + sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0); + if (!sg) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + sg_init_table(sg, ic->journal_pages + 1); + for (i = 0; i < ic->journal_pages; i++) { + char *va = lowmem_page_address(ic->journal_xor[i].page); + clear_page(va); + sg_set_buf(&sg[i], va, PAGE_SIZE); + } + sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); + memset(iv, 0x00, ivsize); + + skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + kvfree(sg); + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to encrypt journal"; + goto bad; + } + DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); + + crypto_free_skcipher(ic->journal_crypt); + ic->journal_crypt = NULL; + } else { + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + unsigned crypt_len = roundup(ivsize, blocksize); + + crypt_data = kmalloc(crypt_len, GFP_KERNEL); + if (!crypt_data) { + *error = "Unable to allocate crypt data"; + r = -ENOMEM; + goto bad; + } + + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); + if (!ic->journal_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); + if (!ic->journal_io_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); + if (!ic->sk_requests) { + *error = "Unable to allocate sk requests"; + r = -ENOMEM; + goto bad; + } + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist sg; + struct skcipher_request *section_req; + __u32 section_le = cpu_to_le32(i); + + memset(iv, 0x00, ivsize); + memset(crypt_data, 0x00, crypt_len); + memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); + + sg_init_one(&sg, crypt_data, crypt_len); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to generate iv"; + goto bad; + } + + section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!section_req) { + *error = "Unable to allocate crypt request"; + r = -ENOMEM; + goto bad; + } + section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); + if (!section_req->iv) { + skcipher_request_free(section_req); + *error = "Unable to allocate iv"; + r = -ENOMEM; + goto bad; + } + memcpy(section_req->iv + ivsize, crypt_data, ivsize); + section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; + 
ic->sk_requests[i] = section_req; + DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); + } + } + } + + for (i = 0; i < N_COMMIT_IDS; i++) { + unsigned j; +retest_commit_id: + for (j = 0; j < i; j++) { + if (ic->commit_ids[j] == ic->commit_ids[i]) { + ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); + goto retest_commit_id; + } + } + DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); + } + + journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); + if (journal_tree_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); + if (!ic->journal_tree) { + *error = "Could not allocate memory for journal tree"; + r = -ENOMEM; + } +bad: + kfree(crypt_data); + return r; +} + +/* + * Construct a integrity mapping + * + * Arguments: + * device + * offset from the start of the device + * tag size + * D - direct writes, J - journal writes, R - recovery mode + * number of optional arguments + * optional arguments: + * journal_sectors + * interleave_sectors + * buffer_sectors + * journal_watermark + * commit_time + * internal_hash + * journal_crypt + * journal_mac + * block_size + */ +static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_integrity_c *ic; + char dummy; + int r; + unsigned extra_args; + struct dm_arg_set as; + static struct dm_arg _args[] = { + {0, 9, "Invalid number of feature args"}, + }; + unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool should_write_sb; + __u64 threshold; + unsigned long long start; + +#define DIRECT_ARGUMENTS 4 + + if (argc <= DIRECT_ARGUMENTS) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL); + if (!ic) { + ti->error = "Cannot allocate integrity context"; + return -ENOMEM; + } + ti->private = ic; + ti->per_io_data_size = sizeof(struct dm_integrity_io); + + ic->in_progress = RB_ROOT; + init_waitqueue_head(&ic->endio_wait); + bio_list_init(&ic->flush_bio_list); + init_waitqueue_head(&ic->copy_to_journal_wait); + init_completion(&ic->crypto_backoff); + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { + ti->error = "Invalid starting offset"; + r = -EINVAL; + goto bad; + } + ic->start = start; + + if (strcmp(argv[2], "-")) { + if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) { + ti->error = "Invalid tag size"; + r = -EINVAL; + goto bad; + } + } + + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) + ic->mode = argv[3][0]; + else { + ti->error = "Invalid mode (expecting J, D, R)"; + r = -EINVAL; + goto bad; + } + + ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + buffer_sectors = DEFAULT_BUFFER_SECTORS; + journal_watermark = DEFAULT_JOURNAL_WATERMARK; + sync_msec = DEFAULT_SYNC_MSEC; + ic->sectors_per_block = 1; + + as.argc = argc - DIRECT_ARGUMENTS; + as.argv = argv + DIRECT_ARGUMENTS; + r = dm_read_arg_group(_args, &as, &extra_args, &ti->error); + if (r) + goto bad; + + while (extra_args--) { + const char *opt_string; + unsigned val; + 
opt_string = dm_shift_arg(&as); + if (!opt_string) { + r = -EINVAL; + ti->error = "Not enough feature arguments"; + goto bad; + } + if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) + journal_sectors = val; + else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) + interleave_sectors = val; + else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) + buffer_sectors = val; + else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100) + journal_watermark = val; + else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) + sync_msec = val; + else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { + if (val < 1 << SECTOR_SHIFT || + val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || + (val & (val -1))) { + r = -EINVAL; + ti->error = "Invalid block_size argument"; + goto bad; + } + ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { + r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, + "Invalid internal_hash argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) { + r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, + "Invalid journal_crypt argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + "Invalid journal_mac argument"); + if (r) + goto bad; + } else { + r = -EINVAL; + ti->error = "Invalid argument"; + goto bad; + } + } + + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + "Invalid internal hash", "Error setting internal hash key"); + if (r) + goto bad; + + r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + "Invalid journal mac", "Error setting journal mac key"); + if (r) + goto bad; + + if (!ic->tag_size) { + if (!ic->internal_hash) { + ti->error = "Unknown tag size"; + r = -EINVAL; + goto bad; + } + ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + } + if (ic->tag_size > MAX_TAG_SIZE) { + ti->error = "Too big tag size"; + r = -EINVAL; + goto bad; + } + if (!(ic->tag_size & (ic->tag_size - 1))) + ic->log2_tag_size = __ffs(ic->tag_size); + else + ic->log2_tag_size = -1; + + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); + ic->autocommit_msec = sync_msec; + setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); + + ic->io = dm_io_client_create(); + if (IS_ERR(ic->io)) { + r = PTR_ERR(ic->io); + ic->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + + ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache); + if (!ic->journal_io_mempool) { + r = -ENOMEM; + ti->error = "Cannot allocate mempool"; + goto bad; + } + + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", + WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->metadata_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + /* + * If this workqueue were percpu, it would cause bio reordering + * and reduced performance. 
+ */ + ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!ic->wait_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); + if (!ic->commit_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->commit_work, integrity_commit); + + if (ic->mode == 'J') { + ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); + if (!ic->writer_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->writer_work, integrity_writer); + } + + ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL); + if (!ic->sb) { + r = -ENOMEM; + ti->error = "Cannot allocate superblock area"; + goto bad; + } + + r = sync_rw_sb(ic, REQ_OP_READ, 0); + if (r) { + ti->error = "Error reading superblock"; + goto bad; + } + should_write_sb = false; + if (memcmp(ic->sb->magic, SB_MAGIC, 8)) { + if (ic->mode != 'R') { + if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; + } + } + + r = initialize_superblock(ic, journal_sectors, interleave_sectors); + if (r) { + ti->error = "Could not initialize superblock"; + goto bad; + } + if (ic->mode != 'R') + should_write_sb = true; + } + + if (ic->sb->version != SB_VERSION) { + r = -EINVAL; + ti->error = "Unknown version"; + goto bad; + } + if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { + r = -EINVAL; + ti->error = "Tag size doesn't match the information in superblock"; + goto bad; + } + if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) { + r = -EINVAL; + ti->error = "Block size doesn't match the information in superblock"; + goto bad; + } + /* make sure that ti->max_io_len doesn't overflow */ + if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); + if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { + /* test for overflow */ + r = -EINVAL; + ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; + goto bad; + } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal mac mismatch"; + goto bad; + } + r = calculate_device_limits(ic); + if (r) { + ti->error = "The device is too small"; + goto bad; + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); + + threshold = (__u64)ic->journal_entries * (100 - journal_watermark); + threshold += 50; + do_div(threshold, 100); + ic->free_sectors_threshold = threshold; + + DEBUG_print("initialized:\n"); + DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size)); + DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size); + DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector); + DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries); + DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors); + DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); + DEBUG_print(" 
journal_entries %u\n", ic->journal_entries); + DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); + DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); + DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); + DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, + (unsigned long long)ic->provided_data_sectors); + DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + + ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), + 1, 0, NULL, NULL); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); + + if (ic->mode != 'R') { + r = create_journal(ic, &ti->error); + if (r) + goto bad; + } + + if (should_write_sb) { + int r; + + init_journal(ic, 0, ic->journal_sections, 0); + r = dm_integrity_failed(ic); + if (unlikely(r)) { + ti->error = "Error initializing journal"; + goto bad; + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (r) { + ti->error = "Error initializing superblock"; + goto bad; + } + ic->just_formatted = true; + } + + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + + if (!ic->internal_hash) + dm_integrity_set(ti, ic); + + ti->num_flush_bios = 1; + ti->flush_supported = true; + + return 0; +bad: + dm_integrity_dtr(ti); + return r; +} + +static void dm_integrity_dtr(struct dm_target *ti) +{ + struct dm_integrity_c *ic = ti->private; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + if (ic->metadata_wq) + destroy_workqueue(ic->metadata_wq); + if (ic->wait_wq) + destroy_workqueue(ic->wait_wq); + if (ic->commit_wq) + destroy_workqueue(ic->commit_wq); + if (ic->writer_wq) + destroy_workqueue(ic->writer_wq); + if (ic->bufio) + dm_bufio_client_destroy(ic->bufio); + mempool_destroy(ic->journal_io_mempool); + if (ic->io) + dm_io_client_destroy(ic->io); + if (ic->dev) + dm_put_device(ti, ic->dev); + dm_integrity_free_page_list(ic, ic->journal); + dm_integrity_free_page_list(ic, ic->journal_io); + dm_integrity_free_page_list(ic, ic->journal_xor); + if (ic->journal_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); + if (ic->journal_io_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist); + if (ic->sk_requests) { + unsigned i; + + for (i = 0; i < ic->journal_sections; i++) { + struct skcipher_request *req = ic->sk_requests[i]; + if (req) { + kzfree(req->iv); + skcipher_request_free(req); + } + } + kvfree(ic->sk_requests); + } + kvfree(ic->journal_tree); + if (ic->sb) + free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); + + if (ic->internal_hash) + crypto_free_shash(ic->internal_hash); + free_alg(&ic->internal_hash_alg); + + if (ic->journal_crypt) + crypto_free_skcipher(ic->journal_crypt); + free_alg(&ic->journal_crypt_alg); + + if (ic->journal_mac) + crypto_free_shash(ic->journal_mac); + free_alg(&ic->journal_mac_alg); + + kfree(ic); +} + +static struct target_type integrity_target = { + .name = "integrity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, + .ctr = dm_integrity_ctr, + .dtr = dm_integrity_dtr, + .map = dm_integrity_map, + .postsuspend = dm_integrity_postsuspend, + 
.resume = dm_integrity_resume, + .status = dm_integrity_status, + .iterate_devices = dm_integrity_iterate_devices, + .io_hints = dm_integrity_io_hints, +}; + +int __init dm_integrity_init(void) +{ + int r; + + journal_io_cache = kmem_cache_create("integrity_journal_io", + sizeof(struct journal_io), 0, 0, NULL); + if (!journal_io_cache) { + DMERR("can't allocate journal io cache"); + return -ENOMEM; + } + + r = dm_register_target(&integrity_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +void dm_integrity_exit(void) +{ + dm_unregister_target(&integrity_target); + kmem_cache_destroy(journal_io_cache); +} + +module_init(dm_integrity_init); +module_exit(dm_integrity_exit); + +MODULE_AUTHOR("Milan Broz"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4da6fc6b1ffd..2d5d7064acbf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -37,14 +37,6 @@ struct hash_cell { struct dm_table *new_map; }; -/* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; @@ -1268,7 +1260,7 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } -static bool is_valid_type(unsigned cur, unsigned new) +static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) { if (cur == new || (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) @@ -1778,12 +1770,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param) cmd == DM_LIST_VERSIONS_CMD) return 0; - if ((cmd == DM_DEV_CREATE_CMD)) { + if (cmd == DM_DEV_CREATE_CMD) { if (!*param->name) { DMWARN("name not supplied when creating device"); return -EINVAL; } - } else if ((*param->uuid && *param->name)) { + } else if (*param->uuid && *param->name) { DMWARN("only supply one of name or uuid, cmd(%u)", cmd); return -EINVAL; } @@ -1848,7 +1840,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (r) goto out; - param->data_size = sizeof(*param); + param->data_size = offsetof(struct dm_ioctl, data); r = fn(param, input_param_size); if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index e17fd44ceef5..a5120961632a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -163,6 +163,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector, static struct target_type linear_target = { .name = "linear", .version = {1, 3, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2950b145443d..52cd3f1608b3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -90,7 +90,7 @@ struct multipath { atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - unsigned queue_mode; + enum dm_queue_mode queue_mode; struct mutex work_mutex; struct work_struct trigger_event; @@ -111,7 +111,8 @@ typedef int (*action_fn) (struct pgpath *pgpath); static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); -static void activate_path(struct work_struct *work); +static void activate_or_offline_path(struct pgpath *pgpath); 
+static void activate_path_work(struct work_struct *work); static void process_queued_bios(struct work_struct *work); /*----------------------------------------------- @@ -136,7 +137,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); } return pgpath; @@ -297,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m) struct pgpath *pgpath; unsigned long pg_init_delay = 0; + lockdep_assert_held(&m->lock); + if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) return 0; @@ -321,13 +324,16 @@ static int __pg_init_all_paths(struct multipath *m) return atomic_read(&m->pg_init_in_progress); } -static void pg_init_all_paths(struct multipath *m) +static int pg_init_all_paths(struct multipath *m) { + int ret; unsigned long flags; spin_lock_irqsave(&m->lock, flags); - __pg_init_all_paths(m); + ret = __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); + + return ret; } static void __switch_pg(struct multipath *m, struct priority_group *pg) @@ -436,45 +442,21 @@ failed: } /* - * Check whether bios must be queued in the device-mapper core rather - * than here in the target. - * - * If m->queue_if_no_path and m->saved_queue_if_no_path hold the - * same value then we are not between multipath_presuspend() - * and multipath_resume() calls and we have no need to check - * for the DMF_NOFLUSH_SUSPENDING flag. + * dm_report_EIO() is a macro instead of a function to make pr_debug() + * report the function name and line number of the function from which + * it has been invoked. */ -static bool __must_push_back(struct multipath *m) -{ - return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && - dm_noflush_suspending(m->ti)); -} - -static bool must_push_back_rq(struct multipath *m) -{ - bool r; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || - __must_push_back(m)); - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - -static bool must_push_back_bio(struct multipath *m) -{ - bool r; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - r = __must_push_back(m); - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} +#define dm_report_EIO(m) \ +({ \ + struct mapped_device *md = dm_table_get_md((m)->ti->table); \ + \ + pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ + dm_device_name(md), \ + test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ + dm_noflush_suspending((m)->ti)); \ + -EIO; \ +}) /* * Map cloned requests (request-based multipath) @@ -484,11 +466,11 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request **__clone) { struct multipath *m = ti->private; - int r = DM_MAPIO_REQUEUE; size_t nr_bytes = blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; struct dm_mpath_io *mpio = get_mpio(map_context); + struct request_queue *q; struct request *clone; /* Do we need to select a new pgpath? 
*/ @@ -497,13 +479,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (must_push_back_rq(m)) + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; - return -EIO; /* Failed */ + return dm_report_EIO(m); /* Failed */ } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - pg_init_all_paths(m); - return r; + if (pg_init_all_paths(m)) + return DM_MAPIO_DELAY_REQUEUE; + return DM_MAPIO_REQUEUE; } memset(mpio, 0, sizeof(*mpio)); @@ -511,13 +494,19 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - - clone = blk_get_request(bdev_get_queue(bdev), - rq->cmd_flags | REQ_NOMERGE, - GFP_ATOMIC); + q = bdev_get_queue(bdev); + clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - return r; + bool queue_dying = blk_queue_dying(q); + DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing", + PTR_ERR(clone), queue_dying ? " (path offline)" : ""); + if (queue_dying) { + atomic_inc(&m->pg_init_in_progress); + activate_or_offline_path(pgpath); + return DM_MAPIO_REQUEUE; + } + return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; @@ -567,9 +556,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } if (!pgpath) { - if (!must_push_back_bio(m)) - return -EIO; - return DM_MAPIO_REQUEUE; + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + return DM_MAPIO_REQUEUE; + return dm_report_EIO(m); } mpio->pgpath = pgpath; @@ -640,6 +629,14 @@ static void process_queued_bios(struct work_struct *work) blk_finish_plug(&plug); } +static void assign_bit(bool value, long nr, unsigned long *addr) +{ + if (value) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + /* * If we run out of usable paths, should we queue I/O or error it? 
*/ @@ -649,23 +646,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, unsigned long flags; spin_lock_irqsave(&m->lock, flags); - - if (save_old_value) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - } else { - if (queue_if_no_path) - set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - } - if (queue_if_no_path) - set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - + assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || + (!save_old_value && queue_if_no_path), + MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti), + MPATHF_QUEUE_IF_NO_PATH, &m->flags); spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1438,10 +1423,8 @@ out: spin_unlock_irqrestore(&m->lock, flags); } -static void activate_path(struct work_struct *work) +static void activate_or_offline_path(struct pgpath *pgpath) { - struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path.work); struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); if (pgpath->is_active && !blk_queue_dying(q)) @@ -1450,6 +1433,14 @@ static void activate_path(struct work_struct *work) pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); } +static void activate_path_work(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); + + activate_or_offline_path(pgpath); +} + static int noretry_error(int error) { switch (error) { @@ -1501,12 +1492,9 @@ static int do_end_io(struct multipath *m, struct request *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (!must_push_back_rq(m)) - r = -EIO; - } - } + if (atomic_read(&m->nr_valid_paths) == 0 && + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + r = dm_report_EIO(m); return r; } @@ -1547,13 +1535,9 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (!must_push_back_bio(m)) - return -EIO; - return DM_ENDIO_REQUEUE; - } - } + if (atomic_read(&m->nr_valid_paths) == 0 && + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + return dm_report_EIO(m); /* Queue for the daemon to resubmit */ dm_bio_restore(get_bio_details_from_bio(clone), clone); @@ -1619,10 +1603,8 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) - set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), + MPATHF_QUEUE_IF_NO_PATH, &m->flags); spin_unlock_irqrestore(&m->lock, flags); } @@ -1682,6 +1664,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; + default: + WARN_ON_ONCE(true); + break; } } } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 468f1380de1d..3a67073d9aa1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2010-2011 Neil Brown - * Copyright (C) 2010-2016 Red Hat, Inc. 
All rights reserved. + * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -79,7 +79,10 @@ struct raid_dev { #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ /* New for v1.10.0 */ -#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */ + +/* New for v1.11.1 */ +#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */ /* * Flags for rs->ctr_flags field. @@ -100,6 +103,7 @@ struct raid_dev { #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) +#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) @@ -175,7 +179,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ @@ -186,7 +191,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) /* ...valid options definitions per raid level */ /* @@ -239,6 +245,7 @@ struct raid_set { struct journal_dev { struct dm_dev *dev; struct md_rdev rdev; + int mode; } journal_dev; struct raid_dev dev[0]; @@ -326,6 +333,7 @@ static struct arg_name_flag { { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, + { CTR_FLAG_JOURNAL_MODE, "journal_mode" }, }; /* Return argument name string for given @flag */ @@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag) return NULL; } +/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */ +static struct { + const int mode; + const char *param; +} _raid456_journal_mode[] = { + { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" }, + { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" } +}; + +/* Return MD raid4/5/6 journal mode for dm @journal_mode one */ +static int dm_raid_journal_mode_to_md(const char *mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (!strcasecmp(mode, _raid456_journal_mode[m].param)) + return _raid456_journal_mode[m].mode; + + return -EINVAL; +} + +/* Return dm-raid raid4/5/6 journal mode string for @mode */ +static const char *md_journal_mode_to_dm_raid(const int mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (mode == _raid456_journal_mode[m].mode) + return _raid456_journal_mode[m].param; + + return "unknown"; +} + /* * Bool helpers to test for various raid levels of a raid set. 
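A quick sketch of how the two translation helpers above are meant to be used; the string values shown are the only entries in the mapping table:

	int md_mode = dm_raid_journal_mode_to_md("writeback");
	/* md_mode == R5C_JOURNAL_MODE_WRITE_BACK; unknown strings give -EINVAL */

	const char *arg = md_journal_mode_to_dm_raid(R5C_JOURNAL_MODE_WRITE_THROUGH);
	/* arg == "writethrough"; any unmapped mode comes back as "unknown" */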
* It's level as reported by the superblock rather than @@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } - /* "journal_dev dev" */ + /* "journal_dev " */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { int r; struct md_rdev *jdev; @@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, rs->ti->error = "No space for raid4/5/6 journal"; return -ENOSPC; } + rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH; set_bit(Journal, &jdev->flags); continue; } + /* "journal_mode " ("journal_dev" mandatory!) */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) { + int r; + + if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'"; + return -EINVAL; + } + if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed"; + return -EINVAL; + } + r = dm_raid_journal_mode_to_md(arg); + if (r < 0) { + rs->ti->error = "Invalid 'journal_mode' argument"; + return r; + } + rs->journal_dev.mode = r; + continue; + } + /* * Parameters with number values from here on. */ @@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + /* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */ + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); + if (r) { + ti->error = "Failed to set raid4/5/6 journal mode"; + mddev_unlock(&rs->md); + goto bad_journal_mode_set; + } + } + mddev_suspend(&rs->md); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_unlock(&rs->md); return 0; +bad_journal_mode_set: bad_stripe_cache: bad_check_reshape: md_stop(&rs->md); @@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev) * Status characters: * * 'D' = Dead/Failed raid set component or raid4/5/6 journal device - * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device + * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) { if (!rdev->bdev) return "-"; else if (test_bit(Faulty, &rdev->flags)) return "D"; else if (test_bit(Journal, &rdev->flags)) - return "A"; + return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* HM FIXME: do we want another state char for raid0? 
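With both journal arguments parsed above, a raid4/5/6 table line can request write-back caching explicitly. A hypothetical example, with device names, offsets and sizes as placeholders only:

	/*
	 * 0 41943040 raid raid5_ls 5 64 \
	 *     journal_dev /dev/sdj journal_mode writeback \
	 *     3 - /dev/sdb - /dev/sdc - /dev/sdd
	 */

Omitting journal_mode keeps the R5C_JOURNAL_MODE_WRITE_THROUGH default that parse_raid_params() assigns when only journal_dev is given.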
It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + - (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) + + (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); + /* Emit table line */ + /* This has to be in the documented order for userspace! */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); - if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) - DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), - raid10_md_layout_to_format(mddev->layout)); - if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), - raid10_md_layout_to_copies(mddev->layout)); - if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); - if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), - (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); - if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), - (unsigned long long) rs->data_offset); - if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) - DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), - mddev->bitmap_info.daemon_sleep); - if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), - max(rs->delta_disks, mddev->delta_disks)); - if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), - max_nr_stripes); + if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (rebuild_disks) for (i = 0; i < rs->raid_disks; i++) if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), rs->dev[i].rdev.raid_disk); + if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), + mddev->bitmap_info.daemon_sleep); + if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), + mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), + mddev->sync_speed_max); if (write_mostly_params) for (i = 0; i < rs->raid_disks; i++) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) @@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, if 
(test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), mddev->bitmap_info.max_write_behind); - if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), - mddev->sync_speed_max); - if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), - mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), + max_nr_stripes); + if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), + (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); + if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), + raid10_md_layout_to_copies(mddev->layout)); + if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), + raid10_md_layout_to_format(mddev->layout)); + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), + max(rs->delta_disks, mddev->delta_disks)); + if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), + (unsigned long long) rs->data_offset); if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), __get_dev_name(rs->journal_dev.dev)); + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE), + md_journal_mode_to_dm_raid(rs->journal_dev.mode)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 10, 1}, + .version = {1, 11, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index bff7e3bdb4ed..d445b712970b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -280,7 +280,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ if (!rq->q->mq_ops) dm_old_requeue_request(rq); else - dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0); + dm_mq_delay_requeue_request(rq, delay_requeue ? 
100/*ms*/ : 0); rq_completed(md, rw, false); } @@ -815,10 +815,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_dev(disk_to_dev(md->disk), q); + err = blk_mq_register_dev(disk_to_dev(md->disk), q); + if (err) + goto out_cleanup_queue; return 0; +out_cleanup_queue: + blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 5ef49c121d99..4b50ae115c6d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -442,6 +442,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", .version = {1, 6, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 958275aca008..5f5eae41f804 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -30,7 +30,7 @@ struct dm_table { struct mapped_device *md; - unsigned type; + enum dm_queue_mode type; /* btree table */ unsigned int depth; @@ -47,6 +47,7 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; bool all_blk_mq:1; + unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical @@ -372,7 +373,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, */ dev_t dm_get_dev_t(const char *path) { - dev_t uninitialized_var(dev); + dev_t dev; struct block_device *bdev; bdev = lookup_bdev(path); @@ -626,13 +627,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, struct dm_target *uninitialized_var(ti); struct queue_limits ti_limits; - unsigned i = 0; + unsigned i; /* * Check each entry in the table in turn. */ - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); + for (i = 0; i < dm_table_get_num_targets(table); i++) { + ti = dm_table_get_target(table, i); blk_set_stacking_limits(&ti_limits); @@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = tgt->type; } + if (dm_target_has_integrity(tgt->type)) + t->integrity_added = 1; + tgt->table = t; tgt->begin = start; tgt->len = len; @@ -821,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args) } EXPORT_SYMBOL(dm_consume_args); -static bool __table_type_bio_based(unsigned table_type) +static bool __table_type_bio_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_BIO_BASED || table_type == DM_TYPE_DAX_BIO_BASED); } -static bool __table_type_request_based(unsigned table_type) +static bool __table_type_request_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_REQUEST_BASED || table_type == DM_TYPE_MQ_REQUEST_BASED); } -void dm_table_set_type(struct dm_table *t, unsigned type) +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) { t->type = type; } @@ -850,11 +854,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_dax(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* Ensure that all targets support DAX. 
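The table walkers converted throughout dm-table.c all end up with the same shape; the idiom in isolation:

	unsigned i;
	struct dm_target *ti;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);
		/* per-target capability check or limit stacking */
	}

Assigning ti at the top of every iteration is also what lets the uninitialized_var() markers on the target pointer go away in the later dm-table.c hunks.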
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->type->direct_access) return false; @@ -875,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t) struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); - unsigned live_md_type = dm_get_md_type(t->md); + enum dm_queue_mode live_md_type = dm_get_md_type(t->md); if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ @@ -984,7 +988,7 @@ verify_rq_based: return 0; } -unsigned dm_table_get_type(struct dm_table *t) +enum dm_queue_mode dm_table_get_type(struct dm_table *t) { return t->type; } @@ -1006,11 +1010,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t) struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) { - struct dm_target *uninitialized_var(ti); - unsigned i = 0; + struct dm_target *ti; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (dm_target_is_wildcard(ti->type)) return ti; } @@ -1035,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { - unsigned type = dm_table_get_type(t); + enum dm_queue_mode type = dm_table_get_type(t); unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; @@ -1131,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; struct gendisk *prev_disk = NULL, *template_disk = NULL; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + struct dm_target *ti = dm_table_get_target(t, i); + if (!dm_target_passes_integrity(ti->type)) + goto no_integrity; + } list_for_each_entry(dd, devices, list) { template_disk = dd->dm_dev->bdev->bd_disk; @@ -1168,6 +1179,10 @@ static int dm_table_register_integrity(struct dm_table *t) struct mapped_device *md = t->md; struct gendisk *template_disk = NULL; + /* If target handles integrity itself do not register it here. 
*/ + if (t->integrity_added) + return 0; + template_disk = dm_table_get_integrity_disk(t); if (!template_disk) return 0; @@ -1313,15 +1328,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, */ bool dm_table_has_no_data_devices(struct dm_table *table) { - struct dm_target *uninitialized_var(ti); - unsigned i = 0, num_devices = 0; + struct dm_target *ti; + unsigned i, num_devices; - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); + for (i = 0; i < dm_table_get_num_targets(table); i++) { + ti = dm_table_get_target(table, i); if (!ti->type->iterate_devices) return false; + num_devices = 0; ti->type->iterate_devices(ti, count_device, &num_devices); if (num_devices) return false; @@ -1336,16 +1352,16 @@ bool dm_table_has_no_data_devices(struct dm_table *table) int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits) { - struct dm_target *uninitialized_var(ti); + struct dm_target *ti; struct queue_limits ti_limits; - unsigned i = 0; + unsigned i; blk_set_stacking_limits(limits); - while (i < dm_table_get_num_targets(table)) { + for (i = 0; i < dm_table_get_num_targets(table); i++) { blk_set_stacking_limits(&ti_limits); - ti = dm_table_get_target(table, i++); + ti = dm_table_get_target(table, i); if (!ti->type->iterate_devices) goto combine_limits; @@ -1394,6 +1410,9 @@ static void dm_table_verify_integrity(struct dm_table *t) { struct gendisk *template_disk = NULL; + if (t->integrity_added) + return; + if (t->integrity_supported) { /* * Verify that the original integrity profile @@ -1424,7 +1443,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* * Require at least one underlying device to support flushes. @@ -1432,8 +1451,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) * so we need to use iterate_devices here, which targets * supporting flushes must provide. 
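The flush and discard checks above depend on targets implementing .iterate_devices; a callout handed to it follows the iterate_devices_callout_fn signature from device-mapper.h. A hypothetical example that tests an arbitrary queue flag (sketch only, not driver code):

	/* true if the underlying queue has the flag number passed in 'data' */
	static int device_has_queue_flag(struct dm_target *ti, struct dm_dev *dev,
					 sector_t start, sector_t len, void *data)
	{
		unsigned long flag = (unsigned long) data;
		struct request_queue *q = bdev_get_queue(dev->bdev);

		return q && test_bit(flag, &q->queue_flags);
	}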
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_flush_bios) continue; @@ -1477,10 +1496,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { struct dm_target *ti; - unsigned i = 0; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices || !ti->type->iterate_devices(ti, func, NULL)) @@ -1501,10 +1520,10 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de static bool dm_table_supports_write_same(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_write_same_bios) return false; @@ -1556,7 +1575,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_discards(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* * Unless any target used by the table set discards_supported, @@ -1565,8 +1584,8 @@ static bool dm_table_supports_discards(struct dm_table *t) * so we need to use iterate_devices here, which targets * supporting discard selectively must provide. */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) continue; @@ -1672,6 +1691,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode) int i = t->num_targets; struct dm_target *ti = t->targets; + lockdep_assert_held(&t->md->suspend_lock); + while (i--) { switch (mode) { case PRESUSPEND: @@ -1719,6 +1740,8 @@ int dm_table_resume_targets(struct dm_table *t) { int i, r = 0; + lockdep_assert_held(&t->md->suspend_lock); + for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = t->targets + i; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index a15091a0d40c..0f0251d0d337 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -77,7 +77,6 @@ #define THIN_SUPERBLOCK_MAGIC 27022010 #define THIN_SUPERBLOCK_LOCATION 0 #define THIN_VERSION 2 -#define THIN_METADATA_CACHE_SIZE 64 #define SECTOR_TO_BLOCK_SHIFT 3 /* @@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f int r; pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, - THIN_METADATA_CACHE_SIZE, THIN_MAX_CONCURRENT_LOCKS); if (IS_ERR(pmd->bm)) { DMERR("could not create block manager"); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a5f1916f621a..17ad50daed08 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -5,7 +5,7 @@ */ #include "dm-thin-metadata.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v1.h" #include "dm.h" #include @@ -1069,6 +1069,7 @@ static void passdown_endio(struct bio *bio) * to unmap (we ignore err). 
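suspend_targets() and dm_table_resume_targets() now state their locking contract with lockdep instead of a comment, and the dm.c hunks below do the same for md->suspend_lock users. The pattern, with a hypothetical helper name:

	static void some_suspend_path_helper(struct mapped_device *md)
	{
		/* triggers a WARN when lockdep is enabled and the caller
		 * does not hold md->suspend_lock */
		lockdep_assert_held(&md->suspend_lock);

		/* ... work that assumes a quiesced table ... */
	}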
*/ queue_passdown_pt2(bio->bi_private); + bio_put(bio); } static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0f0eb8a3d922..dab98fee0754 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -188,7 +188,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { - if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + if (unlikely(verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io)))) return 0; @@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + r = verity_hash(v, verity_io_hash_req(v, io), fio->output, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io)); if (unlikely(r < 0)) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7335d8a3fc47..97de961a3bfc 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, } /* - * Wrapper for crypto_shash_init, which handles verity salting. + * Callback function for asynchrnous crypto API completion notification */ -static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +static void verity_op_done(struct crypto_async_request *base, int err) +{ + struct verity_result *res = (struct verity_result *)base->data; + + if (err == -EINPROGRESS) + return; + + res->err = err; + complete(&res->completion); +} + +/* + * Wait for async crypto API callback + */ +static inline int verity_complete_op(struct verity_result *res, int ret) +{ + switch (ret) { + case 0: + break; + + case -EINPROGRESS: + case -EBUSY: + ret = wait_for_completion_interruptible(&res->completion); + if (!ret) + ret = res->err; + reinit_completion(&res->completion); + break; + + default: + DMERR("verity_wait_hash: crypto op submission failed: %d", ret); + } + + if (unlikely(ret < 0)) + DMERR("verity_wait_hash: crypto op failed: %d", ret); + + return ret; +} + +static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, + const u8 *data, size_t len, + struct verity_result *res) +{ + struct scatterlist sg; + + sg_init_one(&sg, data, len); + ahash_request_set_crypt(req, &sg, NULL, len); + + return verity_complete_op(res, crypto_ahash_update(req)); +} + +/* + * Wrapper for crypto_ahash_init, which handles verity salting. 
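Strung together, the ahash-based helpers above and below boil down to the usual asynchronous-hash sequence. A condensed sketch of one full digest computation, assuming req, res, a local struct scatterlist sg, and data/len/digest are set up as in the driver:

	ahash_request_set_tfm(req, v->tfm);
	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
				   CRYPTO_TFM_REQ_MAY_BACKLOG,
				   verity_op_done, &res);
	init_completion(&res.completion);

	r = verity_complete_op(&res, crypto_ahash_init(req));	/* + salt for v1 */

	sg_init_one(&sg, data, len);
	ahash_request_set_crypt(req, &sg, NULL, len);
	r = verity_complete_op(&res, crypto_ahash_update(req));

	ahash_request_set_crypt(req, NULL, digest, 0);
	r = verity_complete_op(&res, crypto_ahash_final(req));

Each step either completes synchronously (return 0) or comes back through verity_op_done(), which verity_complete_op() waits for before reusing the completion.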
+ */ +static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, + struct verity_result *res) { int r; - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_set_tfm(req, v->tfm); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + verity_op_done, (void *)res); + init_completion(&res->completion); - r = crypto_shash_init(desc); + r = verity_complete_op(res, crypto_ahash_init(req)); if (unlikely(r < 0)) { - DMERR("crypto_shash_init failed: %d", r); + DMERR("crypto_ahash_init failed: %d", r); return r; } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - - if (unlikely(r < 0)) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - return 0; -} - -static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, - const u8 *data, size_t len) -{ - int r = crypto_shash_update(desc, data, len); - - if (unlikely(r < 0)) - DMERR("crypto_shash_update failed: %d", r); + if (likely(v->version >= 1)) + r = verity_hash_update(v, req, v->salt, v->salt_size, res); return r; } -static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, - u8 *digest) +static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, + u8 *digest, struct verity_result *res) { int r; if (unlikely(!v->version)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); + r = verity_hash_update(v, req, v->salt, v->salt_size, res); if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; + DMERR("verity_hash_final failed updating salt: %d", r); + goto out; } } - r = crypto_shash_final(desc, digest); - - if (unlikely(r < 0)) - DMERR("crypto_shash_final failed: %d", r); - + ahash_request_set_crypt(req, NULL, digest, 0); + r = verity_complete_op(res, crypto_ahash_final(req)); +out: return r; } -int verity_hash(struct dm_verity *v, struct shash_desc *desc, +int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest) { int r; + struct verity_result res; - r = verity_hash_init(v, desc); + r = verity_hash_init(v, req, &res); if (unlikely(r < 0)) - return r; + goto out; - r = verity_hash_update(v, desc, data, len); + r = verity_hash_update(v, req, data, len, &res); if (unlikely(r < 0)) - return r; + goto out; - return verity_hash_final(v, desc, digest); + r = verity_hash_final(v, req, digest, &res); + +out: + return r; } static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, @@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, verity_io_hash_desc(v, io), + r = verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->hash_dev_block_bits, verity_io_real_digest(v, io)); if (unlikely(r < 0)) @@ -343,6 +385,49 @@ out: return r; } +/* + * Calculates the digest for the given bio + */ +int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, struct verity_result *res) +{ + unsigned int todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + struct scatterlist sg; + struct ahash_request *req = verity_io_hash_req(v, io); + + do { + int r; + unsigned int len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + sg_init_table(&sg, 1); + + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + /* + * Operating on a single page at a time looks suboptimal + * until you consider the typical block size is 
4,096B. + * Going through this loops twice should be very rare. + */ + sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); + ahash_request_set_crypt(req, &sg, NULL, len); + r = verity_complete_op(res, crypto_ahash_update(req)); + + if (unlikely(r < 0)) { + DMERR("verity_for_io_block crypto op failed: %d", r); + return r; + } + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + /* * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec * starting from iter. @@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } -static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); -} - static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len) { @@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io) struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; + struct verity_result res; for (b = 0; b < io->n_blocks; b++) { int r; - struct shash_desc *desc = verity_io_hash_desc(v, io); + struct ahash_request *req = verity_io_hash_req(v, io); r = verity_hash_for_block(v, io, io->block + b, verity_io_want_digest(v, io), @@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, desc); + r = verity_hash_init(v, req, &res); if (unlikely(r < 0)) return r; start = io->iter; - r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + r = verity_for_io_block(v, io, &io->iter, &res); if (unlikely(r < 0)) return r; - r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); + r = verity_hash_final(v, req, verity_io_real_digest(v, io), + &res); if (unlikely(r < 0)) return r; @@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->zero_digest); if (v->tfm) - crypto_free_shash(v->tfm); + crypto_free_ahash(v->tfm); kfree(v->alg_name); @@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti) static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; - struct shash_desc *desc; + struct ahash_request *req; u8 *zero_data; v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); @@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - desc = kmalloc(v->shash_descsize, GFP_KERNEL); + req = kmalloc(v->ahash_reqsize, GFP_KERNEL); - if (!desc) + if (!req) return r; /* verity_dtr will free zero_digest */ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); @@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!zero_data) goto out; - r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, v->zero_digest); out: - kfree(desc); + kfree(req); kfree(zero_data); return r; @@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); + v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0); if (IS_ERR(v->tfm)) { ti->error = "Cannot initialize hash function"; r = PTR_ERR(v->tfm); v->tfm = NULL; goto bad; } - v->digest_size = crypto_shash_digestsize(v->tfm); + v->digest_size = crypto_ahash_digestsize(v->tfm); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; r = -EINVAL; goto bad; } - v->shash_descsize = - sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); 
+ v->ahash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(v->tfm); v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->root_digest) { @@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->per_io_data_size = sizeof(struct dm_verity_io) + - v->shash_descsize + v->digest_size * 2; + v->ahash_reqsize + v->digest_size * 2; r = verity_fec_ctr(v); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..a59e0ada6fd3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -37,7 +37,7 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_shash *tfm; + struct crypto_ahash *tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ u8 *zero_digest; /* digest for a zero block */ @@ -52,7 +52,7 @@ struct dm_verity { unsigned char levels; /* the number of tree levels */ unsigned char version; unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ + unsigned int ahash_reqsize;/* the size of temporary space for crypto */ int hash_failed; /* set to 1 if hash of any block failed */ enum verity_mode mode; /* mode for handling verification errors */ unsigned corrupted_errs;/* Number of errors for corrupted blocks */ @@ -81,31 +81,36 @@ struct dm_verity_io { /* * Three variably-size fields follow this struct: * - * u8 hash_desc[v->shash_descsize]; + * u8 hash_req[v->ahash_reqsize]; * u8 real_digest[v->digest_size]; * u8 want_digest[v->digest_size]; * - * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * To access them use: verity_io_hash_req(), verity_io_real_digest() * and verity_io_want_digest(). 
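The three trailing fields sit back to back behind struct dm_verity_io, which is why verity_ctr() sizes per_io_data_size as sizeof(struct dm_verity_io) + v->ahash_reqsize + v->digest_size * 2. An illustrative layout:

	/*
	 *  (u8 *)(io + 1)
	 *  |<-- ahash_reqsize -->|<-- digest_size -->|<-- digest_size -->|
	 *     ahash_request           real_digest         want_digest
	 */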
*/ }; -static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, +struct verity_result { + struct completion completion; + int err; +}; + +static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, struct dm_verity_io *io) { - return (struct shash_desc *)(io + 1); + return (struct ahash_request *)(io + 1); } static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->shash_descsize; + return (u8 *)(io + 1) + v->ahash_reqsize; } static inline u8 *verity_io_want_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; + return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; } static inline u8 *verity_io_digest_end(struct dm_verity *v, @@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_io *io, u8 *data, size_t len)); -extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, +extern int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8bf397729bbd..268edf402bbb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1104,8 +1104,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) { - int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (unlikely(bio_integrity(bio) != NULL)) { + int r; + + if (unlikely(!dm_target_has_integrity(tio->ti->type) && + !dm_target_passes_integrity(tio->ti->type))) { + DMWARN("%s: the target %s doesn't support integrity data.", + dm_device_name(tio->io->md), + tio->ti->type->name); + return -EIO; + } + + r = bio_integrity_clone(clone, bio, GFP_NOIO); if (r < 0) return r; } @@ -1113,7 +1123,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); - if (bio_integrity(bio)) + if (unlikely(bio_integrity(bio) != NULL)) bio_integrity_trim(clone, 0, len); return 0; @@ -1715,6 +1725,8 @@ static void event_callback(void *context) */ static void __set_size(struct mapped_device *md, sector_t size) { + lockdep_assert_held(&md->suspend_lock); + set_capacity(md->disk, size); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); @@ -1822,13 +1834,13 @@ void dm_unlock_md_type(struct mapped_device *md) mutex_unlock(&md->type_lock); } -void dm_set_md_type(struct mapped_device *md, unsigned type) +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) { BUG_ON(!mutex_is_locked(&md->type_lock)); md->type = type; } -unsigned dm_get_md_type(struct mapped_device *md) +enum dm_queue_mode dm_get_md_type(struct mapped_device *md) { return md->type; } @@ -1855,7 +1867,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; - unsigned type = dm_get_md_type(md); + enum dm_queue_mode type = dm_get_md_type(md); switch (type) { case DM_TYPE_REQUEST_BASED: @@ -1886,6 +1898,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (type == DM_TYPE_DAX_BIO_BASED) queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); break; + case DM_TYPE_NONE: + WARN_ON_ONCE(true); + break; } return 0; @@ -2164,8 +2179,6 @@ static void unlock_fs(struct mapped_device *md) * If __dm_suspend returns 0, the device is 
completely quiescent * now. There is no request-processing activity. All new requests * are being added to md->deferred list. - * - * Caller must hold md->suspend_lock */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, unsigned suspend_flags, long task_state, @@ -2183,6 +2196,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, */ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + else + pr_debug("%s: suspending with flush\n", dm_device_name(md)); /* * This gets reverted if there's an error later and the targets @@ -2381,6 +2396,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla { struct dm_table *map = NULL; + lockdep_assert_held(&md->suspend_lock); + if (md->internal_suspend_count++) return; /* nested internal suspend */ @@ -2571,7 +2588,7 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_io_data_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f298b01f7ab3..38c84c0a35d4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -unsigned dm_table_get_type(struct dm_table *t); +enum dm_queue_mode dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); @@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); -void dm_set_md_type(struct mapped_device *md, unsigned type); -unsigned dm_get_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type); +enum dm_queue_mode dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); @@ -204,7 +204,7 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 8589e0a14068..ea15d220ced7 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -378,7 +378,6 @@ struct dm_block_manager { struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, unsigned block_size, - unsigned cache_size, unsigned max_held_per_thread) { int r; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 3627d1b7667a..e728937f376a 100644 --- a/drivers/md/persistent-data/dm-block-manager.h 
+++ b/drivers/md/persistent-data/dm-block-manager.h @@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b); struct dm_block_manager; struct dm_block_manager *dm_block_manager_create( struct block_device *bdev, unsigned block_size, - unsigned cache_size, unsigned max_held_per_thread); + unsigned max_held_per_thread); void dm_block_manager_destroy(struct dm_block_manager *bm); unsigned dm_bm_block_size(struct dm_block_manager *bm); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 02e2ee0d8a00..f21ce6a3d4cf 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -902,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, else *result_key = le64_to_cpu(ro_node(s)->keys[0]); - if (next_block || flags & INTERNAL_NODE) - block = value64(ro_node(s), i); + if (next_block || flags & INTERNAL_NODE) { + if (find_highest) + block = value64(ro_node(s), i); + else + block = value64(ro_node(s), 0); + } } while (flags & INTERNAL_NODE); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3f307be01b10..218b6f37da85 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -53,16 +53,6 @@ */ #define R5L_POOL_SIZE 4 -/* - * r5c journal modes of the array: write-back or write-through. - * write-through mode has identical behavior as existing log only - * implementation. - */ -enum r5c_journal_mode { - R5C_JOURNAL_MODE_WRITE_THROUGH = 0, - R5C_JOURNAL_MODE_WRITE_BACK = 1, -}; - static char *r5c_journal_mode_str[] = {"write-through", "write-back"}; /* @@ -2327,40 +2317,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) return ret; } -static ssize_t r5c_journal_mode_store(struct mddev *mddev, - const char *page, size_t length) +/* + * Set journal cache mode on @mddev (external API initially needed by dm-raid). + * + * @mode as defined in 'enum r5c_journal_mode'. 
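From the caller's side (dm-raid in raid_ctr() being the first external user), usage of the new API reduces to the following, given a struct mddev *mddev:

	int r = r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);
	if (r)
		return r;	/* -ENODEV without a journal, -EINVAL for a bad
				 * mode or a degraded array in write-back */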
+ * + */ +int r5c_journal_mode_set(struct mddev *mddev, int mode) { struct r5conf *conf = mddev->private; struct r5l_log *log = conf->log; - int val = -1, i; - int len = length; if (!log) return -ENODEV; - if (len && page[len - 1] == '\n') - len -= 1; - for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) - if (strlen(r5c_journal_mode_str[i]) == len && - strncmp(page, r5c_journal_mode_str[i], len) == 0) { - val = i; - break; - } - if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || - val > R5C_JOURNAL_MODE_WRITE_BACK) + if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || + mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; if (raid5_calc_degraded(conf) > 0 && - val == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; mddev_suspend(mddev); - conf->log->r5c_journal_mode = val; + conf->log->r5c_journal_mode = mode; mddev_resume(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", - mdname(mddev), val, r5c_journal_mode_str[val]); - return length; + mdname(mddev), mode, r5c_journal_mode_str[mode]); + return 0; +} +EXPORT_SYMBOL(r5c_journal_mode_set); + +static ssize_t r5c_journal_mode_store(struct mddev *mddev, + const char *page, size_t length) +{ + int mode = ARRAY_SIZE(r5c_journal_mode_str); + size_t len = length; + + if (len < 2) + return -EINVAL; + + if (page[len - 1] == '\n') + len--; + + while (mode--) + if (strlen(r5c_journal_mode_str[mode]) == len && + !strncmp(page, r5c_journal_mode_str[mode], len)) + break; + + return r5c_journal_mode_set(mddev, mode) ?: length; } struct md_sysfs_entry diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4bb27b97bf6b..ec8ca15774d7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -547,6 +547,16 @@ struct r5worker_group { int stripes_cnt; }; +/* + * r5c journal modes of the array: write-back or write-through. + * write-through mode has identical behavior as existing log only + * implementation. + */ +enum r5c_journal_mode { + R5C_JOURNAL_MODE_WRITE_THROUGH = 0, + R5C_JOURNAL_MODE_WRITE_BACK = 1, +}; + enum r5_cache_state { R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, * waiting for 25% to be free @@ -795,4 +805,5 @@ extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c7ea33e38fb9..925b63cdef52 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -22,11 +22,13 @@ struct bio_vec; /* * Type of table, mapped_device's mempool and request_queue */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 -#define DM_TYPE_MQ_REQUEST_BASED 3 -#define DM_TYPE_DAX_BIO_BASED 4 +enum dm_queue_mode { + DM_TYPE_NONE = 0, + DM_TYPE_BIO_BASED = 1, + DM_TYPE_REQUEST_BASED = 2, + DM_TYPE_MQ_REQUEST_BASED = 3, + DM_TYPE_DAX_BIO_BASED = 4, +}; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; @@ -221,6 +223,18 @@ struct target_type { */ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); +/* + * A target implements own bio data integrity. + */ +#define DM_TARGET_INTEGRITY 0x00000010 +#define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) + +/* + * A target passes integrity data to the lower device. 
+ */ +#define DM_TARGET_PASSES_INTEGRITY 0x00000020 +#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; @@ -465,7 +479,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback * Useful for "hybrid" target (supports both bio-based * and request-based). */ -void dm_table_set_type(struct dm_table *t, unsigned type); +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); /* * Finally call this to make the table ready for use.
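Targets advertise the new integrity capabilities through target_type.features, exactly as the dm-stripe hunk earlier in this series does. A minimal sketch for a hypothetical pass-through target:

	static struct target_type example_target = {
		.name     = "example",	/* hypothetical */
		.version  = {1, 0, 0},
		.features = DM_TARGET_PASSES_INTEGRITY,
		.module   = THIS_MODULE,
		/* .ctr, .dtr, .map, ... as for any other target */
	};

A target that generates and checks its own tags (dm-crypt in this series) sets DM_TARGET_INTEGRITY instead, which makes dm_table_add_target() mark the table as integrity_added and skip block-layer integrity profile registration.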