diff --git a/block.c b/block.c index 2ba76b2c36..9538af4884 100644 --- a/block.c +++ b/block.c @@ -712,11 +712,12 @@ int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp) int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) { BlockDriver *drv = bs->drv; + BlockDriverState *filtered = bdrv_filter_bs(bs); if (drv && drv->bdrv_probe_blocksizes) { return drv->bdrv_probe_blocksizes(bs, bsz); - } else if (drv && drv->is_filter && bs->file) { - return bdrv_probe_blocksizes(bs->file->bs, bsz); + } else if (filtered) { + return bdrv_probe_blocksizes(filtered, bsz); } return -ENOTSUP; @@ -731,11 +732,12 @@ int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo) { BlockDriver *drv = bs->drv; + BlockDriverState *filtered = bdrv_filter_bs(bs); if (drv && drv->bdrv_probe_geometry) { return drv->bdrv_probe_geometry(bs, geo); - } else if (drv && drv->is_filter && bs->file) { - return bdrv_probe_geometry(bs->file->bs, geo); + } else if (filtered) { + return bdrv_probe_geometry(filtered, geo); } return -ENOTSUP; @@ -1153,10 +1155,6 @@ static void bdrv_backing_attach(BdrvChild *c) bdrv_refresh_filename(backing_hd); parent->open_flags &= ~BDRV_O_NO_BACKING; - pstrcpy(parent->backing_file, sizeof(parent->backing_file), - backing_hd->filename); - pstrcpy(parent->backing_format, sizeof(parent->backing_format), - backing_hd->drv ? backing_hd->drv->format_name : ""); bdrv_op_block_all(backing_hd, parent->backing_blocker); /* Otherwise we won't be able to commit or stream */ @@ -2612,12 +2610,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child, * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this * function uses bdrv_set_perm() to update the permissions according to the new * reference that @new_bs gets. + * + * Callers must ensure that child->frozen is false. */ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs) { BlockDriverState *old_bs = child->bs; uint64_t perm, shared_perm; + /* Asserts that child->frozen == false */ bdrv_replace_child_noperm(child, new_bs); /* @@ -2778,6 +2779,7 @@ static void bdrv_detach_child(BdrvChild *child) g_free(child); } +/* Callers must ensure that child->frozen is false. */ void bdrv_root_unref_child(BdrvChild *child) { BlockDriverState *child_bs; @@ -2815,6 +2817,7 @@ static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child) } } +/* Callers must ensure that child->frozen is false. */ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) { if (child == NULL) { @@ -2863,7 +2866,7 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs) } /* - * Sets the backing file link of a BDS. A new reference is created; callers + * Sets the bs->backing link of a BDS. A new reference is created; callers * which don't need their own reference any more must call bdrv_unref(). */ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, @@ -2872,7 +2875,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) && bdrv_inherits_from_recursive(backing_hd, bs); - if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) { + if (bdrv_is_backing_chain_frozen(bs, child_bs(bs->backing), errp)) { return; } @@ -2881,6 +2884,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, } if (bs->backing) { + /* Cannot be frozen, we checked that above */ bdrv_unref_child(bs, bs->backing); bs->backing = NULL; } @@ -3996,7 +4000,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, Error **errp) { BlockDriverState *bs = reopen_state->bs; - BlockDriverState *overlay_bs, *new_backing_bs; + BlockDriverState *overlay_bs, *below_bs, *new_backing_bs; QObject *value; const char *str; @@ -4035,26 +4039,57 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, } } + /* + * Ensure that @bs can really handle backing files, because we are + * about to give it one (or swap the existing one) + */ + if (bs->drv->is_filter) { + /* Filters always have a file or a backing child */ + if (!bs->backing) { + error_setg(errp, "'%s' is a %s filter node that does not support a " + "backing child", bs->node_name, bs->drv->format_name); + return -EINVAL; + } + } else if (!bs->drv->supports_backing) { + error_setg(errp, "Driver '%s' of node '%s' does not support backing " + "files", bs->drv->format_name, bs->node_name); + return -EINVAL; + } + /* * Find the "actual" backing file by skipping all links that point * to an implicit node, if any (e.g. a commit filter node). + * We cannot use any of the bdrv_skip_*() functions here because + * those return the first explicit node, while we are looking for + * its overlay here. */ overlay_bs = bs; - while (backing_bs(overlay_bs) && backing_bs(overlay_bs)->implicit) { - overlay_bs = backing_bs(overlay_bs); + for (below_bs = bdrv_filter_or_cow_bs(overlay_bs); + below_bs && below_bs->implicit; + below_bs = bdrv_filter_or_cow_bs(overlay_bs)) + { + overlay_bs = below_bs; } /* If we want to replace the backing file we need some extra checks */ - if (new_backing_bs != backing_bs(overlay_bs)) { + if (new_backing_bs != bdrv_filter_or_cow_bs(overlay_bs)) { /* Check for implicit nodes between bs and its backing file */ if (bs != overlay_bs) { error_setg(errp, "Cannot change backing link if '%s' has " "an implicit backing file", bs->node_name); return -EPERM; } - /* Check if the backing link that we want to replace is frozen */ - if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs), - errp)) { + /* + * Check if the backing link that we want to replace is frozen. + * Note that + * bdrv_filter_or_cow_child(overlay_bs) == overlay_bs->backing, + * because we know that overlay_bs == bs, and that @bs + * either is a filter that uses ->backing or a COW format BDS + * with bs->drv->supports_backing == true. + */ + if (bdrv_is_backing_chain_frozen(overlay_bs, + child_bs(overlay_bs->backing), errp)) + { return -EPERM; } reopen_state->replace_backing_bs = true; @@ -4203,7 +4238,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, * its metadata. Otherwise the 'backing' option can be omitted. */ if (drv->supports_backing && reopen_state->backing_missing && - (backing_bs(reopen_state->bs) || reopen_state->bs->backing_file[0])) { + (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) { error_setg(errp, "backing is missing for '%s'", reopen_state->bs->node_name); ret = -EINVAL; @@ -4344,7 +4379,7 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state) * from bdrv_set_backing_hd()) has the new values. */ if (reopen_state->replace_backing_bs) { - BlockDriverState *old_backing_bs = backing_bs(bs); + BlockDriverState *old_backing_bs = child_bs(bs->backing); assert(!old_backing_bs || !old_backing_bs->implicit); /* Abort the permission update on the backing bs we're detaching */ if (old_backing_bs) { @@ -4387,6 +4422,7 @@ static void bdrv_close(BlockDriverState *bs) if (bs->drv) { if (bs->drv->bdrv_close) { + /* Must unfreeze all children, so bdrv_unref_child() works */ bs->drv->bdrv_close(bs); } bs->drv = NULL; @@ -4736,9 +4772,9 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, } /* - * Finds the image layer in the chain that has 'bs' as its backing file. - * - * active is the current topmost image. + * Finds the first non-filter node above bs in the chain between + * active and bs. The returned node is either an immediate parent of + * bs, or there are only filter nodes between the two. * * Returns NULL if bs is not found in active's image chain, * or if active == bs. @@ -4748,11 +4784,18 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, BlockDriverState *bdrv_find_overlay(BlockDriverState *active, BlockDriverState *bs) { - while (active && bs != backing_bs(active)) { - active = backing_bs(active); + bs = bdrv_skip_filters(bs); + active = bdrv_skip_filters(active); + + while (active) { + BlockDriverState *next = bdrv_backing_chain_next(active); + if (bs == next) { + return active; + } + active = next; } - return active; + return NULL; } /* Given a BDS, searches for the base layer. */ @@ -4762,20 +4805,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs) } /* - * Return true if at least one of the backing links between @bs and - * @base is frozen. @errp is set if that's the case. + * Return true if at least one of the COW (backing) and filter links + * between @bs and @base is frozen. @errp is set if that's the case. * @base must be reachable from @bs, or NULL. */ bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, Error **errp) { BlockDriverState *i; + BdrvChild *child; - for (i = bs; i != base; i = backing_bs(i)) { - if (i->backing && i->backing->frozen) { + for (i = bs; i != base; i = child_bs(child)) { + child = bdrv_filter_or_cow_child(i); + + if (child && child->frozen) { error_setg(errp, "Cannot change '%s' link from '%s' to '%s'", - i->backing->name, i->node_name, - backing_bs(i)->node_name); + child->name, i->node_name, child->bs->node_name); return true; } } @@ -4784,7 +4829,7 @@ bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, } /* - * Freeze all backing links between @bs and @base. + * Freeze all COW (backing) and filter links between @bs and @base. * If any of the links is already frozen the operation is aborted and * none of the links are modified. * @base must be reachable from @bs, or NULL. @@ -4794,22 +4839,25 @@ int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, Error **errp) { BlockDriverState *i; + BdrvChild *child; if (bdrv_is_backing_chain_frozen(bs, base, errp)) { return -EPERM; } - for (i = bs; i != base; i = backing_bs(i)) { - if (i->backing && backing_bs(i)->never_freeze) { + for (i = bs; i != base; i = child_bs(child)) { + child = bdrv_filter_or_cow_child(i); + if (child && child->bs->never_freeze) { error_setg(errp, "Cannot freeze '%s' link to '%s'", - i->backing->name, backing_bs(i)->node_name); + child->name, child->bs->node_name); return -EPERM; } } - for (i = bs; i != base; i = backing_bs(i)) { - if (i->backing) { - i->backing->frozen = true; + for (i = bs; i != base; i = child_bs(child)) { + child = bdrv_filter_or_cow_child(i); + if (child) { + child->frozen = true; } } @@ -4817,18 +4865,21 @@ int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, } /* - * Unfreeze all backing links between @bs and @base. The caller must - * ensure that all links are frozen before using this function. + * Unfreeze all COW (backing) and filter links between @bs and @base. + * The caller must ensure that all links are frozen before using this + * function. * @base must be reachable from @bs, or NULL. */ void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base) { BlockDriverState *i; + BdrvChild *child; - for (i = bs; i != base; i = backing_bs(i)) { - if (i->backing) { - assert(i->backing->frozen); - i->backing->frozen = false; + for (i = bs; i != base; i = child_bs(child)) { + child = bdrv_filter_or_cow_child(i); + if (child) { + assert(child->frozen); + child->frozen = false; } } } @@ -4896,9 +4947,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, * other intermediate nodes have been dropped. * If 'top' is an implicit node (e.g. "commit_top") we should skip * it because no one inherits from it. We use explicit_top for that. */ - while (explicit_top && explicit_top->implicit) { - explicit_top = backing_bs(explicit_top); - } + explicit_top = bdrv_skip_implicit_filters(explicit_top); update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top); /* success - we can delete the intermediate states, and link top->base */ @@ -4931,8 +4980,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, } } - /* Do the actual switch in the in-memory graph. - * Completes bdrv_check_update_perm() transaction internally. */ + /* + * Do the actual switch in the in-memory graph. + * Completes bdrv_check_update_perm() transaction internally. + * c->frozen is false, we have checked that above. + */ bdrv_ref(base); bdrv_replace_child(c, base); bdrv_unref(top); @@ -4949,6 +5001,31 @@ exit: return ret; } +/** + * Implementation of BlockDriver.bdrv_get_allocated_file_size() that + * sums the size of all data-bearing children. (This excludes backing + * children.) + */ +static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs) +{ + BdrvChild *child; + int64_t child_size, sum = 0; + + QLIST_FOREACH(child, &bs->children, next) { + if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA | + BDRV_CHILD_FILTERED)) + { + child_size = bdrv_get_allocated_file_size(child->bs); + if (child_size < 0) { + return child_size; + } + sum += child_size; + } + } + + return sum; +} + /** * Length of a allocated file in bytes. Sparse files are counted by actual * allocated space. Return < 0 if error or unknown. @@ -4962,10 +5039,21 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) if (drv->bdrv_get_allocated_file_size) { return drv->bdrv_get_allocated_file_size(bs); } - if (bs->file) { - return bdrv_get_allocated_file_size(bs->file->bs); + + if (drv->bdrv_file_open) { + /* + * Protocol drivers default to -ENOTSUP (most of their data is + * not stored in any of their children (if they even have any), + * so there is no generic way to figure it out). + */ + return -ENOTSUP; + } else if (drv->is_filter) { + /* Filter drivers default to the size of their filtered child */ + return bdrv_get_allocated_file_size(bdrv_filter_bs(bs)); + } else { + /* Other drivers default to summing their children's sizes */ + return bdrv_sum_allocated_file_size(bs); } - return -ENOTSUP; } /* @@ -5047,12 +5135,27 @@ bool bdrv_is_sg(BlockDriverState *bs) return bs->sg; } -bool bdrv_is_encrypted(BlockDriverState *bs) +/** + * Return whether the given node supports compressed writes. + */ +bool bdrv_supports_compressed_writes(BlockDriverState *bs) { - if (bs->backing && bs->backing->bs->encrypted) { - return true; + BlockDriverState *filtered; + + if (!bs->drv || !block_driver_can_compress(bs->drv)) { + return false; } - return bs->encrypted; + + filtered = bdrv_filter_bs(bs); + if (filtered) { + /* + * Filters can only forward compressed writes, so we have to + * check the child. + */ + return bdrv_supports_compressed_writes(filtered); + } + + return true; } const char *bdrv_get_format_name(BlockDriverState *bs) @@ -5337,7 +5440,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device, bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) { while (top && top != base) { - top = backing_bs(top); + top = bdrv_filter_or_cow_bs(top); } return top != NULL; @@ -5409,20 +5512,24 @@ int bdrv_has_zero_init_1(BlockDriverState *bs) int bdrv_has_zero_init(BlockDriverState *bs) { + BlockDriverState *filtered; + if (!bs->drv) { return 0; } /* If BS is a copy on write image, it is initialized to the contents of the base image, which may not be zeroes. */ - if (bs->backing) { + if (bdrv_cow_child(bs)) { return 0; } if (bs->drv->bdrv_has_zero_init) { return bs->drv->bdrv_has_zero_init(bs); } - if (bs->file && bs->drv->is_filter) { - return bdrv_has_zero_init(bs->file->bs); + + filtered = bdrv_filter_bs(bs); + if (filtered) { + return bdrv_has_zero_init(filtered); } /* safe default */ @@ -5452,8 +5559,9 @@ int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return -ENOMEDIUM; } if (!drv->bdrv_get_info) { - if (bs->file && drv->is_filter) { - return bdrv_get_info(bs->file->bs, bdi); + BlockDriverState *filtered = bdrv_filter_bs(bs); + if (filtered) { + return bdrv_get_info(filtered, bdi); } return -ENOTSUP; } @@ -5492,17 +5600,7 @@ void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event) static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs) { while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { - if (bs->file) { - bs = bs->file->bs; - continue; - } - - if (bs->drv->is_filter && bs->backing) { - bs = bs->backing->bs; - continue; - } - - break; + bs = bdrv_primary_bs(bs); } if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { @@ -5537,7 +5635,7 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { - bs = bs->file ? bs->file->bs : NULL; + bs = bdrv_primary_bs(bs); } if (bs && bs->drv && bs->drv->bdrv_debug_resume) { @@ -5550,7 +5648,7 @@ int bdrv_debug_resume(BlockDriverState *bs, const char *tag) bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) { while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { - bs = bs->file ? bs->file->bs : NULL; + bs = bdrv_primary_bs(bs); } if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { @@ -5571,8 +5669,10 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, char *backing_file_full = NULL; char *filename_tmp = NULL; int is_protocol = 0; + bool filenames_refreshed = false; BlockDriverState *curr_bs = NULL; BlockDriverState *retval = NULL; + BlockDriverState *bs_below; if (!bs || !bs->drv || !backing_file) { return NULL; @@ -5583,15 +5683,47 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, is_protocol = path_has_protocol(backing_file); - for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) { + /* + * Being largely a legacy function, skip any filters here + * (because filters do not have normal filenames, so they cannot + * match anyway; and allowing json:{} filenames is a bit out of + * scope). + */ + for (curr_bs = bdrv_skip_filters(bs); + bdrv_cow_child(curr_bs) != NULL; + curr_bs = bs_below) + { + bs_below = bdrv_backing_chain_next(curr_bs); - /* If either of the filename paths is actually a protocol, then - * compare unmodified paths; otherwise make paths relative */ - if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (bdrv_backing_overridden(curr_bs)) { + /* + * If the backing file was overridden, we can only compare + * directly against the backing node's filename. + */ + + if (!filenames_refreshed) { + /* + * This will automatically refresh all of the + * filenames in the rest of the backing chain, so we + * only need to do this once. + */ + bdrv_refresh_filename(bs_below); + filenames_refreshed = true; + } + + if (strcmp(backing_file, bs_below->filename) == 0) { + retval = bs_below; + break; + } + } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + /* + * If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative. + */ char *backing_file_full_ret; if (strcmp(backing_file, curr_bs->backing_file) == 0) { - retval = curr_bs->backing->bs; + retval = bs_below; break; } /* Also check against the full backing filename for the image */ @@ -5601,7 +5733,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, bool equal = strcmp(backing_file, backing_file_full_ret) == 0; g_free(backing_file_full_ret); if (equal) { - retval = curr_bs->backing->bs; + retval = bs_below; break; } } @@ -5627,7 +5759,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, g_free(filename_tmp); if (strcmp(backing_file_full, filename_full) == 0) { - retval = curr_bs->backing->bs; + retval = bs_below; break; } } @@ -6119,6 +6251,10 @@ void bdrv_img_create(const char *filename, const char *fmt, "same filename as the backing file"); goto out; } + if (backing_file[0] == '\0') { + error_setg(errp, "Expected backing file name, got empty string"); + goto out; + } } backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); @@ -6534,6 +6670,8 @@ int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, bool bdrv_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace) { + BlockDriverState *filtered; + if (!bs || !bs->drv) { return false; } @@ -6548,9 +6686,9 @@ bool bdrv_recurse_can_replace(BlockDriverState *bs, } /* For filters without an own implementation, we can recurse on our own */ - if (bs->drv->is_filter) { - BdrvChild *child = bs->file ?: bs->backing; - return bdrv_recurse_can_replace(child->bs, to_replace); + filtered = bdrv_filter_bs(bs); + if (filtered) { + return bdrv_recurse_can_replace(filtered, to_replace); } /* Safe default */ @@ -6701,7 +6839,7 @@ static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs) /* Note: This function may return false positives; it may return true * even if opening the backing file specified by bs's image header * would result in exactly bs->backing. */ -static bool bdrv_backing_overridden(BlockDriverState *bs) +bool bdrv_backing_overridden(BlockDriverState *bs) { if (bs->backing) { return strcmp(bs->auto_backing_file, @@ -6729,6 +6867,7 @@ void bdrv_refresh_filename(BlockDriverState *bs) { BlockDriver *drv = bs->drv; BdrvChild *child; + BlockDriverState *primary_child_bs; QDict *opts; bool backing_overridden; bool generate_json_filename; /* Whether our default implementation should @@ -6798,20 +6937,30 @@ void bdrv_refresh_filename(BlockDriverState *bs) qobject_unref(bs->full_open_options); bs->full_open_options = opts; + primary_child_bs = bdrv_primary_bs(bs); + if (drv->bdrv_refresh_filename) { /* Obsolete information is of no use here, so drop the old file name * information before refreshing it */ bs->exact_filename[0] = '\0'; drv->bdrv_refresh_filename(bs); - } else if (bs->file) { - /* Try to reconstruct valid information from the underlying file */ + } else if (primary_child_bs) { + /* + * Try to reconstruct valid information from the underlying + * file -- this only works for format nodes (filter nodes + * cannot be probed and as such must be selected by the user + * either through an options dict, or through a special + * filename which the filter driver must construct in its + * .bdrv_refresh_filename() implementation). + */ bs->exact_filename[0] = '\0'; /* * We can use the underlying file's filename if: * - it has a filename, + * - the current BDS is not a filter, * - the file is a protocol BDS, and * - opening that file (as this BDS's format) will automatically create * the BDS tree we have right now, that is: @@ -6820,11 +6969,11 @@ void bdrv_refresh_filename(BlockDriverState *bs) * - no non-file child of this BDS has been overridden by the user * Both of these conditions are represented by generate_json_filename. */ - if (bs->file->bs->exact_filename[0] && - bs->file->bs->drv->bdrv_file_open && - !generate_json_filename) + if (primary_child_bs->exact_filename[0] && + primary_child_bs->drv->bdrv_file_open && + !drv->is_filter && !generate_json_filename) { - strcpy(bs->exact_filename, bs->file->bs->exact_filename); + strcpy(bs->exact_filename, primary_child_bs->exact_filename); } } @@ -6844,6 +6993,7 @@ void bdrv_refresh_filename(BlockDriverState *bs) char *bdrv_dirname(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BlockDriverState *child_bs; if (!drv) { error_setg(errp, "Node '%s' is ejected", bs->node_name); @@ -6854,8 +7004,9 @@ char *bdrv_dirname(BlockDriverState *bs, Error **errp) return drv->bdrv_dirname(bs, errp); } - if (bs->file) { - return bdrv_dirname(bs->file->bs, errp); + child_bs = bdrv_primary_bs(bs); + if (child_bs) { + return bdrv_dirname(child_bs, errp); } bdrv_refresh_filename(bs); @@ -6939,3 +7090,156 @@ int bdrv_make_empty(BdrvChild *c, Error **errp) return 0; } + +/* + * Return the child that @bs acts as an overlay for, and from which data may be + * copied in COW or COR operations. Usually this is the backing file. + */ +BdrvChild *bdrv_cow_child(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + return NULL; + } + + if (bs->drv->is_filter) { + return NULL; + } + + if (!bs->backing) { + return NULL; + } + + assert(bs->backing->role & BDRV_CHILD_COW); + return bs->backing; +} + +/* + * If @bs acts as a filter for exactly one of its children, return + * that child. + */ +BdrvChild *bdrv_filter_child(BlockDriverState *bs) +{ + BdrvChild *c; + + if (!bs || !bs->drv) { + return NULL; + } + + if (!bs->drv->is_filter) { + return NULL; + } + + /* Only one of @backing or @file may be used */ + assert(!(bs->backing && bs->file)); + + c = bs->backing ?: bs->file; + if (!c) { + return NULL; + } + + assert(c->role & BDRV_CHILD_FILTERED); + return c; +} + +/* + * Return either the result of bdrv_cow_child() or bdrv_filter_child(), + * whichever is non-NULL. + * + * Return NULL if both are NULL. + */ +BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs) +{ + BdrvChild *cow_child = bdrv_cow_child(bs); + BdrvChild *filter_child = bdrv_filter_child(bs); + + /* Filter nodes cannot have COW backing files */ + assert(!(cow_child && filter_child)); + + return cow_child ?: filter_child; +} + +/* + * Return the primary child of this node: For filters, that is the + * filtered child. For other nodes, that is usually the child storing + * metadata. + * (A generally more helpful description is that this is (usually) the + * child that has the same filename as @bs.) + * + * Drivers do not necessarily have a primary child; for example quorum + * does not. + */ +BdrvChild *bdrv_primary_child(BlockDriverState *bs) +{ + BdrvChild *c, *found = NULL; + + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & BDRV_CHILD_PRIMARY) { + assert(!found); + found = c; + } + } + + return found; +} + +static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs, + bool stop_on_explicit_filter) +{ + BdrvChild *c; + + if (!bs) { + return NULL; + } + + while (!(stop_on_explicit_filter && !bs->implicit)) { + c = bdrv_filter_child(bs); + if (!c) { + /* + * A filter that is embedded in a working block graph must + * have a child. Assert this here so this function does + * not return a filter node that is not expected by the + * caller. + */ + assert(!bs->drv || !bs->drv->is_filter); + break; + } + bs = c->bs; + } + /* + * Note that this treats nodes with bs->drv == NULL as not being + * filters (bs->drv == NULL should be replaced by something else + * anyway). + * The advantage of this behavior is that this function will thus + * always return a non-NULL value (given a non-NULL @bs). + */ + + return bs; +} + +/* + * Return the first BDS that has not been added implicitly or that + * does not have a filtered child down the chain starting from @bs + * (including @bs itself). + */ +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs) +{ + return bdrv_do_skip_filters(bs, true); +} + +/* + * Return the first BDS that does not have a filtered child down the + * chain starting from @bs (including @bs itself). + */ +BlockDriverState *bdrv_skip_filters(BlockDriverState *bs) +{ + return bdrv_do_skip_filters(bs, false); +} + +/* + * For a backing chain, return the first non-filter backing image of + * the first non-filter image. + */ +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) +{ + return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs))); +} diff --git a/block/backup-top.c b/block/backup-top.c index af2f20f346..fe6883cc97 100644 --- a/block/backup-top.c +++ b/block/backup-top.c @@ -175,8 +175,6 @@ BlockDriver bdrv_backup_top_filter = { .bdrv_co_pdiscard = backup_top_co_pdiscard, .bdrv_co_flush = backup_top_co_flush, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, - .bdrv_refresh_filename = backup_top_refresh_filename, .bdrv_child_perm = backup_top_child_perm, @@ -281,7 +279,7 @@ void bdrv_backup_top_drop(BlockDriverState *bs) s->active = false; bdrv_child_refresh_perms(bs, bs->backing, &error_abort); - bdrv_replace_node(bs, backing_bs(bs), &error_abort); + bdrv_replace_node(bs, bs->backing->bs, &error_abort); bdrv_set_backing_hd(bs, NULL, &error_abort); bdrv_drained_end(bs); diff --git a/block/backup.c b/block/backup.c index 4f13bb20a5..9afa0bf3b4 100644 --- a/block/backup.c +++ b/block/backup.c @@ -297,6 +297,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, { int ret; BlockDriverInfo bdi; + bool target_does_cow = bdrv_backing_chain_next(target); /* * If there is no backing file on the target, we cannot rely on COW if our @@ -304,7 +305,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, * targets with a backing file, try to avoid COW if possible. */ ret = bdrv_get_info(target, &bdi); - if (ret == -ENOTSUP && !target->backing) { + if (ret == -ENOTSUP && !target_does_cow) { /* Cluster size is not defined */ warn_report("The target block device doesn't provide " "information about the block size and it doesn't have a " @@ -313,14 +314,14 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, "this default, the backup may be unusable", BACKUP_CLUSTER_SIZE_DEFAULT); return BACKUP_CLUSTER_SIZE_DEFAULT; - } else if (ret < 0 && !target->backing) { + } else if (ret < 0 && !target_does_cow) { error_setg_errno(errp, -ret, "Couldn't determine the cluster size of the target image, " "which has no backing file"); error_append_hint(errp, "Aborting, since this may create an unusable destination image\n"); return ret; - } else if (ret < 0 && target->backing) { + } else if (ret < 0 && target_does_cow) { /* Not fatal; just trudge on ahead. */ return BACKUP_CLUSTER_SIZE_DEFAULT; } @@ -371,7 +372,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, return NULL; } - if (compress && !block_driver_can_compress(target->drv)) { + if (compress && !bdrv_supports_compressed_writes(target)) { error_setg(errp, "Compression is not supported for this drive %s", bdrv_get_device_name(target)); return NULL; diff --git a/block/blkdebug.c b/block/blkdebug.c index 9c08d8a005..eecbf3e5c4 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -752,8 +752,11 @@ static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs, return err; } - return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes, - pnum, map, file); + assert(bs->file && bs->file->bs); + *pnum = bytes; + *map = offset; + *file = bs->file->bs; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; } static void blkdebug_close(BlockDriverState *bs) diff --git a/block/blklogwrites.c b/block/blklogwrites.c index 57315f56b4..13ae63983b 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -515,7 +515,6 @@ static BlockDriver bdrv_blk_log_writes = { .bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes, .bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk, .bdrv_co_pdiscard = blk_log_writes_co_pdiscard, - .bdrv_co_block_status = bdrv_co_block_status_from_file, .is_filter = true, .strong_runtime_opts = blk_log_writes_strong_runtime_opts, diff --git a/block/block-backend.c b/block/block-backend.c index 3a13cb5f0b..24dd0670d1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2279,10 +2279,13 @@ int blk_commit_all(void) while ((blk = blk_all_next(blk)) != NULL) { AioContext *aio_context = blk_get_aio_context(blk); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk)); aio_context_acquire(aio_context); - if (blk_is_inserted(blk) && blk->root->bs->backing) { - int ret = bdrv_commit(blk->root->bs); + if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) { + int ret; + + ret = bdrv_commit(unfiltered_bs); if (ret < 0) { aio_context_release(aio_context); return ret; diff --git a/block/block-copy.c b/block/block-copy.c index a30b9097ef..cd9bc47c8f 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -440,8 +440,8 @@ static int block_copy_block_status(BlockCopyState *s, int64_t offset, BlockDriverState *base; int ret; - if (s->skip_unallocated && s->source->bs->backing) { - base = s->source->bs->backing->bs; + if (s->skip_unallocated) { + base = bdrv_backing_chain_next(s->source->bs); } else { base = NULL; } diff --git a/block/commit.c b/block/commit.c index 7732d02dfe..1e85c306cc 100644 --- a/block/commit.c +++ b/block/commit.c @@ -37,6 +37,7 @@ typedef struct CommitBlockJob { BlockBackend *top; BlockBackend *base; BlockDriverState *base_bs; + BlockDriverState *base_overlay; BlockdevOnError on_error; bool base_read_only; bool chain_frozen; @@ -89,7 +90,7 @@ static void commit_abort(Job *job) * XXX Can (or should) we somehow keep 'consistent read' blocked even * after the failed/cancelled commit job is gone? If we already wrote * something to base, the intermediate images aren't valid any more. */ - bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs), + bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs, &error_abort); bdrv_unref(s->commit_top_bs); @@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false, + ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true, offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); @@ -237,7 +238,6 @@ static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c, static BlockDriver bdrv_commit_top = { .format_name = "commit_top", .bdrv_co_preadv = bdrv_commit_top_preadv, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_commit_top_refresh_filename, .bdrv_child_perm = bdrv_commit_top_child_perm, @@ -253,15 +253,35 @@ void commit_start(const char *job_id, BlockDriverState *bs, CommitBlockJob *s; BlockDriverState *iter; BlockDriverState *commit_top_bs = NULL; + BlockDriverState *filtered_base; Error *local_err = NULL; + int64_t base_size, top_size; + uint64_t base_perms, iter_shared_perms; int ret; assert(top != bs); - if (top == base) { + if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; } + base_size = bdrv_getlength(base); + if (base_size < 0) { + error_setg_errno(errp, -base_size, "Could not inquire base image size"); + return; + } + + top_size = bdrv_getlength(top); + if (top_size < 0) { + error_setg_errno(errp, -top_size, "Could not inquire top image size"); + return; + } + + base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; + if (base_size < top_size) { + base_perms |= BLK_PERM_RESIZE; + } + s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL, speed, creation_flags, NULL, NULL, errp); if (!s) { @@ -301,17 +321,43 @@ void commit_start(const char *job_id, BlockDriverState *bs, s->commit_top_bs = commit_top_bs; - /* Block all nodes between top and base, because they will - * disappear from the chain after this operation. */ - assert(bdrv_chain_contains(top, base)); - for (iter = top; iter != base; iter = backing_bs(iter)) { - /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves - * at s->base (if writes are blocked for a node, they are also blocked - * for its backing file). The other options would be a second filter - * driver above s->base. */ + /* + * Block all nodes between top and base, because they will + * disappear from the chain after this operation. + * Note that this assumes that the user is fine with removing all + * nodes (including R/W filters) between top and base. Assuring + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ + s->base_overlay = bdrv_find_overlay(top, base); + assert(s->base_overlay); + + /* + * The topmost node with + * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base) + */ + filtered_base = bdrv_cow_bs(s->base_overlay); + assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)); + + /* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves + * at s->base (if writes are blocked for a node, they are also blocked + * for its backing file). The other options would be a second filter + * driver above s->base. + */ + iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + + for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) { + if (iter == filtered_base) { + /* + * From here on, all nodes are filters on the base. This + * allows us to share BLK_PERM_CONSISTENT_READ. + */ + iter_shared_perms |= BLK_PERM_CONSISTENT_READ; + } + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -328,9 +374,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, } s->base = blk_new(s->common.job.aio_context, - BLK_PERM_CONSISTENT_READ - | BLK_PERM_WRITE - | BLK_PERM_RESIZE, + base_perms, BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD | BLK_PERM_WRITE_UNCHANGED); @@ -398,19 +442,22 @@ int bdrv_commit(BlockDriverState *bs) if (!drv) return -ENOMEDIUM; - if (!bs->backing) { + backing_file_bs = bdrv_cow_bs(bs); + + if (!backing_file_bs) { return -ENOTSUP; } if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) || - bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) { + bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) + { return -EBUSY; } - ro = bs->backing->bs->read_only; + ro = backing_file_bs->read_only; if (ro) { - if (bdrv_reopen_set_read_only(bs->backing->bs, false, NULL)) { + if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) { return -EACCES; } } @@ -428,8 +475,6 @@ int bdrv_commit(BlockDriverState *bs) } /* Insert commit_top block node above backing, so we can write to it */ - backing_file_bs = backing_bs(bs); - commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR, &local_err); if (commit_top_bs == NULL) { @@ -515,7 +560,7 @@ ro_cleanup: qemu_vfree(buf); blk_unref(backing); - if (backing_file_bs) { + if (bdrv_cow_bs(bs) != backing_file_bs) { bdrv_set_backing_hd(bs, backing_file_bs, &error_abort); } bdrv_unref(commit_top_bs); @@ -523,7 +568,7 @@ ro_cleanup: if (ro) { /* ignoring error return here */ - bdrv_reopen_set_read_only(bs->backing->bs, true, NULL); + bdrv_reopen_set_read_only(backing_file_bs, true, NULL); } return ret; diff --git a/block/copy-on-read.c b/block/copy-on-read.c index a6e3c74a68..2816e61afe 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -107,6 +107,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, } +static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + + static void cor_eject(BlockDriverState *bs, bool eject_flag) { bdrv_eject(bs->file->bs, eject_flag); @@ -131,12 +141,11 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_co_pwritev = cor_co_pwritev, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, + .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, .bdrv_eject = cor_eject, .bdrv_lock_medium = cor_lock_medium, - .bdrv_co_block_status = bdrv_co_block_status_from_file, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/file-win32.c b/block/file-win32.c index ab69bd811a..e2900c3a51 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -299,6 +299,11 @@ static QemuOptsList raw_runtime_opts = { .type = QEMU_OPT_STRING, .help = "host AIO implementation (threads, native)", }, + { + .name = "locking", + .type = QEMU_OPT_STRING, + .help = "file locking mode (on/off/auto, default: auto)", + }, { /* end of list */ } }, }; @@ -333,6 +338,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; const char *filename; bool use_aio; + OnOffAuto locking; int ret; s->type = FTYPE_FILE; @@ -343,10 +349,24 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (qdict_get_try_bool(options, "locking", false)) { + locking = qapi_enum_parse(&OnOffAuto_lookup, + qemu_opt_get(opts, "locking"), + ON_OFF_AUTO_AUTO, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + switch (locking) { + case ON_OFF_AUTO_ON: error_setg(errp, "locking=on is not supported on Windows"); ret = -EINVAL; goto fail; + case ON_OFF_AUTO_OFF: + case ON_OFF_AUTO_AUTO: + break; + default: + g_assert_not_reached(); } filename = qemu_opt_get(opts, "filename"); diff --git a/block/filter-compress.c b/block/filter-compress.c index 8ec1991c1f..5136371bf8 100644 --- a/block/filter-compress.c +++ b/block/filter-compress.c @@ -146,8 +146,6 @@ static BlockDriver bdrv_compress = { .bdrv_eject = compress_eject, .bdrv_lock_medium = compress_lock_medium, - .bdrv_co_block_status = bdrv_co_block_status_from_file, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/io.c b/block/io.c index ad3a51ed53..a2389bb38c 100644 --- a/block/io.c +++ b/block/io.c @@ -135,6 +135,8 @@ static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BdrvChild *c; + bool have_limits; Error *local_err = NULL; memset(&bs->bl, 0, sizeof(bs->bl)); @@ -149,14 +151,21 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) drv->bdrv_co_preadv_part) ? 1 : 512; /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; + have_limits = false; + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW)) + { + bdrv_refresh_limits(c->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bdrv_merge_limits(&bs->bl, &c->bs->bl); + have_limits = true; } - bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); - } else { + } + + if (!have_limits) { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = qemu_real_host_page_size; @@ -164,15 +173,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_iov = IOV_MAX; } - if (bs->backing) { - bdrv_refresh_limits(bs->backing->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); - } - /* Then let the driver override it */ if (drv->bdrv_refresh_limits) { drv->bdrv_refresh_limits(bs, errp); @@ -2255,36 +2255,6 @@ typedef struct BdrvCoBlockStatusData { BlockDriverState **file; } BdrvCoBlockStatusData; -int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file) -{ - assert(bs->file && bs->file->bs); - *pnum = bytes; - *map = offset; - *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; -} - -int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file) -{ - assert(bs->backing && bs->backing->bs); - *pnum = bytes; - *map = offset; - *file = bs->backing->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; -} - /* * Returns the allocation status of the specified sectors. * Drivers not implementing the functionality are assumed to not support @@ -2325,6 +2295,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, BlockDriverState *local_file = NULL; int64_t aligned_offset, aligned_bytes; uint32_t align; + bool has_filtered_child; assert(pnum); *pnum = 0; @@ -2350,7 +2321,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, /* Must be non-NULL or bdrv_getlength() would have failed */ assert(bs->drv); - if (!bs->drv->bdrv_co_block_status) { + has_filtered_child = bdrv_filter_child(bs); + if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { *pnum = bytes; ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; if (offset + bytes == total_size) { @@ -2371,9 +2343,20 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, aligned_offset = QEMU_ALIGN_DOWN(offset, align); aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, - aligned_bytes, pnum, &local_map, - &local_file); + if (bs->drv->bdrv_co_block_status) { + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + } else { + /* Default code for filters */ + + local_file = bdrv_filter_bs(bs); + assert(local_file); + + *pnum = aligned_bytes; + local_map = aligned_offset; + ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; + } if (ret < 0) { *pnum = 0; goto out; @@ -2409,9 +2392,10 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { ret |= BDRV_BLOCK_ALLOCATED; } else if (want_zero && bs->drv->supports_backing) { - if (bs->backing) { - BlockDriverState *bs2 = bs->backing->bs; - int64_t size2 = bdrv_getlength(bs2); + BlockDriverState *cow_bs = bdrv_cow_bs(bs); + + if (cow_bs) { + int64_t size2 = bdrv_getlength(cow_bs); if (size2 >= 0 && offset >= size2) { ret |= BDRV_BLOCK_ZERO; @@ -2479,7 +2463,7 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, bool first = true; assert(bs != base); - for (p = bs; p != base; p = backing_bs(p)) { + for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) { ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, file); if (ret < 0) { @@ -2553,7 +2537,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { - return bdrv_block_status_above(bs, backing_bs(bs), + return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), offset, bytes, pnum, map, file); } @@ -2563,9 +2547,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int ret; int64_t dummy; - ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, - bytes, pnum ? pnum : &dummy, NULL, - NULL); + ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false, + offset, bytes, pnum ? pnum : &dummy, + NULL, NULL); if (ret < 0) { return ret; } @@ -2628,7 +2612,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, break; } - intermediate = backing_bs(intermediate); + intermediate = bdrv_filter_or_cow_bs(intermediate); } *pnum = n; @@ -2647,6 +2631,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, bool is_read) { BlockDriver *drv = bs->drv; + BlockDriverState *child_bs = bdrv_primary_bs(bs); int ret = -ENOTSUP; bdrv_inc_in_flight(bs); @@ -2659,8 +2644,8 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, } else { ret = drv->bdrv_save_vmstate(bs, qiov, pos); } - } else if (bs->file) { - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); + } else if (child_bs) { + ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read); } bdrv_dec_in_flight(bs); @@ -2770,6 +2755,8 @@ static int coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { + BdrvChild *primary_child = bdrv_primary_child(bs); + BdrvChild *child; int current_gen; int ret = 0; @@ -2799,7 +2786,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) } /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); if (bs->drv->bdrv_co_flush_to_os) { ret = bs->drv->bdrv_co_flush_to_os(bs); if (ret < 0) { @@ -2809,15 +2796,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* But don't actually force it to the disk with cache=unsafe */ if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; + goto flush_children; } /* Check if we really need to flush anything */ if (bs->flushed_gen == current_gen) { - goto flush_parent; + goto flush_children; } - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); if (!bs->drv) { /* bs->drv->bdrv_co_flush() might have ejected the BDS * (even in case of apparent success) */ @@ -2861,8 +2848,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ -flush_parent: - ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +flush_children: + ret = 0; + QLIST_FOREACH(child, &bs->children, next) { + if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { + int this_child_ret = bdrv_co_flush(child->bs); + if (!ret) { + ret = this_child_ret; + } + } + } + out: /* Notify any pending flushes that we have completed */ if (ret == 0) { @@ -3309,6 +3305,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, Error **errp) { BlockDriverState *bs = child->bs; + BdrvChild *filtered, *backing; BlockDriver *drv = bs->drv; BdrvTrackedRequest req; int64_t old_size, new_bytes; @@ -3360,6 +3357,9 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, goto out; } + filtered = bdrv_filter_child(bs); + backing = bdrv_cow_child(bs); + /* * If the image has a backing file that is large enough that it would * provide data for the new area, we cannot leave it unallocated because @@ -3370,10 +3370,10 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, * backing file, taking care of keeping things consistent with that backing * file is the user's responsibility. */ - if (new_bytes && bs->backing) { + if (new_bytes && backing) { int64_t backing_len; - backing_len = bdrv_getlength(backing_bs(bs)); + backing_len = bdrv_getlength(backing->bs); if (backing_len < 0) { ret = backing_len; error_setg_errno(errp, -ret, "Could not get backing file size"); @@ -3392,8 +3392,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, goto out; } ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); - } else if (bs->file && drv->is_filter) { - ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } else if (filtered) { + ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp); } else { error_setg(errp, "Image format driver does not support resize"); ret = -ENOTSUP; diff --git a/block/mirror.c b/block/mirror.c index e8e8844afc..26acf4af6f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -42,6 +42,7 @@ typedef struct MirrorBlockJob { BlockBackend *target; BlockDriverState *mirror_top_bs; BlockDriverState *base; + BlockDriverState *base_overlay; /* The name of the graph node to replace */ char *replaces; @@ -677,8 +678,10 @@ static int mirror_exit_common(Job *job) &error_abort); if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) { BlockDriverState *backing = s->is_none_mode ? src : s->base; - if (backing_bs(target_bs) != backing) { - bdrv_set_backing_hd(target_bs, backing, &local_err); + BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs); + + if (bdrv_cow_bs(unfiltered_target) != backing) { + bdrv_set_backing_hd(unfiltered_target, backing, &local_err); if (local_err) { error_report_err(local_err); local_err = NULL; @@ -740,7 +743,7 @@ static int mirror_exit_common(Job *job) * valid. */ block_job_remove_all_bdrv(bjob); - bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort); + bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort); /* We just changed the BDS the job BB refers to (with either or both of the * bdrv_replace_node() calls), so switch the BB back so the cleanup does @@ -786,7 +789,6 @@ static void coroutine_fn mirror_throttle(MirrorBlockJob *s) static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) { int64_t offset; - BlockDriverState *base = s->base; BlockDriverState *bs = s->mirror_top_bs->backing->bs; BlockDriverState *target_bs = blk_bs(s->target); int ret; @@ -837,7 +839,8 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) return 0; } - ret = bdrv_is_allocated_above(bs, base, false, offset, bytes, &count); + ret = bdrv_is_allocated_above(bs, s->base_overlay, true, offset, bytes, + &count); if (ret < 0) { return ret; } @@ -936,7 +939,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) } else { s->target_cluster_size = BDRV_SECTOR_SIZE; } - if (backing_filename[0] && !target_bs->backing && + if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) && s->granularity < s->target_cluster_size) { s->buf_size = MAX(s->buf_size, s->target_cluster_size); s->cow_bitmap = bitmap_new(length); @@ -1116,8 +1119,9 @@ static void mirror_complete(Job *job, Error **errp) if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) { int ret; - assert(!target->backing); - ret = bdrv_open_backing_file(target, NULL, "backing", errp); + assert(!bdrv_backing_chain_next(target)); + ret = bdrv_open_backing_file(bdrv_skip_filters(target), NULL, + "backing", errp); if (ret < 0) { return; } @@ -1527,7 +1531,6 @@ static BlockDriver bdrv_mirror_top = { .bdrv_co_pwrite_zeroes = bdrv_mirror_top_pwrite_zeroes, .bdrv_co_pdiscard = bdrv_mirror_top_pdiscard, .bdrv_co_flush = bdrv_mirror_top_flush, - .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_mirror_top_refresh_filename, .bdrv_child_perm = bdrv_mirror_top_child_perm, @@ -1555,8 +1558,8 @@ static BlockJob *mirror_start_job( MirrorBlockJob *s; MirrorBDSOpaque *bs_opaque; BlockDriverState *mirror_top_bs; - bool target_graph_mod; bool target_is_backing; + uint64_t target_perms, target_shared_perms; Error *local_err = NULL; int ret; @@ -1575,7 +1578,7 @@ static BlockJob *mirror_start_job( buf_size = DEFAULT_MIRROR_BUF_SIZE; } - if (bs == target) { + if (bdrv_skip_filters(bs) == bdrv_skip_filters(target)) { error_setg(errp, "Can't mirror node into itself"); return NULL; } @@ -1639,15 +1642,50 @@ static BlockJob *mirror_start_job( * In the case of active commit, things look a bit different, though, * because the target is an already populated backing file in active use. * We can allow anything except resize there.*/ + + target_perms = BLK_PERM_WRITE; + target_shared_perms = BLK_PERM_WRITE_UNCHANGED; + target_is_backing = bdrv_chain_contains(bs, target); - target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN); + if (target_is_backing) { + int64_t bs_size, target_size; + bs_size = bdrv_getlength(bs); + if (bs_size < 0) { + error_setg_errno(errp, -bs_size, + "Could not inquire top image size"); + goto fail; + } + + target_size = bdrv_getlength(target); + if (target_size < 0) { + error_setg_errno(errp, -target_size, + "Could not inquire base image size"); + goto fail; + } + + if (target_size < bs_size) { + target_perms |= BLK_PERM_RESIZE; + } + + target_shared_perms |= BLK_PERM_CONSISTENT_READ + | BLK_PERM_WRITE + | BLK_PERM_GRAPH_MOD; + } else if (bdrv_chain_contains(bs, bdrv_skip_filters(target))) { + /* + * We may want to allow this in the future, but it would + * require taking some extra care. + */ + error_setg(errp, "Cannot mirror to a filter on top of a node in the " + "source's backing chain"); + goto fail; + } + + if (backing_mode != MIRROR_LEAVE_BACKING_CHAIN) { + target_perms |= BLK_PERM_GRAPH_MOD; + } + s->target = blk_new(s->common.job.aio_context, - BLK_PERM_WRITE | BLK_PERM_RESIZE | - (target_graph_mod ? BLK_PERM_GRAPH_MOD : 0), - BLK_PERM_WRITE_UNCHANGED | - (target_is_backing ? BLK_PERM_CONSISTENT_READ | - BLK_PERM_WRITE | - BLK_PERM_GRAPH_MOD : 0)); + target_perms, target_shared_perms); ret = blk_insert_bs(s->target, target, errp); if (ret < 0) { goto fail; @@ -1672,6 +1710,7 @@ static BlockJob *mirror_start_job( s->zero_target = zero_target; s->copy_mode = copy_mode; s->base = base; + s->base_overlay = bdrv_find_overlay(bs, base); s->granularity = granularity; s->buf_size = ROUND_UP(buf_size, granularity); s->unmap = unmap; @@ -1702,15 +1741,39 @@ static BlockJob *mirror_start_job( /* In commit_active_start() all intermediate nodes disappear, so * any jobs in them must be blocked */ if (target_is_backing) { - BlockDriverState *iter; - for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) { - /* XXX BLK_PERM_WRITE needs to be allowed so we don't block - * ourselves at s->base (if writes are blocked for a node, they are - * also blocked for its backing file). The other options would be a - * second filter driver above s->base (== target). */ + BlockDriverState *iter, *filtered_target; + uint64_t iter_shared_perms; + + /* + * The topmost node with + * bdrv_skip_filters(filtered_target) == bdrv_skip_filters(target) + */ + filtered_target = bdrv_cow_bs(bdrv_find_overlay(bs, target)); + + assert(bdrv_skip_filters(filtered_target) == + bdrv_skip_filters(target)); + + /* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block + * ourselves at s->base (if writes are blocked for a node, they are + * also blocked for its backing file). The other options would be a + * second filter driver above s->base (== target). + */ + iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + + for (iter = bdrv_filter_or_cow_bs(bs); iter != target; + iter = bdrv_filter_or_cow_bs(iter)) + { + if (iter == filtered_target) { + /* + * From here on, all nodes are filters on the base. + * This allows us to share BLK_PERM_CONSISTENT_READ. + */ + iter_shared_perms |= BLK_PERM_CONSISTENT_READ; + } + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -1746,7 +1809,7 @@ fail: bs_opaque->stop = true; bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing, &error_abort); - bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort); + bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort); bdrv_unref(mirror_top_bs); @@ -1774,7 +1837,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, return; } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; - base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL; + base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL; mirror_start_job(job_id, bs, creation_flags, target, replaces, speed, granularity, buf_size, backing_mode, zero_target, on_source_error, on_target_error, unmap, NULL, NULL, diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4c8c375172..4d3db5ed3c 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -217,7 +217,7 @@ void hmp_commit(Monitor *mon, const QDict *qdict) return; } - bs = blk_bs(blk); + bs = bdrv_skip_implicit_filters(blk_bs(blk)); aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); diff --git a/block/null.c b/block/null.c index 15e1d56746..cc9b1d4ea7 100644 --- a/block/null.c +++ b/block/null.c @@ -262,6 +262,11 @@ static void null_refresh_filename(BlockDriverState *bs) bs->drv->format_name); } +static int64_t null_allocated_file_size(BlockDriverState *bs) +{ + return 0; +} + static const char *const null_strong_runtime_opts[] = { BLOCK_OPT_SIZE, NULL_OPT_ZEROES, @@ -277,6 +282,7 @@ static BlockDriver bdrv_null_co = { .bdrv_file_open = null_file_open, .bdrv_parse_filename = null_co_parse_filename, .bdrv_getlength = null_getlength, + .bdrv_get_allocated_file_size = null_allocated_file_size, .bdrv_co_preadv = null_co_preadv, .bdrv_co_pwritev = null_co_pwritev, @@ -297,6 +303,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_file_open = null_file_open, .bdrv_parse_filename = null_aio_parse_filename, .bdrv_getlength = null_getlength, + .bdrv_get_allocated_file_size = null_allocated_file_size, .bdrv_aio_preadv = null_aio_preadv, .bdrv_aio_pwritev = null_aio_pwritev, diff --git a/block/nvme.c b/block/nvme.c index 05485fdd11..f4f27b6da7 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -83,25 +83,21 @@ typedef struct { /* Memory mapped registers */ typedef volatile struct { - uint64_t cap; - uint32_t vs; - uint32_t intms; - uint32_t intmc; - uint32_t cc; - uint32_t reserved0; - uint32_t csts; - uint32_t nssr; - uint32_t aqa; - uint64_t asq; - uint64_t acq; - uint32_t cmbloc; - uint32_t cmbsz; - uint8_t reserved1[0xec0]; - uint8_t cmd_set_specfic[0x100]; - uint32_t doorbells[]; + NvmeBar ctrl; + struct { + uint32_t sq_tail; + uint32_t cq_head; + } doorbells[]; } NVMeRegs; -QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); +#define INDEX_ADMIN 0 +#define INDEX_IO(n) (1 + n) + +/* This driver shares a single MSIX IRQ for the admin and I/O queues */ +enum { + MSIX_SHARED_IRQ_IDX = 0, + MSIX_IRQ_COUNT = 1 +}; struct BDRVNVMeState { AioContext *aio_context; @@ -117,7 +113,7 @@ struct BDRVNVMeState { /* How many uint32_t elements does each doorbell entry take. */ size_t doorbell_scale; bool write_cache_supported; - EventNotifier irq_notifier; + EventNotifier irq_notifier[MSIX_IRQ_COUNT]; uint64_t nsze; /* Namespace size reported by identify command */ int nsid; /* The namespace id to read/write data. */ @@ -162,21 +158,20 @@ static QemuOptsList runtime_opts = { }, }; -static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, +static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, int nentries, int entry_bytes, Error **errp) { - BDRVNVMeState *s = bs->opaque; size_t bytes; int r; bytes = ROUND_UP(nentries * entry_bytes, s->page_size); q->head = q->tail = 0; - q->queue = qemu_try_blockalign0(bs, bytes); - + q->queue = qemu_try_memalign(s->page_size, bytes); if (!q->queue) { error_setg(errp, "Cannot allocate queue"); return; } + memset(q->queue, 0, bytes); r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova); if (r) { error_setg(errp, "Cannot map queue"); @@ -206,23 +201,31 @@ static void nvme_free_req_queue_cb(void *opaque) qemu_mutex_unlock(&q->lock); } -static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, +static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, + AioContext *aio_context, int idx, int size, Error **errp) { int i, r; - BDRVNVMeState *s = bs->opaque; Error *local_err = NULL; - NVMeQueuePair *q = g_new0(NVMeQueuePair, 1); + NVMeQueuePair *q; uint64_t prp_list_iova; + q = g_try_new0(NVMeQueuePair, 1); + if (!q) { + return NULL; + } + q->prp_list_pages = qemu_try_memalign(s->page_size, + s->page_size * NVME_NUM_REQS); + if (!q->prp_list_pages) { + goto fail; + } + memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS); qemu_mutex_init(&q->lock); q->s = s; q->index = idx; qemu_co_queue_init(&q->free_req_queue); - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); - q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs), - nvme_process_completion_bh, q); + q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, s->page_size * NVME_NUM_REQS, false, &prp_list_iova); @@ -239,19 +242,19 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, req->prp_list_iova = prp_list_iova + i * s->page_size; } - nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); + nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); if (local_err) { error_propagate(errp, local_err); goto fail; } - q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale]; + q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail; - nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); + nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); if (local_err) { error_propagate(errp, local_err); goto fail; } - q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale]; + q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head; return q; fail: @@ -441,6 +444,9 @@ static void nvme_trace_command(const NvmeCmd *cmd) { int i; + if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) { + return; + } for (i = 0; i < 8; ++i) { uint8_t *cmdp = (uint8_t *)cmd + i * 8; trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], @@ -479,6 +485,7 @@ static void nvme_cmd_sync_cb(void *opaque, int ret) static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, NvmeCmd *cmd) { + AioContext *aio_context = bdrv_get_aio_context(bs); NVMeRequest *req; int ret = -EINPROGRESS; req = nvme_get_free_req(q); @@ -487,17 +494,18 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, } nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); - BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); + AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); return ret; } static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; - NvmeIdCtrl *idctrl; - NvmeIdNs *idns; + union { + NvmeIdCtrl ctrl; + NvmeIdNs ns; + } *id; NvmeLBAF *lbaf; - uint8_t *resp; uint16_t oncs; int r; uint64_t iova; @@ -506,54 +514,52 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) .cdw10 = cpu_to_le32(0x1), }; - resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl)); - if (!resp) { + id = qemu_try_memalign(s->page_size, sizeof(*id)); + if (!id) { error_setg(errp, "Cannot allocate buffer for identify response"); goto out; } - idctrl = (NvmeIdCtrl *)resp; - idns = (NvmeIdNs *)resp; - r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova); + r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova); if (r) { error_setg(errp, "Cannot map buffer for DMA"); goto out; } - cmd.dptr.prp1 = cpu_to_le64(iova); - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + memset(id, 0, sizeof(*id)); + cmd.dptr.prp1 = cpu_to_le64(iova); + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to identify controller"); goto out; } - if (le32_to_cpu(idctrl->nn) < namespace) { + if (le32_to_cpu(id->ctrl.nn) < namespace) { error_setg(errp, "Invalid namespace"); goto out; } - s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1; - s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size; + s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1; + s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size; /* For now the page list buffer per command is one page, to hold at most * s->page_size / sizeof(uint64_t) entries. */ s->max_transfer = MIN_NON_ZERO(s->max_transfer, s->page_size / sizeof(uint64_t) * s->page_size); - oncs = le16_to_cpu(idctrl->oncs); + oncs = le16_to_cpu(id->ctrl.oncs); s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); s->supports_discard = !!(oncs & NVME_ONCS_DSM); - memset(resp, 0, 4096); - + memset(id, 0, sizeof(*id)); cmd.cdw10 = 0; cmd.nsid = cpu_to_le32(namespace); - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { error_setg(errp, "Failed to identify namespace"); goto out; } - s->nsze = le64_to_cpu(idns->nsze); - lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)]; + s->nsze = le64_to_cpu(id->ns.nsze); + lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)]; - if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) && - NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) == + if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) && + NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) == NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) { bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP; } @@ -573,8 +579,34 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->blkshift = lbaf->ds; out: - qemu_vfio_dma_unmap(s->vfio, resp); - qemu_vfree(resp); + qemu_vfio_dma_unmap(s->vfio, id); + qemu_vfree(id); +} + +static bool nvme_poll_queue(NVMeQueuePair *q) +{ + bool progress = false; + + const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; + NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; + + /* + * Do an early check for completions. q->lock isn't needed because + * nvme_process_completion() only runs in the event loop thread and + * cannot race with itself. + */ + if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { + return false; + } + + qemu_mutex_lock(&q->lock); + while (nvme_process_completion(q)) { + /* Keep polling */ + progress = true; + } + qemu_mutex_unlock(&q->lock); + + return progress; } static bool nvme_poll_queues(BDRVNVMeState *s) @@ -583,32 +615,17 @@ static bool nvme_poll_queues(BDRVNVMeState *s) int i; for (i = 0; i < s->nr_queues; i++) { - NVMeQueuePair *q = s->queues[i]; - const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; - NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; - - /* - * Do an early check for completions. q->lock isn't needed because - * nvme_process_completion() only runs in the event loop thread and - * cannot race with itself. - */ - if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { - continue; - } - - qemu_mutex_lock(&q->lock); - while (nvme_process_completion(q)) { - /* Keep polling */ + if (nvme_poll_queue(s->queues[i])) { progress = true; } - qemu_mutex_unlock(&q->lock); } return progress; } static void nvme_handle_event(EventNotifier *n) { - BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier); + BDRVNVMeState *s = container_of(n, BDRVNVMeState, + irq_notifier[MSIX_SHARED_IRQ_IDX]); trace_nvme_handle_event(s); event_notifier_test_and_clear(n); @@ -623,7 +640,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) NvmeCmd cmd; int queue_size = NVME_QUEUE_SIZE; - q = nvme_create_queue_pair(bs, n, queue_size, errp); + q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), + n, queue_size, errp); if (!q) { return false; } @@ -633,10 +651,9 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x3), }; - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { - error_setg(errp, "Failed to create io queue [%d]", n); - nvme_free_queue_pair(q); - return false; + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + error_setg(errp, "Failed to create CQ io queue [%d]", n); + goto out_error; } cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_SQ, @@ -644,21 +661,24 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x1 | (n << 16)), }; - if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { - error_setg(errp, "Failed to create io queue [%d]", n); - nvme_free_queue_pair(q); - return false; + if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + error_setg(errp, "Failed to create SQ io queue [%d]", n); + goto out_error; } s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); s->queues[n] = q; s->nr_queues++; return true; +out_error: + nvme_free_queue_pair(q); + return false; } static bool nvme_poll_cb(void *opaque) { EventNotifier *e = opaque; - BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); + BDRVNVMeState *s = container_of(e, BDRVNVMeState, + irq_notifier[MSIX_SHARED_IRQ_IDX]); trace_nvme_poll_cb(s); return nvme_poll_queues(s); @@ -668,6 +688,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + AioContext *aio_context = bdrv_get_aio_context(bs); int ret; uint64_t cap; uint64_t timeout_ms; @@ -679,7 +700,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, s->device = g_strdup(device); s->nsid = namespace; s->aio_context = bdrv_get_aio_context(bs); - ret = event_notifier_init(&s->irq_notifier, 0); + ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); if (ret) { error_setg(errp, "Failed to init event notifier"); return ret; @@ -700,7 +721,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Perform initialize sequence as described in NVMe spec "7.6.1 * Initialization". */ - cap = le64_to_cpu(s->regs->cap); + cap = le64_to_cpu(s->regs->ctrl.cap); if (!(cap & (1ULL << 37))) { error_setg(errp, "Device doesn't support NVMe command set"); ret = -EINVAL; @@ -713,10 +734,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000); /* Reset device to get a clean state. */ - s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE); + s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE); /* Wait for CSTS.RDY = 0. */ - deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL; - while (le32_to_cpu(s->regs->csts) & 0x1) { + deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; + while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to reset (%" PRId64 " ms)", @@ -728,25 +749,27 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Set up admin queue. */ s->queues = g_new(NVMeQueuePair *, 1); - s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp); - if (!s->queues[0]) { + s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0, + NVME_QUEUE_SIZE, + errp); + if (!s->queues[INDEX_ADMIN]) { ret = -EINVAL; goto out; } s->nr_queues = 1; QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); - s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); - s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova); - s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova); + s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); + s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); + s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); /* After setting up all control registers we can enable device now. */ - s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | + s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | (ctz32(NVME_SQ_ENTRY_BYTES) << 16) | 0x1); /* Wait for CSTS.RDY = 1. */ now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); deadline = now + timeout_ms * 1000000; - while (!(le32_to_cpu(s->regs->csts) & 0x1)) { + while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to start (%" PRId64 " ms)", @@ -756,12 +779,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } } - ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier, + ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier, VFIO_PCI_MSIX_IRQ_INDEX, errp); if (ret) { goto out; } - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); nvme_identify(bs, namespace, &local_err); @@ -828,7 +852,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), }; - ret = nvme_cmd_sync(bs, s->queues[0], &cmd); + ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd); if (ret) { error_setg(errp, "Failed to configure NVMe write cache"); } @@ -844,9 +868,10 @@ static void nvme_close(BlockDriverState *bs) nvme_free_queue_pair(s->queues[i]); } g_free(s->queues); - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); - event_notifier_cleanup(&s->irq_notifier); + event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); qemu_vfio_close(s->vfio); @@ -1045,7 +1070,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, { int r; BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) | @@ -1124,7 +1149,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); } trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); - buf = qemu_try_blockalign(bs, bytes); + buf = qemu_try_memalign(s->page_size, bytes); if (!buf) { return -ENOMEM; @@ -1160,7 +1185,7 @@ static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, static coroutine_fn int nvme_co_flush(BlockDriverState *bs) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; NvmeCmd cmd = { .opcode = NVME_CMD_FLUSH, @@ -1191,7 +1216,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, BdrvRequestFlags flags) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; @@ -1244,7 +1269,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, int bytes) { BDRVNVMeState *s = bs->opaque; - NVMeQueuePair *ioq = s->queues[1]; + NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; NvmeDsmRange *buf; QEMUIOVector local_qiov; @@ -1268,11 +1293,11 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, assert(s->nr_queues > 1); - buf = qemu_try_blockalign0(bs, s->page_size); + buf = qemu_try_memalign(s->page_size, s->page_size); if (!buf) { return -ENOMEM; } - + memset(buf, 0, s->page_size); buf->nlb = cpu_to_le32(bytes >> s->blkshift); buf->slba = cpu_to_le64(offset >> s->blkshift); buf->cattr = 0; @@ -1353,7 +1378,8 @@ static void nvme_detach_aio_context(BlockDriverState *bs) q->completion_bh = NULL; } - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + aio_set_event_notifier(bdrv_get_aio_context(bs), + &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); } @@ -1363,7 +1389,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs, BDRVNVMeState *s = bs->opaque; s->aio_context = new_context; - aio_set_event_notifier(new_context, &s->irq_notifier, + aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); for (int i = 0; i < s->nr_queues; i++) { @@ -1387,7 +1413,7 @@ static void nvme_aio_unplug(BlockDriverState *bs) BDRVNVMeState *s = bs->opaque; assert(s->plugged); s->plugged = false; - for (i = 1; i < s->nr_queues; i++) { + for (i = INDEX_IO(0); i < s->nr_queues; i++) { NVMeQueuePair *q = s->queues[i]; qemu_mutex_lock(&q->lock); nvme_kick(q); diff --git a/block/qapi.c b/block/qapi.c index afd9f3b4a7..f423ece98c 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -47,7 +47,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, Error **errp) { ImageInfo **p_image_info; - BlockDriverState *bs0; + BlockDriverState *bs0, *backing; BlockDeviceInfo *info; if (!bs->drv) { @@ -76,9 +76,10 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, info->node_name = g_strdup(bs->node_name); } - if (bs->backing_file[0]) { + backing = bdrv_cow_bs(bs); + if (backing) { info->has_backing_file = true; - info->backing_file = g_strdup(bs->backing_file); + info->backing_file = g_strdup(backing->filename); } if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { @@ -163,9 +164,13 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, break; } - if (bs0->drv && bs0->backing) { + if (bs0->drv && bdrv_filter_or_cow_child(bs0)) { + /* + * Put any filtered child here (for backwards compatibility to when + * we put bs0->backing here, which might be any filtered child). + */ info->backing_file_depth++; - bs0 = bs0->backing->bs; + bs0 = bdrv_filter_or_cow_bs(bs0); (*p_image_info)->has_backing_image = true; p_image_info = &((*p_image_info)->backing_image); } else { @@ -174,9 +179,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, /* Skip automatically inserted nodes that the user isn't aware of for * query-block (blk != NULL), but not for query-named-block-nodes */ - while (blk && bs0->drv && bs0->implicit) { - bs0 = backing_bs(bs0); - assert(bs0); + if (blk) { + bs0 = bdrv_skip_implicit_filters(bs0); } } @@ -288,7 +292,7 @@ void bdrv_query_image_info(BlockDriverState *bs, info->virtual_size = size; info->actual_size = bdrv_get_allocated_file_size(bs); info->has_actual_size = info->actual_size >= 0; - if (bdrv_is_encrypted(bs)) { + if (bs->encrypted) { info->encrypted = true; info->has_encrypted = true; } @@ -311,6 +315,7 @@ void bdrv_query_image_info(BlockDriverState *bs, backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { char *backing_filename2; + info->backing_filename = g_strdup(backing_filename); info->has_backing_filename = true; backing_filename2 = bdrv_get_full_backing_filename(bs, NULL); @@ -362,9 +367,7 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, char *qdev; /* Skip automatically inserted nodes that the user isn't aware of */ - while (bs && bs->drv && bs->implicit) { - bs = backing_bs(bs); - } + bs = bdrv_skip_implicit_filters(bs); info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); @@ -526,6 +529,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, bool blk_level) { + BdrvChild *parent_child; + BlockDriverState *filter_or_cow_bs; BlockStats *s = NULL; s = g_malloc0(sizeof(*s)); @@ -538,9 +543,8 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, /* Skip automatically inserted nodes that the user isn't aware of in * a BlockBackend-level command. Stay at the exact node for a node-level * command. */ - while (blk_level && bs->drv && bs->implicit) { - bs = backing_bs(bs); - assert(bs); + if (blk_level) { + bs = bdrv_skip_implicit_filters(bs); } if (bdrv_get_node_name(bs)[0]) { @@ -555,14 +559,46 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs, s->has_driver_specific = true; } - if (bs->file) { + parent_child = bdrv_primary_child(bs); + if (!parent_child || + !(parent_child->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED))) + { + BdrvChild *c; + + /* + * Look for a unique data-storing child. We do not need to look for + * filtered children, as there would be only one and it would have been + * the primary child. + */ + parent_child = NULL; + QLIST_FOREACH(c, &bs->children, next) { + if (c->role & BDRV_CHILD_DATA) { + if (parent_child) { + /* + * There are multiple data-storing children and we cannot + * choose between them. + */ + parent_child = NULL; + break; + } + parent_child = c; + } + } + } + if (parent_child) { s->has_parent = true; - s->parent = bdrv_query_bds_stats(bs->file->bs, blk_level); + s->parent = bdrv_query_bds_stats(parent_child->bs, blk_level); } - if (blk_level && bs->backing) { + filter_or_cow_bs = bdrv_filter_or_cow_bs(bs); + if (blk_level && filter_or_cow_bs) { + /* + * Put any filtered or COW child here (for backwards + * compatibility to when we put bs0->backing here, which might + * be either) + */ s->has_backing = true; - s->backing = bdrv_query_bds_stats(bs->backing->bs, blk_level); + s->backing = bdrv_query_bds_stats(filter_or_cow_bs, blk_level); } return s; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 996b3314f4..fcdf7af8e6 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1320,6 +1320,7 @@ static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry) if (l2_entry & QCOW_OFLAG_COPIED) { return false; } + /* fallthrough */ case QCOW2_CLUSTER_UNALLOCATED: case QCOW2_CLUSTER_COMPRESSED: case QCOW2_CLUSTER_ZERO_PLAIN: diff --git a/block/snapshot.c b/block/snapshot.c index bd9fb01817..a2bf3a54eb 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -147,6 +147,56 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, return ret; } +/** + * Return a pointer to the child BDS pointer to which we can fall + * back if the given BDS does not support snapshots. + * Return NULL if there is no BDS to (safely) fall back to. + * + * We need to return an indirect pointer because bdrv_snapshot_goto() + * has to modify the BdrvChild pointer. + */ +static BdrvChild **bdrv_snapshot_fallback_ptr(BlockDriverState *bs) +{ + BdrvChild **fallback; + BdrvChild *child; + + /* + * The only BdrvChild pointers that are safe to modify (and which + * we can thus return a reference to) are bs->file and + * bs->backing. + */ + fallback = &bs->file; + if (!*fallback && bs->drv && bs->drv->is_filter) { + fallback = &bs->backing; + } + + if (!*fallback) { + return NULL; + } + + /* + * Check that there are no other children that would need to be + * snapshotted. If there are, it is not safe to fall back to + * *fallback. + */ + QLIST_FOREACH(child, &bs->children, next) { + if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA | + BDRV_CHILD_FILTERED) && + child != *fallback) + { + return NULL; + } + } + + return fallback; +} + +static BlockDriverState *bdrv_snapshot_fallback(BlockDriverState *bs) +{ + BdrvChild **child_ptr = bdrv_snapshot_fallback_ptr(bs); + return child_ptr ? (*child_ptr)->bs : NULL; +} + int bdrv_can_snapshot(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -155,8 +205,9 @@ int bdrv_can_snapshot(BlockDriverState *bs) } if (!drv->bdrv_snapshot_create) { - if (bs->file != NULL) { - return bdrv_can_snapshot(bs->file->bs); + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); + if (fallback_bs) { + return bdrv_can_snapshot(fallback_bs); } return 0; } @@ -168,14 +219,15 @@ int bdrv_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); if (!drv) { return -ENOMEDIUM; } if (drv->bdrv_snapshot_create) { return drv->bdrv_snapshot_create(bs, sn_info); } - if (bs->file) { - return bdrv_snapshot_create(bs->file->bs, sn_info); + if (fallback_bs) { + return bdrv_snapshot_create(fallback_bs, sn_info); } return -ENOTSUP; } @@ -185,6 +237,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BdrvChild **fallback_ptr; int ret, open_ret; if (!drv) { @@ -205,39 +258,46 @@ int bdrv_snapshot_goto(BlockDriverState *bs, return ret; } - if (bs->file) { - BlockDriverState *file; - QDict *options = qdict_clone_shallow(bs->options); + fallback_ptr = bdrv_snapshot_fallback_ptr(bs); + if (fallback_ptr) { + QDict *options; QDict *file_options; Error *local_err = NULL; + BlockDriverState *fallback_bs = (*fallback_ptr)->bs; + char *subqdict_prefix = g_strdup_printf("%s.", (*fallback_ptr)->name); + + options = qdict_clone_shallow(bs->options); - file = bs->file->bs; /* Prevent it from getting deleted when detached from bs */ - bdrv_ref(file); + bdrv_ref(fallback_bs); - qdict_extract_subqdict(options, &file_options, "file."); + qdict_extract_subqdict(options, &file_options, subqdict_prefix); qobject_unref(file_options); - qdict_put_str(options, "file", bdrv_get_node_name(file)); + g_free(subqdict_prefix); + + qdict_put_str(options, (*fallback_ptr)->name, + bdrv_get_node_name(fallback_bs)); if (drv->bdrv_close) { drv->bdrv_close(bs); } - bdrv_unref_child(bs, bs->file); - bs->file = NULL; - ret = bdrv_snapshot_goto(file, snapshot_id, errp); + bdrv_unref_child(bs, *fallback_ptr); + *fallback_ptr = NULL; + + ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp); open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err); qobject_unref(options); if (open_ret < 0) { - bdrv_unref(file); + bdrv_unref(fallback_bs); bs->drv = NULL; /* A bdrv_snapshot_goto() error takes precedence */ error_propagate(errp, local_err); return ret < 0 ? ret : open_ret; } - assert(bs->file->bs == file); - bdrv_unref(file); + assert(fallback_bs == (*fallback_ptr)->bs); + bdrv_unref(fallback_bs); return ret; } @@ -273,6 +333,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); int ret; if (!drv) { @@ -289,8 +350,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (drv->bdrv_snapshot_delete) { ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); - } else if (bs->file) { - ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp); + } else if (fallback_bs) { + ret = bdrv_snapshot_delete(fallback_bs, snapshot_id, name, errp); } else { error_setg(errp, "Block format '%s' used by device '%s' " "does not support internal snapshot deletion", @@ -306,14 +367,15 @@ int bdrv_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_info) { BlockDriver *drv = bs->drv; + BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs); if (!drv) { return -ENOMEDIUM; } if (drv->bdrv_snapshot_list) { return drv->bdrv_snapshot_list(bs, psn_info); } - if (bs->file) { - return bdrv_snapshot_list(bs->file->bs, psn_info); + if (fallback_bs) { + return bdrv_snapshot_list(fallback_bs, psn_info); } return -ENOTSUP; } diff --git a/block/stream.c b/block/stream.c index 310ccbaa4c..8ce6729a33 100644 --- a/block/stream.c +++ b/block/stream.c @@ -31,7 +31,8 @@ enum { typedef struct StreamBlockJob { BlockJob common; - BlockDriverState *bottom; + BlockDriverState *base_overlay; /* COW overlay (stream from this) */ + BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; @@ -53,7 +54,7 @@ static void stream_abort(Job *job) if (s->chain_frozen) { BlockJob *bjob = &s->common; - bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->bottom); + bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); } } @@ -62,14 +63,15 @@ static int stream_prepare(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; BlockDriverState *bs = blk_bs(bjob->blk); - BlockDriverState *base = backing_bs(s->bottom); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); + BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); Error *local_err = NULL; int ret = 0; - bdrv_unfreeze_backing_chain(bs, s->bottom); + bdrv_unfreeze_backing_chain(bs, s->above_base); s->chain_frozen = false; - if (bs->backing) { + if (bdrv_cow_child(unfiltered_bs)) { const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; @@ -77,8 +79,8 @@ static int stream_prepare(Job *job) base_fmt = base->drv->format_name; } } - bdrv_set_backing_hd(bs, base, &local_err); - ret = bdrv_change_backing_file(bs, base_id, base_fmt, false); + bdrv_set_backing_hd(unfiltered_bs, base, &local_err); + ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false); if (local_err) { error_report_err(local_err); return -EPERM; @@ -109,14 +111,15 @@ static int coroutine_fn stream_run(Job *job, Error **errp) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; BlockDriverState *bs = blk_bs(blk); - bool enable_cor = !backing_bs(s->bottom); + BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); + bool enable_cor = !bdrv_cow_child(s->base_overlay); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; int error = 0; int64_t n = 0; /* bytes */ - if (bs == s->bottom) { + if (unfiltered_bs == s->base_overlay) { /* Nothing to stream */ return 0; } @@ -150,13 +153,14 @@ static int coroutine_fn stream_run(Job *job, Error **errp) copy = false; - ret = bdrv_is_allocated(bs, offset, STREAM_CHUNK, &n); + ret = bdrv_is_allocated(unfiltered_bs, offset, STREAM_CHUNK, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ - ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true, + ret = bdrv_is_allocated_above(bdrv_cow_bs(unfiltered_bs), + s->base_overlay, true, offset, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { @@ -223,9 +227,29 @@ void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *iter; bool bs_read_only; int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; - BlockDriverState *bottom = bdrv_find_overlay(bs, base); + BlockDriverState *base_overlay = bdrv_find_overlay(bs, base); + BlockDriverState *above_base; - if (bdrv_freeze_backing_chain(bs, bottom, errp) < 0) { + if (!base_overlay) { + error_setg(errp, "'%s' is not in the backing chain of '%s'", + base->node_name, bs->node_name); + return; + } + + /* + * Find the node directly above @base. @base_overlay is a COW overlay, so + * it must have a bdrv_cow_child(), but it is the immediate overlay of + * @base, so between the two there can only be filters. + */ + above_base = base_overlay; + if (bdrv_cow_bs(above_base) != base) { + above_base = bdrv_cow_bs(above_base); + while (bdrv_filter_bs(above_base) != base) { + above_base = bdrv_filter_bs(above_base); + } + } + + if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) { return; } @@ -255,14 +279,19 @@ void stream_start(const char *job_id, BlockDriverState *bs, * and resizes. Reassign the base node pointer because the backing BS of the * bottom node might change after the call to bdrv_reopen_set_read_only() * due to parallel block jobs running. + * above_base node might change after the call to + * bdrv_reopen_set_read_only() due to parallel block jobs running. */ - base = backing_bs(bottom); - for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) { + base = bdrv_filter_or_cow_bs(above_base); + for (iter = bdrv_filter_or_cow_bs(bs); iter != base; + iter = bdrv_filter_or_cow_bs(iter)) + { block_job_add_bdrv(&s->common, "intermediate node", iter, 0, basic_flags, &error_abort); } - s->bottom = bottom; + s->base_overlay = base_overlay; + s->above_base = above_base; s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; @@ -276,5 +305,5 @@ fail: if (bs_read_only) { bdrv_reopen_set_read_only(bs, true, NULL); } - bdrv_unfreeze_backing_chain(bs, bottom); + bdrv_unfreeze_backing_chain(bs, above_base); } diff --git a/block/throttle.c b/block/throttle.c index 1c1ac57bee..9a0f38149a 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -151,6 +151,15 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, return bdrv_co_pdiscard(bs->file, offset, bytes); } +static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ + return throttle_co_pwritev(bs, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + static int throttle_co_flush(BlockDriverState *bs) { return bdrv_co_flush(bs->file->bs); @@ -243,6 +252,7 @@ static BlockDriver bdrv_throttle = { .bdrv_co_pwrite_zeroes = throttle_co_pwrite_zeroes, .bdrv_co_pdiscard = throttle_co_pdiscard, + .bdrv_co_pwritev_compressed = throttle_co_pwritev_compressed, .bdrv_attach_aio_context = throttle_attach_aio_context, .bdrv_detach_aio_context = throttle_detach_aio_context, @@ -250,7 +260,6 @@ static BlockDriver bdrv_throttle = { .bdrv_reopen_prepare = throttle_reopen_prepare, .bdrv_reopen_commit = throttle_reopen_commit, .bdrv_reopen_abort = throttle_reopen_abort, - .bdrv_co_block_status = bdrv_co_block_status_from_file, .bdrv_co_drain_begin = throttle_co_drain_begin, .bdrv_co_drain_end = throttle_co_drain_end, diff --git a/block/vmdk.c b/block/vmdk.c index d90855446a..8ec62c7ab7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2803,21 +2803,6 @@ static void vmdk_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i, err; - int ret = 0; - - for (i = 0; i < s->num_extents; i++) { - err = bdrv_co_flush(s->extents[i].file->bs); - if (err < 0) { - ret = err; - } - } - return ret; -} - static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) { int i; @@ -3081,7 +3066,6 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_co_create_opts = vmdk_co_create_opts, .bdrv_co_create = vmdk_co_create, - .bdrv_co_flush_to_disk = vmdk_co_flush, .bdrv_co_block_status = vmdk_co_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, diff --git a/blockdev.c b/blockdev.c index 3848a9c8ab..7f2561081e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1562,7 +1562,12 @@ static void external_snapshot_prepare(BlkActionState *common, goto out; } - if (state->new_bs->backing != NULL) { + if (state->new_bs->drv->is_filter) { + error_setg(errp, "Filters cannot be used as overlays"); + goto out; + } + + if (bdrv_cow_child(state->new_bs)) { error_setg(errp, "The overlay already has a backing image"); goto out; } @@ -1736,7 +1741,13 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) * on top of. */ if (backup->sync == MIRROR_SYNC_MODE_TOP) { - source = backing_bs(bs); + /* + * Backup will not replace the source by the target, so none + * of the filters skipped here will be removed (in contrast to + * mirror). Therefore, we can skip all of them when looking + * for the first COW relationship. + */ + source = bdrv_cow_bs(bdrv_skip_filters(bs)); if (!source) { backup->sync = MIRROR_SYNC_MODE_FULL; } @@ -1756,9 +1767,14 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) if (backup->mode != NEW_IMAGE_MODE_EXISTING) { assert(backup->format); if (source) { - bdrv_refresh_filename(source); - bdrv_img_create(backup->target, backup->format, source->filename, - source->drv->format_name, NULL, + /* Implicit filters should not appear in the filename */ + BlockDriverState *explicit_backing = + bdrv_skip_implicit_filters(source); + + bdrv_refresh_filename(explicit_backing); + bdrv_img_create(backup->target, backup->format, + explicit_backing->filename, + explicit_backing->drv->format_name, NULL, size, flags, false, &local_err); } else { bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL, @@ -2528,7 +2544,9 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } /* Check for op blockers in the whole chain between bs and base */ - for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) { + for (iter = bs; iter && iter != base_bs; + iter = bdrv_filter_or_cow_bs(iter)) + { if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) { goto out; } @@ -2584,6 +2602,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, AioContext *aio_context; Error *local_err = NULL; int job_flags = JOB_DEFAULT; + uint64_t top_perm, top_shared; if (!has_speed) { speed = 0; @@ -2685,7 +2704,9 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, assert(bdrv_get_aio_context(base_bs) == aio_context); - for (iter = top_bs; iter != backing_bs(base_bs); iter = backing_bs(iter)) { + for (iter = top_bs; iter != bdrv_filter_or_cow_bs(base_bs); + iter = bdrv_filter_or_cow_bs(iter)) + { if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) { goto out; } @@ -2697,14 +2718,38 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, goto out; } - if (top_bs == bs) { + /* + * Active commit is required if and only if someone has taken a + * WRITE permission on the top node. Historically, we have always + * used active commit for top nodes, so continue that practice + * lest we possibly break clients that rely on this behavior, e.g. + * to later attach this node to a writing parent. + * (Active commit is never really wrong.) + */ + bdrv_get_cumulative_perm(top_bs, &top_perm, &top_shared); + if (top_perm & BLK_PERM_WRITE || + bdrv_skip_filters(top_bs) == bdrv_skip_filters(bs)) + { if (has_backing_file) { - error_setg(errp, "'backing-file' specified," - " but 'top' is the active layer"); + if (bdrv_skip_filters(top_bs) == bdrv_skip_filters(bs)) { + error_setg(errp, "'backing-file' specified," + " but 'top' is the active layer"); + } else { + error_setg(errp, "'backing-file' specified, but 'top' has a " + "writer on it"); + } goto out; } - commit_active_start(has_job_id ? job_id : NULL, bs, base_bs, - job_flags, speed, on_error, + if (!has_job_id) { + /* + * Emulate here what block_job_create() does, because it + * is possible that @bs != @top_bs (the block job should + * be named after @bs, even if @top_bs is the actual + * source) + */ + job_id = bdrv_get_device_name(bs); + } + commit_active_start(job_id, top_bs, base_bs, job_flags, speed, on_error, filter_node_name, NULL, NULL, false, &local_err); } else { BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs); @@ -2892,6 +2937,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, bool has_auto_dismiss, bool auto_dismiss, Error **errp) { + BlockDriverState *unfiltered_bs; int job_flags = JOB_DEFAULT; if (!has_speed) { @@ -2943,10 +2989,19 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, return; } - if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) { + if (!bdrv_backing_chain_next(bs) && sync == MIRROR_SYNC_MODE_TOP) { sync = MIRROR_SYNC_MODE_FULL; } + if (!has_replaces) { + /* We want to mirror from @bs, but keep implicit filters on top */ + unfiltered_bs = bdrv_skip_implicit_filters(bs); + if (unfiltered_bs != bs) { + replaces = unfiltered_bs->node_name; + has_replaces = true; + } + } + if (has_replaces) { BlockDriverState *to_replace_bs; AioContext *replace_aio_context; @@ -2993,7 +3048,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, void qmp_drive_mirror(DriveMirror *arg, Error **errp) { BlockDriverState *bs; - BlockDriverState *source, *target_bs; + BlockDriverState *target_backing_bs, *target_bs; AioContext *aio_context; AioContext *old_context; BlockMirrorBackingMode backing_mode; @@ -3028,12 +3083,12 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) } flags = bs->open_flags | BDRV_O_RDWR; - source = backing_bs(bs); - if (!source && arg->sync == MIRROR_SYNC_MODE_TOP) { + target_backing_bs = bdrv_cow_bs(bdrv_skip_filters(bs)); + if (!target_backing_bs && arg->sync == MIRROR_SYNC_MODE_TOP) { arg->sync = MIRROR_SYNC_MODE_FULL; } if (arg->sync == MIRROR_SYNC_MODE_NONE) { - source = bs; + target_backing_bs = bs; } size = bdrv_getlength(bs); @@ -3059,7 +3114,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) /* Don't open backing image in create() */ flags |= BDRV_O_NO_BACKING; - if ((arg->sync == MIRROR_SYNC_MODE_FULL || !source) + if ((arg->sync == MIRROR_SYNC_MODE_FULL || !target_backing_bs) && arg->mode != NEW_IMAGE_MODE_EXISTING) { /* create new image w/o backing file */ @@ -3067,15 +3122,19 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) bdrv_img_create(arg->target, format, NULL, NULL, NULL, size, flags, false, &local_err); } else { + /* Implicit filters should not appear in the filename */ + BlockDriverState *explicit_backing = + bdrv_skip_implicit_filters(target_backing_bs); + switch (arg->mode) { case NEW_IMAGE_MODE_EXISTING: break; case NEW_IMAGE_MODE_ABSOLUTE_PATHS: /* create new image with backing file */ - bdrv_refresh_filename(source); + bdrv_refresh_filename(explicit_backing); bdrv_img_create(arg->target, format, - source->filename, - source->drv->format_name, + explicit_backing->filename, + explicit_backing->drv->format_name, NULL, size, flags, false, &local_err); break; default: diff --git a/include/block/block.h b/include/block/block.h index 6e36154061..981ab5b314 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -532,7 +532,7 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it); void bdrv_next_cleanup(BdrvNextIterator *it); BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); -bool bdrv_is_encrypted(BlockDriverState *bs); +bool bdrv_supports_compressed_writes(BlockDriverState *bs); void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque, bool read_only); const char *bdrv_get_node_name(const BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index 9da7a42927..38cad9d15c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -92,9 +92,17 @@ struct BlockDriver { int instance_size; /* set to true if the BlockDriver is a block filter. Block filters pass - * certain callbacks that refer to data (see block.c) to their bs->file if - * the driver doesn't implement them. Drivers that do not wish to forward - * must implement them and return -ENOTSUP. + * certain callbacks that refer to data (see block.c) to their bs->file + * or bs->backing (whichever one exists) if the driver doesn't implement + * them. Drivers that do not wish to forward must implement them and return + * -ENOTSUP. + * Note that filters are not allowed to modify data. + * + * Filters generally cannot have more than a single filtered child, + * because the data they present must at all times be the same as + * that on their filtered child. That would be impossible to + * achieve for multiple filtered children. + * (And this filtered child must then be bs->file or bs->backing.) */ bool is_filter; /* @@ -839,11 +847,20 @@ struct BlockDriverState { bool walking_aio_notifiers; /* to make removal during iteration safe */ char filename[PATH_MAX]; - char backing_file[PATH_MAX]; /* if non zero, the image is a diff of - this file image */ - /* The backing filename indicated by the image header; if we ever - * open this file, then this is replaced by the resulting BDS's - * filename (i.e. after a bdrv_refresh_filename() run). */ + /* + * If not empty, this image is a diff in relation to backing_file. + * Note that this is the name given in the image header and + * therefore may or may not be equal to .backing->bs->filename. + * If this field contains a relative path, it is to be resolved + * relatively to the overlay's location. + */ + char backing_file[PATH_MAX]; + /* + * The backing filename indicated by the image header. Contrary + * to backing_file, if we ever open this file, auto_backing_file + * is replaced by the resulting BDS's filename (i.e. after a + * bdrv_refresh_filename() run). + */ char auto_backing_file[PATH_MAX]; char backing_format[16]; /* if non-zero and backing_file exists */ @@ -995,11 +1012,6 @@ typedef enum BlockMirrorBackingMode { MIRROR_LEAVE_BACKING_CHAIN, } BlockMirrorBackingMode; -static inline BlockDriverState *backing_bs(BlockDriverState *bs) -{ - return bs->backing ? bs->backing->bs : NULL; -} - /* Essential block drivers which must always be statically linked into qemu, and * which therefore can be accessed without using bdrv_find_format() */ @@ -1050,6 +1062,8 @@ BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix, QDict *options); +bool bdrv_backing_overridden(BlockDriverState *bs); + /** * bdrv_add_before_write_notifier: @@ -1300,28 +1314,6 @@ void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared); -/* - * Default implementation for drivers to pass bdrv_co_block_status() to - * their file. - */ -int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file); -/* - * Default implementation for drivers to pass bdrv_co_block_status() to - * their backing file. - */ -int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file); const char *bdrv_get_parent_name(const BlockDriverState *bs); void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp); bool blk_dev_has_removable_media(BlockBackend *blk); @@ -1382,4 +1374,37 @@ BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, BlockDriverState **bitmap_bs, Error **errp); +BdrvChild *bdrv_cow_child(BlockDriverState *bs); +BdrvChild *bdrv_filter_child(BlockDriverState *bs); +BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs); +BdrvChild *bdrv_primary_child(BlockDriverState *bs); +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs); +BlockDriverState *bdrv_skip_filters(BlockDriverState *bs); +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs); + +static inline BlockDriverState *child_bs(BdrvChild *child) +{ + return child ? child->bs : NULL; +} + +static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs) +{ + return child_bs(bdrv_cow_child(bs)); +} + +static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs) +{ + return child_bs(bdrv_filter_child(bs)); +} + +static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs) +{ + return child_bs(bdrv_filter_or_cow_child(bs)); +} + +static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) +{ + return child_bs(bdrv_primary_child(bs)); +} + #endif /* BLOCK_INT_H */ diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index 549e14daba..5bef793ac0 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -615,13 +615,7 @@ static int init_dirty_bitmap_migration(DBMSaveState *s) while (bs && bs->drv && bs->drv->is_filter && !bdrv_has_named_bitmaps(bs)) { - if (bs->backing) { - bs = bs->backing->bs; - } else if (bs->file) { - bs = bs->file->bs; - } else { - bs = NULL; - } + bs = bdrv_filter_bs(bs); } if (bs && bs->drv && !bs->drv->is_filter) { diff --git a/nbd/server.c b/nbd/server.c index c5d71cff10..982de67816 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1567,13 +1567,13 @@ NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset, if (bitmap) { BdrvDirtyBitmap *bm = NULL; - while (true) { + while (bs) { bm = bdrv_find_dirty_bitmap(bs, bitmap); - if (bm != NULL || bs->backing == NULL) { + if (bm != NULL) { break; } - bs = bs->backing->bs; + bs = bdrv_filter_or_cow_bs(bs); } if (bm == NULL) { diff --git a/qapi/block-core.json b/qapi/block-core.json index c5ac22d246..2d94873ca0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1569,6 +1569,18 @@ # Live commit of data from overlay image nodes into backing nodes - i.e., # writes data between 'top' and 'base' into 'base'. # +# If top == base, that is an error. +# If top has no overlays on top of it, or if it is in use by a writer, +# the job will not be completed by itself. The user needs to complete +# the job with the block-job-complete command after getting the ready +# event. (Since 2.0) +# +# If the base image is smaller than top, then the base image will be +# resized to be the same size as top. If top is smaller than the base +# image, the base will not be truncated. If you want the base image +# size to match the size of the smaller top, you can safely truncate +# it yourself once the commit operation successfully completes. +# # @job-id: identifier for the newly-created block job. If # omitted, the device name will be used. (Since 2.7) # @@ -1593,14 +1605,15 @@ # accepted # # @backing-file: The backing file string to write into the overlay -# image of 'top'. If 'top' is the active layer, -# specifying a backing file string is an error. This -# filename is not validated. +# image of 'top'. If 'top' does not have an overlay +# image, or if 'top' is in use by a writer, specifying +# a backing file string is an error. # -# If a pathname string is such that it cannot be -# resolved by QEMU, that means that subsequent QMP or -# HMP commands must use node-names for the image in -# question, as filename lookup methods will fail. +# This filename is not validated. If a pathname string +# is such that it cannot be resolved by QEMU, that +# means that subsequent QMP or HMP commands must use +# node-names for the image in question, as filename +# lookup methods will fail. # # If not specified, QEMU will automatically determine # the backing file string to use, or error out if @@ -1609,18 +1622,6 @@ # filename or protocol. # (Since 2.1) # -# If top == base, that is an error. -# If top == active, the job will not be completed by itself, -# user needs to complete the job with the block-job-complete -# command after getting the ready event. (Since 2.0) -# -# If the base image is smaller than top, then the base image -# will be resized to be the same size as top. If top is -# smaller than the base image, the base will not be -# truncated. If you want the base image size to match the -# size of the smaller top, you can safely truncate it -# yourself once the commit operation successfully completes. -# # @speed: the maximum speed, in bytes per second # # @on-error: the action to take on an error. 'ignore' means that the request @@ -1948,7 +1949,8 @@ # # @replaces: with sync=full graph node name to be replaced by the new # image when a whole image copy is done. This can be used to repair -# broken Quorum files. (Since 2.1) +# broken Quorum files. By default, @device is replaced, although +# implicitly created filters on it are kept. (Since 2.1) # # @mode: whether and how QEMU should create a new image, default is # 'absolute-paths'. @@ -2259,7 +2261,8 @@ # # @replaces: with sync=full graph node name to be replaced by the new # image when a whole image copy is done. This can be used to repair -# broken Quorum files. +# broken Quorum files. By default, @device is replaced, although +# implicitly created filters on it are kept. # # @speed: the maximum speed, in bytes per second # @@ -2484,13 +2487,20 @@ # of 'device'. # # If a base file is specified then sectors are not copied from that base file and -# its backing chain. When streaming completes the image file will have the base -# file as its backing file. This can be used to stream a subset of the backing -# file chain instead of flattening the entire image. +# its backing chain. This can be used to stream a subset of the backing file +# chain instead of flattening the entire image. +# When streaming completes the image file will have the base file as its backing +# file, unless that node was changed while the job was running. In that case, +# base's parent's backing (or filtered, whichever exists) child (i.e., base at +# the beginning of the job) will be the new backing file. # # On successful completion the image file is updated to drop the backing file # and the BLOCK_JOB_COMPLETED event is emitted. # +# In case @device is a filter node, block-stream modifies the first non-filter +# overlay node below it to point to the new backing node instead of modifying +# @device itself. +# # @job-id: identifier for the newly-created block job. If # omitted, the device name will be used. (Since 2.7) # diff --git a/qemu-img.c b/qemu-img.c index eb2fc1f862..d0b1c97562 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1085,7 +1085,7 @@ static int img_commit(int argc, char **argv) /* This is different from QMP, which by default uses the deepest file in * the backing chain (i.e., the very base); however, the traditional * behavior of qemu-img commit is using the immediate backing file. */ - base_bs = backing_bs(bs); + base_bs = bdrv_backing_chain_next(bs); if (!base_bs) { error_setg(&local_err, "Image does not have a backing file"); goto done; @@ -1732,18 +1732,20 @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) if (s->sector_next_status <= sector_num) { uint64_t offset = (sector_num - src_cur_offset) * BDRV_SECTOR_SIZE; int64_t count; + BlockDriverState *src_bs = blk_bs(s->src[src_cur]); + BlockDriverState *base; + + if (s->target_has_backing) { + base = bdrv_cow_bs(bdrv_skip_filters(src_bs)); + } else { + base = NULL; + } do { count = n * BDRV_SECTOR_SIZE; - if (s->target_has_backing) { - ret = bdrv_block_status(blk_bs(s->src[src_cur]), offset, - count, &count, NULL, NULL); - } else { - ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL, - offset, count, &count, NULL, - NULL); - } + ret = bdrv_block_status_above(src_bs, base, offset, count, &count, + NULL, NULL); if (ret < 0) { if (s->salvage) { @@ -2665,7 +2667,8 @@ static int img_convert(int argc, char **argv) * s.target_backing_sectors has to be negative, which it will * be automatically). The backing file length is used only * for optimizations, so such a case is not fatal. */ - s.target_backing_sectors = bdrv_nb_sectors(out_bs->backing->bs); + s.target_backing_sectors = + bdrv_nb_sectors(bdrv_backing_chain_next(out_bs)); } else { s.target_backing_sectors = -1; } @@ -3035,6 +3038,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, depth = 0; for (;;) { + bs = bdrv_skip_filters(bs); ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file); if (ret < 0) { return ret; @@ -3043,7 +3047,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) { break; } - bs = backing_bs(bs); + bs = bdrv_cow_bs(bs); if (bs == NULL) { ret = 0; break; @@ -3425,6 +3429,7 @@ static int img_rebase(int argc, char **argv) uint8_t *buf_old = NULL; uint8_t *buf_new = NULL; BlockDriverState *bs = NULL, *prefix_chain_bs = NULL; + BlockDriverState *unfiltered_bs; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; int c, flags, src_flags, ret; @@ -3559,6 +3564,8 @@ static int img_rebase(int argc, char **argv) } bs = blk_bs(blk); + unfiltered_bs = bdrv_skip_filters(bs); + if (out_basefmt != NULL) { if (bdrv_find_format(out_basefmt) == NULL) { error_report("Invalid format name: '%s'", out_basefmt); @@ -3570,7 +3577,7 @@ static int img_rebase(int argc, char **argv) /* For safe rebasing we need to compare old and new backing file */ if (!unsafe) { QDict *options = NULL; - BlockDriverState *base_bs = backing_bs(bs); + BlockDriverState *base_bs = bdrv_cow_bs(unfiltered_bs); if (base_bs) { blk_old_backing = blk_new(qemu_get_aio_context(), @@ -3711,7 +3718,7 @@ static int img_rebase(int argc, char **argv) n = MIN(IO_BUF_SIZE, size - offset); /* If the cluster is allocated, we don't need to take action */ - ret = bdrv_is_allocated(bs, offset, n, &n); + ret = bdrv_is_allocated(unfiltered_bs, offset, n, &n); if (ret < 0) { error_report("error while reading image metadata: %s", strerror(-ret)); @@ -3726,8 +3733,9 @@ static int img_rebase(int argc, char **argv) * If cluster wasn't changed since prefix_chain, we don't need * to take action */ - ret = bdrv_is_allocated_above(backing_bs(bs), prefix_chain_bs, - false, offset, n, &n); + ret = bdrv_is_allocated_above(bdrv_cow_bs(unfiltered_bs), + prefix_chain_bs, false, + offset, n, &n); if (ret < 0) { error_report("error while reading image metadata: %s", strerror(-ret)); @@ -3805,9 +3813,10 @@ static int img_rebase(int argc, char **argv) * doesn't change when we switch the backing file. */ if (out_baseimg && *out_baseimg) { - ret = bdrv_change_backing_file(bs, out_baseimg, out_basefmt, true); + ret = bdrv_change_backing_file(unfiltered_bs, out_baseimg, out_basefmt, + true); } else { - ret = bdrv_change_backing_file(bs, NULL, NULL, false); + ret = bdrv_change_backing_file(unfiltered_bs, NULL, NULL, false); } if (ret == -ENOSPC) { diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020 index a0782937b0..596505be2d 100755 --- a/tests/qemu-iotests/020 +++ b/tests/qemu-iotests/020 @@ -31,6 +31,11 @@ _cleanup() _cleanup_test_img _rm_test_img "$TEST_IMG.base" _rm_test_img "$TEST_IMG.orig" + + _rm_test_img "$TEST_DIR/subdir/t.$IMGFMT.base" + _rm_test_img "$TEST_DIR/subdir/t.$IMGFMT.mid" + _rm_test_img "$TEST_DIR/subdir/t.$IMGFMT" + rmdir "$TEST_DIR/subdir" &> /dev/null } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -139,6 +144,45 @@ $QEMU_IO -c 'writev 0 64k' "$TEST_IMG" | _filter_qemu_io $QEMU_IMG commit "$TEST_IMG" _cleanup + +echo +echo 'Testing commit in sub-directory with relative filenames' +echo + +pushd "$TEST_DIR" > /dev/null + +mkdir subdir + +TEST_IMG="subdir/t.$IMGFMT.base" _make_test_img 1M +TEST_IMG="subdir/t.$IMGFMT.mid" _make_test_img -b "t.$IMGFMT.base" -F $IMGFMT +TEST_IMG="subdir/t.$IMGFMT" _make_test_img -b "t.$IMGFMT.mid" -F $IMGFMT + +# Should work +$QEMU_IMG commit -b "t.$IMGFMT.mid" "subdir/t.$IMGFMT" + +# Might theoretically work, but does not in practice (we have to +# decide between this and the above; and since we always represent +# backing file names as relative to the overlay, we go for the above) +$QEMU_IMG commit -b "subdir/t.$IMGFMT.mid" "subdir/t.$IMGFMT" 2>&1 | \ + _filter_imgfmt + +# This should work as well +$QEMU_IMG commit -b "$TEST_DIR/subdir/t.$IMGFMT.mid" "subdir/t.$IMGFMT" + +popd > /dev/null + +# Now let's try with just absolute filenames +# (This will not work with external data files, though, because when +# using relative paths for those, qemu will always resolve them +# relative to its CWD. Therefore, it cannot find those data files now +# that we left $TEST_DIR.) +if _get_data_file '' > /dev/null; then + echo 'Image committed.' # Skip test +else + $QEMU_IMG commit -b "$TEST_DIR/subdir/t.$IMGFMT.mid" \ + "$TEST_DIR/subdir/t.$IMGFMT" +fi + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/020.out b/tests/qemu-iotests/020.out index 5936bc1cae..a5db1962ad 100644 --- a/tests/qemu-iotests/020.out +++ b/tests/qemu-iotests/020.out @@ -1083,4 +1083,14 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=json:{'driv wrote 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) qemu-img: Block job failed: No space left on device + +Testing commit in sub-directory with relative filenames + +Formatting 'subdir/t.IMGFMT.base', fmt=IMGFMT size=1048576 +Formatting 'subdir/t.IMGFMT.mid', fmt=IMGFMT size=1048576 backing_file=t.IMGFMT.base backing_fmt=IMGFMT +Formatting 'subdir/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=t.IMGFMT.mid backing_fmt=IMGFMT +Image committed. +qemu-img: Did not find 'subdir/t.IMGFMT.mid' in the backing chain of 'subdir/t.IMGFMT' +Image committed. +Image committed. *** done diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index f58f50d023..caf286571a 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -734,6 +734,244 @@ class TestErrorHandling(iotests.QMPTestCase): self.assertTrue(iotests.compare_images(mid_img, backing_img, fmt2='raw'), 'target image does not match source after commit') +class TestCommitWithFilters(iotests.QMPTestCase): + img0 = os.path.join(iotests.test_dir, '0.img') + img1 = os.path.join(iotests.test_dir, '1.img') + img2 = os.path.join(iotests.test_dir, '2.img') + img3 = os.path.join(iotests.test_dir, '3.img') + + def do_test_io(self, read_or_write): + for index, pattern_file in enumerate(self.pattern_files): + result = qemu_io('-f', iotests.imgfmt, + '-c', + f'{read_or_write} -P {index + 1} {index}M 1M', + pattern_file) + self.assertFalse('Pattern verification failed' in result) + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, self.img0, '64M') + qemu_img('create', '-f', iotests.imgfmt, self.img1, '64M') + qemu_img('create', '-f', iotests.imgfmt, self.img2, '64M') + qemu_img('create', '-f', iotests.imgfmt, self.img3, '64M') + + # Distributions of the patterns in the files; this is checked + # by tearDown() and should be changed by the test cases as is + # necessary + self.pattern_files = [self.img0, self.img1, self.img2, self.img3] + + self.do_test_io('write') + + self.vm = iotests.VM().add_device('virtio-scsi,id=vio-scsi') + self.vm.launch() + + result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'top-filter', + 'driver': 'throttle', + 'throttle-group': 'tg', + 'file': { + 'node-name': 'cow-3', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img3 + }, + 'backing': { + 'node-name': 'cow-2', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img2 + }, + 'backing': { + 'node-name': 'cow-1', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img1 + }, + 'backing': { + 'node-name': 'bottom-filter', + 'driver': 'throttle', + 'throttle-group': 'tg', + 'file': { + 'node-name': 'cow-0', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img0 + } + } + } + } + } + } + }) + self.assert_qmp(result, 'return', {}) + + def tearDown(self): + self.vm.shutdown() + self.do_test_io('read') + + os.remove(self.img3) + os.remove(self.img2) + os.remove(self.img1) + os.remove(self.img0) + + # Filters make for funny filenames, so we cannot just use + # self.imgX to get them + def get_filename(self, node): + return self.vm.node_info(node)['image']['filename'] + + def test_filterless_commit(self): + result = self.vm.qmp('block-commit', + job_id='commit', + device='top-filter', + top_node='cow-2', + base_node='cow-1') + self.assert_qmp(result, 'return', {}) + self.wait_until_completed(drive='commit') + + self.assertIsNotNone(self.vm.node_info('cow-3')) + self.assertIsNone(self.vm.node_info('cow-2')) + self.assertIsNotNone(self.vm.node_info('cow-1')) + + # 2 has been comitted into 1 + self.pattern_files[2] = self.img1 + + def test_commit_through_filter(self): + result = self.vm.qmp('block-commit', + job_id='commit', + device='top-filter', + top_node='cow-1', + base_node='cow-0') + self.assert_qmp(result, 'return', {}) + self.wait_until_completed(drive='commit') + + self.assertIsNotNone(self.vm.node_info('cow-2')) + self.assertIsNone(self.vm.node_info('cow-1')) + self.assertIsNone(self.vm.node_info('bottom-filter')) + self.assertIsNotNone(self.vm.node_info('cow-0')) + + # 1 has been comitted into 0 + self.pattern_files[1] = self.img0 + + def test_filtered_active_commit_with_filter(self): + # Add a device, so the commit job finds a parent it can change + # to point to the base node (so we can test that top-filter is + # dropped from the graph) + result = self.vm.qmp('device_add', id='drv0', driver='scsi-hd', + bus='vio-scsi.0', drive='top-filter') + self.assert_qmp(result, 'return', {}) + + # Try to release our reference to top-filter; that should not + # work because drv0 uses it + result = self.vm.qmp('blockdev-del', node_name='top-filter') + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', 'Node top-filter is in use') + + result = self.vm.qmp('block-commit', + job_id='commit', + device='top-filter', + base_node='cow-2') + self.assert_qmp(result, 'return', {}) + self.complete_and_wait(drive='commit') + + # Try to release our reference to top-filter again + result = self.vm.qmp('blockdev-del', node_name='top-filter') + self.assert_qmp(result, 'return', {}) + + self.assertIsNone(self.vm.node_info('top-filter')) + self.assertIsNone(self.vm.node_info('cow-3')) + self.assertIsNotNone(self.vm.node_info('cow-2')) + + # Check that drv0 is now connected to cow-2 + blockdevs = self.vm.qmp('query-block')['return'] + drv0 = next(dev for dev in blockdevs if dev['qdev'] == 'drv0') + self.assertEqual(drv0['inserted']['node-name'], 'cow-2') + + # 3 has been comitted into 2 + self.pattern_files[3] = self.img2 + + def test_filtered_active_commit_without_filter(self): + result = self.vm.qmp('block-commit', + job_id='commit', + device='top-filter', + top_node='cow-3', + base_node='cow-2') + self.assert_qmp(result, 'return', {}) + self.complete_and_wait(drive='commit') + + self.assertIsNotNone(self.vm.node_info('top-filter')) + self.assertIsNone(self.vm.node_info('cow-3')) + self.assertIsNotNone(self.vm.node_info('cow-2')) + + # 3 has been comitted into 2 + self.pattern_files[3] = self.img2 + +class TestCommitWithOverriddenBacking(iotests.QMPTestCase): + img_base_a = os.path.join(iotests.test_dir, 'base_a.img') + img_base_b = os.path.join(iotests.test_dir, 'base_b.img') + img_top = os.path.join(iotests.test_dir, 'top.img') + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, self.img_base_a, '1M') + qemu_img('create', '-f', iotests.imgfmt, self.img_base_b, '1M') + qemu_img('create', '-f', iotests.imgfmt, '-b', self.img_base_a, \ + self.img_top) + + self.vm = iotests.VM() + self.vm.launch() + + # Use base_b instead of base_a as the backing of top + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'top', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img_top + }, + 'backing': { + 'node-name': 'base', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': self.img_base_b + } + } + }) + self.assert_qmp(result, 'return', {}) + + def tearDown(self): + self.vm.shutdown() + os.remove(self.img_top) + os.remove(self.img_base_a) + os.remove(self.img_base_b) + + def test_commit_to_a(self): + # Try committing to base_a (which should fail, as top's + # backing image is base_b instead) + result = self.vm.qmp('block-commit', + job_id='commit', + device='top', + base=self.img_base_a) + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_commit_to_b(self): + # Try committing to base_b (which should work, since that is + # actually top's backing image) + result = self.vm.qmp('block-commit', + job_id='commit', + device='top', + base=self.img_base_b) + self.assert_qmp(result, 'return', {}) + + self.vm.event_wait('BLOCK_JOB_READY') + self.vm.qmp('block-job-complete', device='commit') + self.vm.event_wait('BLOCK_JOB_COMPLETED') + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'qed'], supported_protocols=['file']) diff --git a/tests/qemu-iotests/040.out b/tests/qemu-iotests/040.out index 6a917130b6..1bb1dc5f0e 100644 --- a/tests/qemu-iotests/040.out +++ b/tests/qemu-iotests/040.out @@ -1,5 +1,5 @@ -........................................................... +................................................................. ---------------------------------------------------------------------- -Ran 59 tests +Ran 65 tests OK diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index f0a7bf6650..a7780853cd 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -21,8 +21,9 @@ import time import os import re +import json import iotests -from iotests import qemu_img, qemu_io +from iotests import qemu_img, qemu_img_pipe, qemu_io backing_img = os.path.join(iotests.test_dir, 'backing.img') target_backing_img = os.path.join(iotests.test_dir, 'target-backing.img') @@ -1288,6 +1289,149 @@ class TestReplaces(iotests.QMPTestCase): self.vm.assert_block_path('filter0', '/file', 'target') +# Tests for mirror with filters (and how the mirror filter behaves, as +# an example for an implicit filter) +class TestFilters(iotests.QMPTestCase): + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, backing_img, '1M') + qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, test_img) + qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, target_img) + + qemu_io('-c', 'write -P 1 0 512k', backing_img) + qemu_io('-c', 'write -P 2 512k 512k', test_img) + + self.vm = iotests.VM().add_device('virtio-scsi,id=vio-scsi') + self.vm.launch() + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'target', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': target_img + }, + 'backing': None + }) + self.assert_qmp(result, 'return', {}) + + self.filterless_chain = { + 'node-name': 'source', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': test_img + }, + 'backing': { + 'node-name': 'backing', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': backing_img + } + } + } + + def tearDown(self): + self.vm.shutdown() + + os.remove(test_img) + os.remove(target_img) + os.remove(backing_img) + + def test_cor(self): + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'filter', + 'driver': 'copy-on-read', + 'file': self.filterless_chain + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + device='filter', + target='target', + sync='top') + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait('mirror') + + self.vm.qmp('blockdev-del', node_name='target') + + target_map = qemu_img_pipe('map', '--output=json', target_img) + target_map = json.loads(target_map) + + assert target_map[0]['start'] == 0 + assert target_map[0]['length'] == 512 * 1024 + assert target_map[0]['depth'] == 1 + + assert target_map[1]['start'] == 512 * 1024 + assert target_map[1]['length'] == 512 * 1024 + assert target_map[1]['depth'] == 0 + + def test_implicit_mirror_filter(self): + result = self.vm.qmp('blockdev-add', **self.filterless_chain) + self.assert_qmp(result, 'return', {}) + + # We need this so we can query from above the mirror node + result = self.vm.qmp('device_add', + driver='scsi-hd', + id='virtio', + bus='vio-scsi.0', + drive='source') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + device='source', + target='target', + sync='top') + self.assert_qmp(result, 'return', {}) + + # The mirror filter is now an implicit node, so it should be + # invisible when querying the backing chain + blockdevs = self.vm.qmp('query-block')['return'] + device_info = next(dev for dev in blockdevs if dev['qdev'] == 'virtio') + + assert device_info['inserted']['node-name'] == 'source' + + image_info = device_info['inserted']['image'] + assert image_info['filename'] == test_img + assert image_info['backing-image']['filename'] == backing_img + + self.complete_and_wait('mirror') + + def test_explicit_mirror_filter(self): + # Same test as above, but this time we give the mirror filter + # a node-name so it will not be invisible + result = self.vm.qmp('blockdev-add', **self.filterless_chain) + self.assert_qmp(result, 'return', {}) + + # We need this so we can query from above the mirror node + result = self.vm.qmp('device_add', + driver='scsi-hd', + id='virtio', + bus='vio-scsi.0', + drive='source') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + device='source', + target='target', + sync='top', + filter_node_name='mirror-filter') + self.assert_qmp(result, 'return', {}) + + # With a node-name given to it, the mirror filter should now + # be visible + blockdevs = self.vm.qmp('query-block')['return'] + device_info = next(dev for dev in blockdevs if dev['qdev'] == 'virtio') + + assert device_info['inserted']['node-name'] == 'mirror-filter' + + self.complete_and_wait('mirror') + + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'qed'], supported_protocols=['file'], diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index 53abe11d73..46651953e8 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -........................................................................................................ +........................................................................................................... ---------------------------------------------------------------------- -Ran 104 tests +Ran 107 tests OK diff --git a/tests/qemu-iotests/049 b/tests/qemu-iotests/049 index 051a1c79e0..82b1e6c202 100755 --- a/tests/qemu-iotests/049 +++ b/tests/qemu-iotests/049 @@ -119,6 +119,10 @@ test_qemu_img create -f $IMGFMT -o compat=1.1,lazy_refcounts=on "$TEST_IMG" 64M test_qemu_img create -f $IMGFMT -o compat=0.10,lazy_refcounts=off "$TEST_IMG" 64M test_qemu_img create -f $IMGFMT -o compat=0.10,lazy_refcounts=on "$TEST_IMG" 64M +echo "== Expect error when backing file name is empty string ==" +echo +test_qemu_img create -f $IMGFMT -b '' $TEST_IMG 1M + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out index a7e220830d..b1d8fd9107 100644 --- a/tests/qemu-iotests/049.out +++ b/tests/qemu-iotests/049.out @@ -209,4 +209,9 @@ qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=on TEST_DIR/t.qcow2 64M Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 qemu-img: TEST_DIR/t.qcow2: Lazy refcounts only supported with compatibility level 1.1 and above (use version=v3 or greater) +== Expect error when backing file name is empty string == + +qemu-img create -f qcow2 -b TEST_DIR/t.qcow2 1M +qemu-img: TEST_DIR/t.qcow2: Expected backing file name, got empty string + *** done diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out index 8a79e1ee87..fcaa71aeee 100644 --- a/tests/qemu-iotests/153.out +++ b/tests/qemu-iotests/153.out @@ -464,7 +464,7 @@ No conflict: image: null-co:// file format: null-co virtual size: 1 GiB (1073741824 bytes) -disk size: unavailable +disk size: 0 B Conflict: qemu-img: --force-share/-U conflicts with image options diff --git a/tests/qemu-iotests/184 b/tests/qemu-iotests/184 index 33dd8d2a4f..eebb53faed 100755 --- a/tests/qemu-iotests/184 +++ b/tests/qemu-iotests/184 @@ -45,8 +45,7 @@ do_run_qemu() run_qemu() { do_run_qemu "$@" 2>&1 | _filter_testdir | _filter_qemu | _filter_qmp\ - | _filter_qemu_io | _filter_generated_node_ids \ - | _filter_actual_image_size + | _filter_qemu_io | _filter_generated_node_ids } test_throttle=$($QEMU_IMG --help|grep throttle) diff --git a/tests/qemu-iotests/184.out b/tests/qemu-iotests/184.out index 3deb3cfb94..87c73070e3 100644 --- a/tests/qemu-iotests/184.out +++ b/tests/qemu-iotests/184.out @@ -27,14 +27,21 @@ Testing: "iops_rd": 0, "detect_zeroes": "off", "image": { + "backing-image": { + "virtual-size": 1073741824, + "filename": "null-co://", + "format": "null-co", + "actual-size": 0 + }, "virtual-size": 1073741824, "filename": "json:{\"throttle-group\": \"group0\", \"driver\": \"throttle\", \"file\": {\"driver\": \"null-co\"}}", - "format": "throttle" + "format": "throttle", + "actual-size": 0 }, "iops_wr": 0, "ro": false, "node-name": "throttle0", - "backing_file_depth": 0, + "backing_file_depth": 1, "drv": "throttle", "iops": 0, "bps_wr": 0, @@ -56,7 +63,8 @@ Testing: "image": { "virtual-size": 1073741824, "filename": "null-co://", - "format": "null-co" + "format": "null-co", + "actual-size": 0 }, "iops_wr": 0, "ro": false, diff --git a/tests/qemu-iotests/204.out b/tests/qemu-iotests/204.out index 457f72df8f..4d903d20ea 100644 --- a/tests/qemu-iotests/204.out +++ b/tests/qemu-iotests/204.out @@ -59,5 +59,6 @@ Offset Length File 0x900000 0x2400000 TEST_DIR/t.IMGFMT 0x3c00000 0x1100000 TEST_DIR/t.IMGFMT 0x6a00000 0x400000 TEST_DIR/t.IMGFMT +0x6e00000 0x1200000 TEST_DIR/t.IMGFMT.base No errors were found on the image. *** done diff --git a/tests/qemu-iotests/228 b/tests/qemu-iotests/228 index 60db986d84..266fce6cda 100755 --- a/tests/qemu-iotests/228 +++ b/tests/qemu-iotests/228 @@ -36,7 +36,7 @@ def log_node_info(node): log('bs->filename: ' + node['image']['filename'], filters=[filter_testfiles, filter_imgfmt]) - log('bs->backing_file: ' + node['backing_file'], + log('bs->backing_file: ' + node['image']['full-backing-filename'], filters=[filter_testfiles, filter_imgfmt]) if 'backing-image' in node['image']: @@ -73,8 +73,8 @@ with iotests.FilePath('base.img') as base_img_path, \ }, filters=[filter_qmp_testfiles, filter_qmp_imgfmt]) - # Filename should be plain, and the backing filename should not - # contain the "file:" prefix + # Filename should be plain, and the backing node filename should + # not contain the "file:" prefix log_node_info(vm.node_info('node0')) vm.qmp_log('blockdev-del', node_name='node0') diff --git a/tests/qemu-iotests/228.out b/tests/qemu-iotests/228.out index 4217df24fe..8c82009abe 100644 --- a/tests/qemu-iotests/228.out +++ b/tests/qemu-iotests/228.out @@ -4,7 +4,7 @@ {"return": {}} bs->filename: TEST_DIR/PID-top.img -bs->backing_file: TEST_DIR/PID-base.img +bs->backing_file: file:TEST_DIR/PID-base.img bs->backing->bs->filename: TEST_DIR/PID-base.img {"execute": "blockdev-del", "arguments": {"node-name": "node0"}} @@ -41,7 +41,7 @@ bs->backing->bs->filename: TEST_DIR/PID-base.img {"return": {}} bs->filename: TEST_DIR/PID-top.img -bs->backing_file: TEST_DIR/PID-base.img +bs->backing_file: file:TEST_DIR/PID-base.img bs->backing->bs->filename: TEST_DIR/PID-base.img {"execute": "blockdev-del", "arguments": {"node-name": "node0"}} @@ -55,7 +55,7 @@ bs->backing->bs->filename: TEST_DIR/PID-base.img {"return": {}} bs->filename: json:{"backing": {"driver": "null-co"}, "driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top.img"}} -bs->backing_file: null-co:// +bs->backing_file: TEST_DIR/PID-base.img bs->backing->bs->filename: null-co:// {"execute": "blockdev-del", "arguments": {"node-name": "node0"}} diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244 index efe3c0428b..f2b2dddf1c 100755 --- a/tests/qemu-iotests/244 +++ b/tests/qemu-iotests/244 @@ -217,6 +217,55 @@ $QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG" $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" $QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" +echo +echo "=== Flushing should flush the data file ===" +echo + +# We are going to flush a qcow2 file with a blkdebug node inserted +# between the qcow2 node and its data file node. The blkdebug node +# will return an error for all flushes and so we if the data file is +# flushed, we will see qemu-io return an error. + +# We need to write something or the flush will not do anything; we +# also need -t writeback so the write is not done as a FUA write +# (which would then fail thanks to the implicit flush) +$QEMU_IO -c 'write 0 512' -c flush \ + -t writeback \ + "json:{ + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'blkdebug', + 'inject-error': [{ + 'event': 'none', + 'iotype': 'flush' + }], + 'image': { + 'driver': 'file', + 'filename': '$TEST_IMG.data' + } + } + }" \ + | _filter_qemu_io + +result=${PIPESTATUS[0]} +echo + +case $result in + 0) + echo "ERROR: qemu-io succeeded, so the data file was not flushed" + ;; + 1) + echo "Success: qemu-io failed, so the data file was flushed" + ;; + *) + echo "ERROR: qemu-io returned unknown exit code $result" + ;; +esac + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/244.out b/tests/qemu-iotests/244.out index dbab7359a9..7269b4295a 100644 --- a/tests/qemu-iotests/244.out +++ b/tests/qemu-iotests/244.out @@ -131,4 +131,11 @@ Offset Length Mapped to File Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data Images are identical. Images are identical. + +=== Flushing should flush the data file === + +wrote 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Success: qemu-io failed, so the data file was flushed *** done diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index ad91a6f5b4..e60c8326d3 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -725,7 +725,9 @@ class TestBlockdevReopen(iotests.QMPTestCase): # Detach hd2 from hd0. self.reopen(opts, {'backing': None}) - self.reopen(opts, {}, "backing is missing for 'hd0'") + + # Without a backing file, we can omit 'backing' again + self.reopen(opts) # Remove both hd0 and hd2 result = self.vm.qmp('blockdev-del', conv_keys = True, node_name = 'hd0') diff --git a/tests/qemu-iotests/273.out b/tests/qemu-iotests/273.out index 87d4758503..8247cbaea1 100644 --- a/tests/qemu-iotests/273.out +++ b/tests/qemu-iotests/273.out @@ -32,7 +32,7 @@ Testing: -blockdev file,node-name=base,filename=TEST_DIR/t.IMGFMT.base -blockdev "actual-size": SIZE, "dirty-flag": false }, - "backing-filename-format": "file", + "backing-filename-format": "IMGFMT", "virtual-size": 67108864, "filename": "TEST_DIR/t.IMGFMT.mid", "cluster-size": 65536, @@ -112,7 +112,7 @@ Testing: -blockdev file,node-name=base,filename=TEST_DIR/t.IMGFMT.base -blockdev "actual-size": SIZE, "dirty-flag": false }, - "backing-filename-format": "file", + "backing-filename-format": "IMGFMT", "virtual-size": 67108864, "filename": "TEST_DIR/t.IMGFMT.mid", "cluster-size": 65536, diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index 3ab859ac1a..e14a1f354d 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -44,7 +44,7 @@ then _init_error "failed to obtain source tree name from check symlink" fi source_iotests=$(cd "$source_iotests"; pwd) || _init_error "failed to enter source tree" - build_iotests=$PWD + build_iotests=$(readlink -f $(dirname "$0")) else # called from the source tree source_iotests=$PWD diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index e197c73ca5..64ccaf9061 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -972,8 +972,12 @@ class QMPTestCase(unittest.TestCase): def wait_ready(self, drive='drive0'): """Wait until a BLOCK_JOB_READY event, and return the event.""" - f = {'data': {'type': 'mirror', 'device': drive}} - return self.vm.event_wait(name='BLOCK_JOB_READY', match=f) + return self.vm.events_wait([ + ('BLOCK_JOB_READY', + {'data': {'type': 'mirror', 'device': drive}}), + ('BLOCK_JOB_READY', + {'data': {'type': 'commit', 'device': drive}}) + ]) def wait_ready_and_cancel(self, drive='drive0'): self.wait_ready(drive=drive) @@ -992,7 +996,7 @@ class QMPTestCase(unittest.TestCase): self.assert_qmp(result, 'return', {}) event = self.wait_until_completed(drive=drive, error=completion_error) - self.assert_qmp(event, 'data/type', 'mirror') + self.assertTrue(event['data']['type'] in ['mirror', 'commit']) def pause_wait(self, job_id='job0'): with Timeout(3, "Timeout waiting for job to pause"):