2018-04-12 17:29:59 +02:00
|
|
|
/*
|
|
|
|
* Background jobs (long-running operations)
|
|
|
|
*
|
|
|
|
* Copyright (c) 2011 IBM Corp.
|
|
|
|
* Copyright (c) 2012, 2018 Red Hat, Inc.
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "qemu/osdep.h"
|
|
|
|
#include "qapi/error.h"
|
|
|
|
#include "qemu/job.h"
|
|
|
|
#include "qemu/id.h"
|
2018-04-17 16:41:17 +02:00
|
|
|
#include "qemu/main-loop.h"
|
2018-08-17 14:58:49 +02:00
|
|
|
#include "block/aio-wait.h"
|
2020-02-04 12:20:10 +01:00
|
|
|
#include "trace/trace-root.h"
|
2018-04-30 19:09:46 +02:00
|
|
|
#include "qapi/qapi-events-job.h"
|
2018-04-12 17:29:59 +02:00
|
|
|
|
2018-04-12 17:54:37 +02:00
|
|
|
/*
 * Global list of jobs, linked through Job.job_list.  job_create() inserts
 * every new job here and job_unref() unlinks it when the last reference
 * goes away; job_next() and job_get() iterate over it.
 */
static QLIST_HEAD(, Job) jobs = QLIST_HEAD_INITIALIZER(jobs);
|
|
|
|
|
2018-04-13 17:19:31 +02:00
|
|
|
/* Job State Transition Table */
|
|
|
|
bool JobSTT[JOB_STATUS__MAX][JOB_STATUS__MAX] = {
|
|
|
|
/* U, C, R, P, Y, S, W, D, X, E, N */
|
|
|
|
/* U: */ [JOB_STATUS_UNDEFINED] = {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
|
|
/* C: */ [JOB_STATUS_CREATED] = {0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1},
|
|
|
|
/* R: */ [JOB_STATUS_RUNNING] = {0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0},
|
|
|
|
/* P: */ [JOB_STATUS_PAUSED] = {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
|
|
/* Y: */ [JOB_STATUS_READY] = {0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0},
|
|
|
|
/* S: */ [JOB_STATUS_STANDBY] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
|
|
|
|
/* W: */ [JOB_STATUS_WAITING] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
|
|
|
|
/* D: */ [JOB_STATUS_PENDING] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0},
|
|
|
|
/* X: */ [JOB_STATUS_ABORTING] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0},
|
|
|
|
/* E: */ [JOB_STATUS_CONCLUDED] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1},
|
|
|
|
/* N: */ [JOB_STATUS_NULL] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
|
|
|
};
|
|
|
|
|
|
|
|
bool JobVerbTable[JOB_VERB__MAX][JOB_STATUS__MAX] = {
|
|
|
|
/* U, C, R, P, Y, S, W, D, X, E, N */
|
|
|
|
[JOB_VERB_CANCEL] = {0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0},
|
|
|
|
[JOB_VERB_PAUSE] = {0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0},
|
|
|
|
[JOB_VERB_RESUME] = {0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0},
|
|
|
|
[JOB_VERB_SET_SPEED] = {0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0},
|
2021-04-09 14:04:20 +02:00
|
|
|
[JOB_VERB_COMPLETE] = {0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0},
|
2018-04-13 17:19:31 +02:00
|
|
|
[JOB_VERB_FINALIZE] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
|
|
|
|
[JOB_VERB_DISMISS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0},
|
|
|
|
};
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
/* Transactional group of jobs */
|
|
|
|
struct JobTxn {
|
|
|
|
|
|
|
|
/* Is this txn being cancelled? */
|
|
|
|
bool aborting;
|
|
|
|
|
|
|
|
/* List of jobs */
|
|
|
|
QLIST_HEAD(, Job) jobs;
|
|
|
|
|
|
|
|
/* Reference count */
|
|
|
|
int refcnt;
|
|
|
|
};
|
|
|
|
|
2018-04-13 17:31:02 +02:00
|
|
|
/* Right now, this mutex is only needed to synchronize accesses to job->busy
|
|
|
|
* and job->sleep_timer, such as concurrent calls to job_do_yield and
|
|
|
|
* job_enter. */
|
|
|
|
static QemuMutex job_mutex;
|
|
|
|
|
|
|
|
static void job_lock(void)
|
|
|
|
{
|
|
|
|
qemu_mutex_lock(&job_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void job_unlock(void)
|
|
|
|
{
|
|
|
|
qemu_mutex_unlock(&job_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __attribute__((__constructor__)) job_init(void)
|
|
|
|
{
|
|
|
|
qemu_mutex_init(&job_mutex);
|
|
|
|
}
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
JobTxn *job_txn_new(void)
|
|
|
|
{
|
|
|
|
JobTxn *txn = g_new0(JobTxn, 1);
|
|
|
|
QLIST_INIT(&txn->jobs);
|
|
|
|
txn->refcnt = 1;
|
|
|
|
return txn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Take an additional reference on @txn (which must not be NULL). */
static void job_txn_ref(JobTxn *txn)
{
    txn->refcnt += 1;
}
|
|
|
|
|
|
|
|
/*
 * Drop one reference on @txn and free it once the count reaches zero.
 * Passing NULL is allowed and does nothing.
 */
void job_txn_unref(JobTxn *txn)
{
    if (!txn) {
        return;
    }

    txn->refcnt--;
    if (txn->refcnt == 0) {
        g_free(txn);
    }
}
|
|
|
|
|
|
|
|
/*
 * Make @job a member of @txn.
 *
 * @job must not belong to a transaction yet.  The transaction gains one
 * reference on behalf of the job.  A NULL @txn turns the call into a no-op.
 */
void job_txn_add_job(JobTxn *txn, Job *job)
{
    if (txn) {
        assert(!job->txn);

        job->txn = txn;
        QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
        job_txn_ref(txn);
    }
}
|
|
|
|
|
|
|
|
/*
 * Detach @job from its transaction, if it has one: unlink it from the
 * transaction's job list, release the reference the job held on the
 * transaction and clear job->txn.
 */
static void job_txn_del_job(Job *job)
{
    JobTxn *owner = job->txn;

    if (!owner) {
        return;
    }

    QLIST_REMOVE(job, txn_list);
    job_txn_unref(owner);
    job->txn = NULL;
}
|
|
|
|
|
job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different
jobs within a transaction can have different contexts, thus we need to
lock each one individually before applying the callback function.
Similar to job_completed_txn_abort this also requires releasing the
caller's context before and reacquiring it after to avoid recursive
locks which might break AIO_WAIT_WHILE in the callback. This is safe, since
existing code would already have to take this into account, lest
job_completed_txn_abort might have broken.
This also brings to light a different issue: When a callback function in
job_txn_apply moves its job to a different AIO context, callers will
try to release the wrong lock (now that we re-acquire the lock
correctly, previously it would just continue with the old lock, leaving
the job unlocked for the rest of the return path). Fix this by not caching
the job's context.
This is only necessary for qmp_block_job_finalize, qmp_job_finalize and
job_exit, since everyone else calls through job_exit.
One test needed adapting, since it calls job_finalize directly, so it
manually needs to acquire the correct context.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2020-04-07 13:56:49 +02:00
|
|
|
/*
 * Call @fn on every job in @job's transaction.
 *
 * Iteration stops at the first job for which @fn returns non-zero; that
 * value is returned.  If @fn succeeds for all jobs the result is 0.
 *
 * The caller is expected to hold @job's AioContext.  It is released for the
 * duration of the loop and re-acquired before returning.
 */
static int job_txn_apply(Job *job, int fn(Job *))
{
    JobTxn *txn = job->txn;
    Job *member, *next_member;
    int ret = 0;

    /*
     * As in job_completed_txn_abort, each job's own context is taken around
     * the call to fn.  The caller's context has to be dropped first so that
     * it is never held twice, which would break AIO_WAIT_WHILE inside fn.
     * A reference on @job is held until its context has been re-acquired
     * below.
     */
    job_ref(job);
    aio_context_release(job->aio_context);

    QLIST_FOREACH_SAFE(member, &txn->jobs, txn_list, next_member) {
        /* Remember the context we lock so we unlock the very same one. */
        AioContext *member_ctx = member->aio_context;

        aio_context_acquire(member_ctx);
        ret = fn(member);
        aio_context_release(member_ctx);

        if (ret != 0) {
            break;
        }
    }

    /*
     * fn may have moved @job to a different AioContext, so the context must
     * be read from the job again here rather than taken from a cached copy.
     */
    aio_context_acquire(job->aio_context);
    job_unref(job);

    return ret;
}
|
|
|
|
|
2018-05-04 16:25:43 +02:00
|
|
|
/* A job is internal exactly when it has no ID string. */
bool job_is_internal(Job *job)
{
    return !job->id;
}
|
|
|
|
|
2018-04-25 14:56:09 +02:00
|
|
|
/*
 * Move @job to status @s1.
 *
 * The transition is traced and must be permitted by JobSTT, otherwise the
 * function asserts.  For jobs that are not internal, a job-status-change
 * event is sent whenever the status actually changed.
 */
static void job_state_transition(Job *job, JobStatus s1)
{
    JobStatus prev = job->status;
    bool permitted;

    /* Guard the table lookup below against out-of-range values. */
    assert(s1 >= 0 && s1 < JOB_STATUS__MAX);
    permitted = JobSTT[prev][s1];

    trace_job_state_transition(job, job->ret,
                               permitted ? "allowed" : "disallowed",
                               JobStatus_str(prev), JobStatus_str(s1));
    assert(permitted);

    job->status = s1;

    if (s1 != prev && !job_is_internal(job)) {
        qapi_event_send_job_status_change(job->id, job->status);
    }
}
|
|
|
|
|
|
|
|
/*
 * Check whether @verb may be applied to @job in its current status.
 *
 * The decision comes from JobVerbTable and is traced either way.
 *
 * Returns 0 if the verb is allowed.  Otherwise sets @errp and returns
 * -EPERM.  The job itself is not modified.
 */
int job_apply_verb(Job *job, JobVerb verb, Error **errp)
{
    JobStatus cur = job->status;
    bool allowed;

    /* Guard the table lookup below against out-of-range values. */
    assert(verb >= 0 && verb < JOB_VERB__MAX);
    allowed = JobVerbTable[verb][cur];

    trace_job_apply_verb(job, JobStatus_str(cur), JobVerb_str(verb),
                         allowed ? "allowed" : "prohibited");

    if (!allowed) {
        error_setg(errp,
                   "Job '%s' in state '%s' cannot accept command verb '%s'",
                   job->id, JobStatus_str(cur), JobVerb_str(verb));
        return -EPERM;
    }

    return 0;
}
|
|
|
|
|
2018-04-12 17:57:08 +02:00
|
|
|
JobType job_type(const Job *job)
|
|
|
|
{
|
|
|
|
return job->driver->job_type;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *job_type_str(const Job *job)
|
|
|
|
{
|
|
|
|
return JobType_str(job_type(job));
|
|
|
|
}
|
|
|
|
|
2018-04-17 12:56:07 +02:00
|
|
|
/*
 * Report whether @job has been force-cancelled, i.e. is on its way to
 * immediate termination.
 *
 * A cancellation request that is not forced does not count here; use
 * job_cancel_requested() to detect any kind of cancel request.
 */
bool job_is_cancelled(Job *job)
{
    /* Invariant: force_cancel can only be set together with cancelled. */
    assert(!job->force_cancel || job->cancelled);

    return job->force_cancel;
}
|
|
|
|
|
|
|
|
/*
 * Report whether cancellation of @job has been requested at all, whether
 * forced or not.  This is the general counterpart of job_is_cancelled().
 */
bool job_cancel_requested(Job *job)
{
    return job->cancelled;
}
|
|
|
|
|
2018-04-25 15:09:58 +02:00
|
|
|
/*
 * Return true if @job is in the READY or STANDBY status, false for every
 * other known status.  An unknown status value triggers an assertion.
 */
bool job_is_ready(Job *job)
{
    bool ready = false;

    switch (job->status) {
    case JOB_STATUS_READY:
    case JOB_STATUS_STANDBY:
        ready = true;
        break;

    case JOB_STATUS_UNDEFINED:
    case JOB_STATUS_CREATED:
    case JOB_STATUS_RUNNING:
    case JOB_STATUS_PAUSED:
    case JOB_STATUS_WAITING:
    case JOB_STATUS_PENDING:
    case JOB_STATUS_ABORTING:
    case JOB_STATUS_CONCLUDED:
    case JOB_STATUS_NULL:
        break;

    default:
        g_assert_not_reached();
    }

    return ready;
}
|
|
|
|
|
2018-04-19 13:04:01 +02:00
|
|
|
/*
 * Return true if @job is in one of the statuses WAITING, PENDING, ABORTING,
 * CONCLUDED or NULL, and false for every other known status.  An unknown
 * status value triggers an assertion.
 */
bool job_is_completed(Job *job)
{
    bool completed = false;

    switch (job->status) {
    case JOB_STATUS_WAITING:
    case JOB_STATUS_PENDING:
    case JOB_STATUS_ABORTING:
    case JOB_STATUS_CONCLUDED:
    case JOB_STATUS_NULL:
        completed = true;
        break;

    case JOB_STATUS_UNDEFINED:
    case JOB_STATUS_CREATED:
    case JOB_STATUS_RUNNING:
    case JOB_STATUS_PAUSED:
    case JOB_STATUS_READY:
    case JOB_STATUS_STANDBY:
        break;

    default:
        g_assert_not_reached();
    }

    return completed;
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/* A job counts as started once it has a coroutine (job->co is set). */
static bool job_started(Job *job)
{
    return job->co != NULL;
}
|
|
|
|
|
2018-04-24 16:55:04 +02:00
|
|
|
/* True while at least one pause request is outstanding on @job. */
static bool job_should_pause(Job *job)
{
    bool pause_wanted = job->pause_count > 0;

    return pause_wanted;
}
|
|
|
|
|
2018-04-12 17:54:37 +02:00
|
|
|
/*
 * Iterate over the global job list.
 *
 * With @job == NULL the first job is returned; otherwise the job that
 * follows @job.  The result is NULL when there is no such job.
 */
Job *job_next(Job *job)
{
    return job ? QLIST_NEXT(job, job_list) : QLIST_FIRST(&jobs);
}
|
|
|
|
|
|
|
|
Job *job_get(const char *id)
|
|
|
|
{
|
|
|
|
Job *job;
|
|
|
|
|
|
|
|
QLIST_FOREACH(job, &jobs, job_list) {
|
|
|
|
if (job->id && !strcmp(id, job->id)) {
|
|
|
|
return job;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-04-18 16:32:20 +02:00
|
|
|
static void job_sleep_timer_cb(void *opaque)
|
|
|
|
{
|
|
|
|
Job *job = opaque;
|
|
|
|
|
|
|
|
job_enter(job);
|
|
|
|
}
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
void *job_create(const char *job_id, const JobDriver *driver, JobTxn *txn,
|
|
|
|
AioContext *ctx, int flags, BlockCompletionFunc *cb,
|
|
|
|
void *opaque, Error **errp)
|
2018-04-12 17:29:59 +02:00
|
|
|
{
|
|
|
|
Job *job;
|
|
|
|
|
|
|
|
if (job_id) {
|
2018-04-19 17:54:56 +02:00
|
|
|
if (flags & JOB_INTERNAL) {
|
|
|
|
error_setg(errp, "Cannot specify job ID for internal job");
|
|
|
|
return NULL;
|
|
|
|
}
|
2018-04-12 17:29:59 +02:00
|
|
|
if (!id_wellformed(job_id)) {
|
|
|
|
error_setg(errp, "Invalid job ID '%s'", job_id);
|
|
|
|
return NULL;
|
|
|
|
}
|
2018-04-12 17:54:37 +02:00
|
|
|
if (job_get(job_id)) {
|
|
|
|
error_setg(errp, "Job ID '%s' already in use", job_id);
|
|
|
|
return NULL;
|
|
|
|
}
|
2018-04-19 17:54:56 +02:00
|
|
|
} else if (!(flags & JOB_INTERNAL)) {
|
|
|
|
error_setg(errp, "An explicit job ID is required");
|
|
|
|
return NULL;
|
2018-04-12 17:29:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
job = g_malloc0(driver->instance_size);
|
|
|
|
job->driver = driver;
|
|
|
|
job->id = g_strdup(job_id);
|
2018-04-13 18:50:05 +02:00
|
|
|
job->refcnt = 1;
|
2018-04-17 13:49:33 +02:00
|
|
|
job->aio_context = ctx;
|
2018-04-13 17:31:02 +02:00
|
|
|
job->busy = false;
|
|
|
|
job->paused = true;
|
|
|
|
job->pause_count = 1;
|
2018-04-19 17:54:56 +02:00
|
|
|
job->auto_finalize = !(flags & JOB_MANUAL_FINALIZE);
|
|
|
|
job->auto_dismiss = !(flags & JOB_MANUAL_DISMISS);
|
2018-04-19 17:30:16 +02:00
|
|
|
job->cb = cb;
|
|
|
|
job->opaque = opaque;
|
2018-04-12 17:29:59 +02:00
|
|
|
|
2021-06-14 10:11:29 +02:00
|
|
|
progress_init(&job->progress);
|
|
|
|
|
2018-04-23 18:04:57 +02:00
|
|
|
notifier_list_init(&job->on_finalize_cancelled);
|
|
|
|
notifier_list_init(&job->on_finalize_completed);
|
|
|
|
notifier_list_init(&job->on_pending);
|
2018-04-25 14:56:09 +02:00
|
|
|
notifier_list_init(&job->on_ready);
|
2021-11-03 12:21:55 -04:00
|
|
|
notifier_list_init(&job->on_idle);
|
2018-04-23 18:04:57 +02:00
|
|
|
|
2018-04-13 17:19:31 +02:00
|
|
|
job_state_transition(job, JOB_STATUS_CREATED);
|
2018-04-18 16:32:20 +02:00
|
|
|
aio_timer_init(qemu_get_aio_context(), &job->sleep_timer,
|
|
|
|
QEMU_CLOCK_REALTIME, SCALE_NS,
|
|
|
|
job_sleep_timer_cb, job);
|
2018-04-13 17:19:31 +02:00
|
|
|
|
2018-04-12 17:54:37 +02:00
|
|
|
QLIST_INSERT_HEAD(&jobs, job, job_list);
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
/* Single jobs are modeled as single-job transactions for sake of
|
|
|
|
* consolidating the job management logic */
|
|
|
|
if (!txn) {
|
|
|
|
txn = job_txn_new();
|
|
|
|
job_txn_add_job(txn, job);
|
|
|
|
job_txn_unref(txn);
|
|
|
|
} else {
|
|
|
|
job_txn_add_job(txn, job);
|
|
|
|
}
|
|
|
|
|
2018-04-12 17:29:59 +02:00
|
|
|
return job;
|
|
|
|
}
|
2018-04-12 19:06:53 +02:00
|
|
|
|
2018-04-13 18:50:05 +02:00
|
|
|
/* Take an additional reference on @job; release it with job_unref(). */
void job_ref(Job *job)
{
    job->refcnt++;
}
|
|
|
|
|
|
|
|
/*
 * Drop one reference on @job.
 *
 * When the last reference goes away the job must be in status NULL, must
 * have no pending sleep timer and must not belong to a transaction.  The
 * driver's free callback (if any) is invoked, then the job is unlinked from
 * the global job list and all of its memory is released.
 */
void job_unref(Job *job)
{
    job->refcnt--;
    if (job->refcnt != 0) {
        return;
    }

    assert(job->status == JOB_STATUS_NULL);
    assert(!timer_pending(&job->sleep_timer));
    assert(!job->txn);

    /* Let the driver release its own resources first. */
    if (job->driver->free) {
        job->driver->free(job);
    }

    QLIST_REMOVE(job, job_list);

    progress_destroy(&job->progress);
    error_free(job->err);
    g_free(job->id);
    g_free(job);
}
|
2018-04-17 16:41:17 +02:00
|
|
|
|
2018-05-04 12:17:20 +02:00
|
|
|
/* Forward @done to progress_work_done() for @job's progress meter. */
void job_progress_update(Job *job, uint64_t done)
{
    progress_work_done(&job->progress, done);
}
|
|
|
|
|
|
|
|
/* Forward @remaining to progress_set_remaining() for @job's progress meter. */
void job_progress_set_remaining(Job *job, uint64_t remaining)
{
    progress_set_remaining(&job->progress, remaining);
}
|
|
|
|
|
2018-06-13 20:18:20 +02:00
|
|
|
/*
 * Forward @delta to progress_increase_remaining() for @job's progress
 * meter.
 */
void job_progress_increase_remaining(Job *job, uint64_t delta)
{
    progress_increase_remaining(&job->progress, delta);
}
|
|
|
|
|
2018-04-23 18:04:57 +02:00
|
|
|
/* Fire the notifiers registered on job->on_finalize_cancelled. */
void job_event_cancelled(Job *job)
{
    notifier_list_notify(&job->on_finalize_cancelled, job);
}
|
|
|
|
|
|
|
|
/* Fire the notifiers registered on job->on_finalize_completed. */
void job_event_completed(Job *job)
{
    notifier_list_notify(&job->on_finalize_completed, job);
}
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
/* Fire the notifiers registered on job->on_pending. */
static void job_event_pending(Job *job)
{
    notifier_list_notify(&job->on_pending, job);
}
|
|
|
|
|
2018-04-25 14:56:09 +02:00
|
|
|
/* Fire the notifiers registered on job->on_ready. */
static void job_event_ready(Job *job)
{
    notifier_list_notify(&job->on_ready, job);
}
|
|
|
|
|
2018-08-17 14:53:05 +02:00
|
|
|
/* Fire the notifiers registered on job->on_idle. */
static void job_event_idle(Job *job)
{
    notifier_list_notify(&job->on_idle, job);
}
|
|
|
|
|
2018-04-13 17:31:02 +02:00
|
|
|
/*
 * Enter @job's coroutine if that is currently possible.
 *
 * Nothing happens when the job has not been started, has been deferred to
 * the main loop, or is already busy.  If @fn is non-NULL it is called with
 * job_mutex held, and the job is only entered when it returns true.
 *
 * On success any pending sleep timer is cancelled and the job is marked
 * busy before its coroutine is entered in the job's AioContext.
 */
void job_enter_cond(Job *job, bool(*fn)(Job *job))
{
    if (!job_started(job) || job->deferred_to_main_loop) {
        return;
    }

    job_lock();

    /* Either someone else is already running the job or fn vetoes it. */
    if (job->busy || (fn && !fn(job))) {
        job_unlock();
        return;
    }

    assert(!job->deferred_to_main_loop);
    timer_del(&job->sleep_timer);
    job->busy = true;

    job_unlock();

    /* The coroutine is entered without holding job_mutex. */
    aio_co_enter(job->aio_context, job->co);
}
|
|
|
|
|
2018-04-18 16:32:20 +02:00
|
|
|
/* Unconditional variant of job_enter_cond(): no extra predicate is used. */
void job_enter(Job *job)
{
    job_enter_cond(job, NULL);
}
|
|
|
|
|
2018-04-13 17:31:02 +02:00
|
|
|
/* Yield, and schedule a timer to reenter the coroutine after @ns nanoseconds.
|
2018-04-24 16:13:52 +02:00
|
|
|
* Reentering the job coroutine with job_enter() before the timer has expired
|
|
|
|
* is allowed and cancels the timer.
|
2018-04-13 17:31:02 +02:00
|
|
|
*
|
2018-04-24 16:13:52 +02:00
|
|
|
* If @ns is (uint64_t) -1, no timer is scheduled and job_enter() must be
|
2018-04-13 17:31:02 +02:00
|
|
|
* called explicitly. */
|
2018-04-24 16:55:04 +02:00
|
|
|
static void coroutine_fn job_do_yield(Job *job, uint64_t ns)
|
2018-04-13 17:31:02 +02:00
|
|
|
{
|
|
|
|
job_lock();
|
|
|
|
if (ns != -1) {
|
|
|
|
timer_mod(&job->sleep_timer, ns);
|
|
|
|
}
|
|
|
|
job->busy = false;
|
2018-08-17 14:53:05 +02:00
|
|
|
job_event_idle(job);
|
2018-04-13 17:31:02 +02:00
|
|
|
job_unlock();
|
|
|
|
qemu_coroutine_yield();
|
|
|
|
|
|
|
|
/* Set by job_enter_cond() before re-entering the coroutine. */
|
|
|
|
assert(job->busy);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pause the job here if a pause has been requested.
 *
 * Returns immediately when no pause is requested or the job is
 * cancelled.  Otherwise the driver's optional .pause() hook runs, the
 * pause condition is re-checked, and the job moves to STANDBY (when it
 * was READY) or PAUSED (any other status) and yields without a timer.
 * After being re-entered the previous status is restored.  The driver's
 * optional .resume() hook runs last, whether or not the job actually
 * yielded.
 */
void coroutine_fn job_pause_point(Job *job)
{
    assert(job && job_started(job));

    if (!job_should_pause(job) || job_is_cancelled(job)) {
        return;
    }

    if (job->driver->pause) {
        job->driver->pause(job);
    }

    /* Re-check: the conditions are evaluated again after .pause(). */
    if (job_should_pause(job) && !job_is_cancelled(job)) {
        JobStatus prev_status = job->status;
        JobStatus paused_status;

        if (prev_status == JOB_STATUS_READY) {
            paused_status = JOB_STATUS_STANDBY;
        } else {
            paused_status = JOB_STATUS_PAUSED;
        }

        job_state_transition(job, paused_status);
        job->paused = true;
        job_do_yield(job, -1);
        job->paused = false;
        job_state_transition(job, prev_status);
    }

    if (job->driver->resume) {
        job->driver->resume(job);
    }
}
|
|
|
|
|
2018-04-24 16:55:04 +02:00
|
|
|
/*
 * Yield the job coroutine until it is explicitly re-entered.
 *
 * Must be called while the job is busy.  A cancelled job returns right
 * away.  If no pause is pending, the job yields without a timer; in all
 * non-cancelled cases job_pause_point() is visited afterwards.
 */
void job_yield(Job *job)
{
    assert(job->busy);

    /* Cancellation is tested *before* busy can be cleared by the yield. */
    if (!job_is_cancelled(job)) {
        if (!job_should_pause(job)) {
            job_do_yield(job, -1);
        }

        job_pause_point(job);
    }
}
|
|
|
|
|
2018-04-18 16:32:20 +02:00
|
|
|
/*
 * Put the job to sleep for @ns nanoseconds of QEMU_CLOCK_REALTIME.
 *
 * Must be called while the job is busy.  A cancelled job returns right
 * away.  If no pause is pending, the job yields with the sleep timer set
 * to "now + @ns"; in all non-cancelled cases job_pause_point() is
 * visited afterwards.
 */
void coroutine_fn job_sleep_ns(Job *job, int64_t ns)
{
    assert(job->busy);

    /* Cancellation is tested *before* busy can be cleared by the yield. */
    if (!job_is_cancelled(job)) {
        if (!job_should_pause(job)) {
            job_do_yield(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns);
        }

        job_pause_point(job);
    }
}
|
|
|
|
|
2018-04-18 17:10:26 +02:00
|
|
|
/* Assumes the block_job_mutex is held */
|
|
|
|
static bool job_timer_not_pending(Job *job)
|
|
|
|
{
|
|
|
|
return !timer_pending(&job->sleep_timer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Register one more pause request on @job.
 *
 * The pause counter is always incremented.  Unless the job is already
 * paused, it is then kicked with job_enter().
 */
void job_pause(Job *job)
{
    job->pause_count++;

    if (job->paused) {
        return;
    }
    job_enter(job);
}
|
|
|
|
|
|
|
|
/*
 * Drop one pause request from @job.
 *
 * The pause counter must be positive on entry.  Once it reaches zero the
 * job is kicked, but only when no sleep timer is pending.
 */
void job_resume(Job *job)
{
    assert(job->pause_count > 0);

    if (--job->pause_count == 0) {
        /* kick only if no timer is pending */
        job_enter_cond(job, job_timer_not_pending);
    }
}
|
|
|
|
|
|
|
|
/*
 * Handle a user-initiated pause request.
 *
 * Returns without pausing when job_apply_verb() rejects JOB_VERB_PAUSE
 * (it is given @errp).  If the job is already user-paused, @errp is set
 * to "Job is already paused".  Otherwise the job is flagged as
 * user-paused and job_pause() is called.
 */
void job_user_pause(Job *job, Error **errp)
{
    if (job_apply_verb(job, JOB_VERB_PAUSE, errp) != 0) {
        return;
    }

    if (!job->user_paused) {
        job->user_paused = true;
        job_pause(job);
        return;
    }

    error_setg(errp, "Job is already paused");
}
|
|
|
|
|
|
|
|
/* Report the job's user_paused flag. */
bool job_user_paused(Job *job)
{
    return job->user_paused;
}
|
|
|
|
|
|
|
|
/*
 * Handle a user-initiated resume request.
 *
 * Sets @errp to "Can't resume a job that was not paused" unless the job
 * is user-paused with a positive pause count.  Returns without resuming
 * when job_apply_verb() rejects JOB_VERB_RESUME.  Otherwise the driver's
 * optional .user_resume() hook runs, the user_paused flag is cleared and
 * job_resume() drops the pause request.
 */
void job_user_resume(Job *job, Error **errp)
{
    assert(job);

    if (!(job->user_paused && job->pause_count > 0)) {
        error_setg(errp, "Can't resume a job that was not paused");
        return;
    }

    if (job_apply_verb(job, JOB_VERB_RESUME, errp) != 0) {
        return;
    }

    if (job->driver->user_resume) {
        job->driver->user_resume(job);
    }

    job->user_paused = false;
    job_resume(job);
}
|
|
|
|
|
2018-04-24 17:10:12 +02:00
|
|
|
/*
 * Dismiss @job: clear its busy/paused flags, mark it as deferred to the
 * main loop, detach it from its transaction, move it to the NULL state
 * and finally drop one reference.
 *
 * Callers must not touch @job afterwards unless they hold another
 * reference (NOTE(review): assumes job_unref() may free the job; confirm
 * against its definition).
 */
static void job_do_dismiss(Job *job)
{
    assert(job);

    job->busy = false;
    job->paused = false;
    job->deferred_to_main_loop = true;

    job_txn_del_job(job);

    job_state_transition(job, JOB_STATUS_NULL);
    job_unref(job);
}
|
|
|
|
|
2018-04-24 17:10:12 +02:00
|
|
|
/*
 * Dismiss the job that *@jobptr points to and reset *@jobptr to NULL.
 *
 * The job must have an ID; like _complete, this is QMP-interface only.
 * If job_apply_verb() rejects JOB_VERB_DISMISS (it is given @errp), the
 * job and *@jobptr are left untouched.
 */
void job_dismiss(Job **jobptr, Error **errp)
{
    Job *target = *jobptr;

    assert(target->id);

    if (job_apply_verb(target, JOB_VERB_DISMISS, errp) == 0) {
        job_do_dismiss(target);
        *jobptr = NULL;
    }
}
|
|
|
|
|
2018-04-19 17:30:16 +02:00
|
|
|
/*
 * Dismiss a job that is still in the CREATED state; any other status
 * trips the assertion.
 */
void job_early_fail(Job *job)
{
    assert(job->status == JOB_STATUS_CREATED);
    job_do_dismiss(job);
}
|
|
|
|
|
|
|
|
/*
 * Move @job to the CONCLUDED state.  The job is dismissed right away
 * when it has auto_dismiss set or was never started.
 */
static void job_conclude(Job *job)
{
    job_state_transition(job, JOB_STATUS_CONCLUDED);

    /* Only a started job without auto-dismiss stays around. */
    if (!job->auto_dismiss && job_started(job)) {
        return;
    }
    job_do_dismiss(job);
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/*
 * Bring job->ret and the job state in line with cancellation/failure.
 *
 * A job that has no error code yet but is cancelled gets -ECANCELED.
 * For any non-zero return code, job->err is filled in from strerror()
 * unless an error is already set, and the job moves to ABORTING.
 */
static void job_update_rc(Job *job)
{
    if (job->ret == 0 && job_is_cancelled(job)) {
        job->ret = -ECANCELED;
    }

    if (job->ret == 0) {
        return;
    }

    if (job->err == NULL) {
        error_setg(&job->err, "%s", strerror(-job->ret));
    }
    job_state_transition(job, JOB_STATUS_ABORTING);
}
|
|
|
|
|
|
|
|
/*
 * Run the driver's optional .commit() hook.  Only valid for a job whose
 * return code is zero.
 */
static void job_commit(Job *job)
{
    assert(!job->ret);

    if (!job->driver->commit) {
        return;
    }
    job->driver->commit(job);
}
|
|
|
|
|
|
|
|
/*
 * Run the driver's optional .abort() hook.  Only valid for a job whose
 * return code is non-zero.
 */
static void job_abort(Job *job)
{
    assert(job->ret);

    if (!job->driver->abort) {
        return;
    }
    job->driver->abort(job);
}
|
|
|
|
|
|
|
|
/* Run the driver's optional .clean() hook. */
static void job_clean(Job *job)
{
    if (!job->driver->clean) {
        return;
    }
    job->driver->clean(job);
}
|
|
|
|
|
2018-04-23 16:06:26 +02:00
|
|
|
/*
 * Finalize one completed job.
 *
 * Steps, in order: refresh the return code, run the driver's abort or
 * commit hook depending on it, run the clean hook, call the completion
 * callback (if any), emit the cancelled/completed event for started
 * jobs, remove the job from its transaction and conclude it.
 *
 * Always returns 0.
 */
static int job_finalize_single(Job *job)
{
    assert(job_is_completed(job));

    /* Ensure abort is called for late-transactional failures */
    job_update_rc(job);

    if (job->ret) {
        job_abort(job);
    } else {
        job_commit(job);
    }
    job_clean(job);

    if (job->cb) {
        job->cb(job->opaque, job->ret);
    }

    /* Emit events only if we actually started */
    if (job_started(job)) {
        if (!job_is_cancelled(job)) {
            job_event_completed(job);
        } else {
            job_event_cancelled(job);
        }
    }

    job_txn_del_job(job);
    job_conclude(job);
    return 0;
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/*
 * Request cancellation of @job without waiting for it.
 *
 * The driver's .cancel() hook, when present, is always invoked and its
 * return value replaces @force; a driver without .cancel() is treated as
 * force-cancelled.  A user pause is undone (user_resume hook, flag
 * cleared, pause count dropped) without entering the job.  Finally the
 * job is marked cancelled, unless this is a soft cancel of a job that
 * has already been deferred to the main loop.
 */
static void job_cancel_async(Job *job, bool force)
{
    /* No .cancel() means the job will behave as if force-cancelled */
    force = job->driver->cancel ? job->driver->cancel(job, force) : true;

    if (job->user_paused) {
        /* Do not call job_enter here, the caller will handle it. */
        if (job->driver->user_resume) {
            job->driver->user_resume(job);
        }
        job->user_paused = false;
        assert(job->pause_count > 0);
        job->pause_count--;
    }

    /*
     * Ignore soft cancel requests after the job is already done
     * (.cancel() was still invoked above so that it can override
     * @force; for a driver that supports soft cancelling and a job that
     * is done, that call should be a no-op).
     */
    if (!force && job->deferred_to_main_loop) {
        return;
    }

    job->cancelled = true;
    /* To prevent 'force == false' overriding a previous 'force == true' */
    job->force_cancel |= force;
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/*
 * Abort the whole transaction that @job belongs to.
 *
 * If the transaction is already aborting, another job is in charge and
 * this returns immediately.  Otherwise every other job in the
 * transaction is force-cancelled, and then all jobs on the transaction
 * list are finished synchronously (if not yet completed) and finalized,
 * one at a time, each under its own AioContext lock.
 *
 * @job's AioContext lock is released on the way in and re-acquired
 * before returning; a temporary job reference and transaction reference
 * are held for the duration of the call.
 */
static void job_completed_txn_abort(Job *job)
{
    AioContext *ctx;
    JobTxn *txn = job->txn;
    Job *member;

    if (txn->aborting) {
        /*
         * We are cancelled by another job, which will handle everything.
         */
        return;
    }
    txn->aborting = true;
    job_txn_ref(txn);

    /*
     * We can only hold the single job's AioContext lock while calling
     * job_finalize_single() because the finalization callbacks can
     * involve calls of AIO_WAIT_WHILE(), which could deadlock otherwise.
     * Note that the job's AioContext may change when it is finalized.
     */
    job_ref(job);
    aio_context_release(job->aio_context);

    /*
     * Other jobs are effectively cancelled by us, set the status for
     * them; this job, however, may or may not be cancelled, depending
     * on the caller, so leave it.
     */
    QLIST_FOREACH(member, &txn->jobs, txn_list) {
        if (member == job) {
            continue;
        }

        ctx = member->aio_context;
        aio_context_acquire(ctx);
        /*
         * This is a transaction: If one job failed, no result will
         * matter.  Therefore, pass force=true to terminate all other
         * jobs as quickly as possible.
         */
        job_cancel_async(member, true);
        aio_context_release(ctx);
    }

    while (!QLIST_EMPTY(&txn->jobs)) {
        member = QLIST_FIRST(&txn->jobs);
        /*
         * The job's AioContext may change, so store it in @ctx so we
         * release the same context that we have acquired before.
         */
        ctx = member->aio_context;
        aio_context_acquire(ctx);
        if (!job_is_completed(member)) {
            assert(job_cancel_requested(member));
            job_finish_sync(member, NULL, NULL);
        }
        job_finalize_single(member);
        aio_context_release(ctx);
    }

    /*
     * Use job_ref()/job_unref() so we can read the AioContext here
     * even if the job went away during job_finalize_single().
     */
    aio_context_acquire(job->aio_context);
    job_unref(job);

    job_txn_unref(txn);
}
|
|
|
|
|
|
|
|
/*
 * Run the driver's optional .prepare() hook for a job that has not
 * failed so far, store its result in job->ret and refresh the job's
 * return-code state.
 *
 * Returns job->ret (unchanged when the hook was not run).
 */
static int job_prepare(Job *job)
{
    if (job->ret != 0 || !job->driver->prepare) {
        return job->ret;
    }

    job->ret = job->driver->prepare(job);
    job_update_rc(job);
    return job->ret;
}
|
|
|
|
|
|
|
|
/*
 * Callback for job_txn_apply(): non-zero when @job does not finalize
 * automatically (auto_finalize is false).
 */
static int job_needs_finalize(Job *job)
{
    return !job->auto_finalize;
}
|
|
|
|
|
|
|
|
/*
 * Finalize the transaction @job belongs to.
 *
 * job_prepare() is applied through job_txn_apply() first.  A non-zero
 * result aborts the transaction; otherwise job_finalize_single() is
 * applied through job_txn_apply().
 */
static void job_do_finalize(Job *job)
{
    assert(job && job->txn);

    /* prepare the transaction to complete */
    if (job_txn_apply(job, job_prepare)) {
        job_completed_txn_abort(job);
        return;
    }

    job_txn_apply(job, job_finalize_single);
}
|
|
|
|
|
|
|
|
/*
 * Finalize @job on request.  The job must have an ID.  Nothing is
 * finalized when job_apply_verb() rejects JOB_VERB_FINALIZE (it is
 * given @errp).
 */
void job_finalize(Job *job, Error **errp)
{
    assert(job && job->id);

    if (job_apply_verb(job, JOB_VERB_FINALIZE, errp) == 0) {
        job_do_finalize(job);
    }
}
|
|
|
|
|
|
|
|
/*
 * Callback for job_txn_apply(): move @job to the PENDING state and emit
 * the pending event for jobs that do not finalize automatically.
 *
 * Always returns 0.
 */
static int job_transition_to_pending(Job *job)
{
    job_state_transition(job, JOB_STATUS_PENDING);

    if (job->auto_finalize) {
        return 0;
    }
    job_event_pending(job);
    return 0;
}
|
|
|
|
|
2018-04-25 14:56:09 +02:00
|
|
|
/* Move @job to the READY state, then emit the ready event. */
void job_transition_to_ready(Job *job)
{
    job_state_transition(job, JOB_STATUS_READY);
    job_event_ready(job);
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/*
 * Handle successful completion of @job within its transaction.
 *
 * The job moves to WAITING.  If any job in the transaction has not
 * completed yet, nothing more happens.  Once all have completed (each
 * is asserted to have ret == 0), every job is transitioned to PENDING
 * and, when no job requires manual finalization, the transaction is
 * finalized right away.
 */
static void job_completed_txn_success(Job *job)
{
    JobTxn *txn = job->txn;
    Job *member;

    job_state_transition(job, JOB_STATUS_WAITING);

    /*
     * Successful completion, see if there are other running jobs in this
     * txn.
     */
    QLIST_FOREACH(member, &txn->jobs, txn_list) {
        if (!job_is_completed(member)) {
            return;
        }
        assert(member->ret == 0);
    }

    job_txn_apply(job, job_transition_to_pending);

    /* If no jobs need manual finalization, automatically do so */
    if (job_txn_apply(job, job_needs_finalize) != 0) {
        return;
    }
    job_do_finalize(job);
}
|
|
|
|
|
2018-08-29 21:57:33 -04:00
|
|
|
/*
 * Entry point for a job that has just finished running (or was cancelled
 * before it started).  The job must be in a transaction and must not be
 * completed yet.
 *
 * The return code is refreshed and traced; a non-zero code aborts the
 * transaction, zero takes the success path.
 */
static void job_completed(Job *job)
{
    assert(job && job->txn && !job_is_completed(job));

    job_update_rc(job);
    trace_job_completed(job, job->ret);

    if (!job->ret) {
        job_completed_txn_success(job);
    } else {
        job_completed_txn_abort(job);
    }
}
|
|
|
|
|
2018-09-06 09:02:20 -04:00
|
|
|
/** Useful only as a type shim for aio_bh_schedule_oneshot. */
|
|
|
|
static void job_exit(void *opaque)
|
|
|
|
{
|
|
|
|
Job *job = (Job *)opaque;
|
job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different
jobs within a transaction can have different contexts, thus we need to
lock each one individually before applying the callback function.
Similar to job_completed_txn_abort this also requires releasing the
caller's context before and reacquiring it after to avoid recursive
locks which might break AIO_WAIT_WHILE in the callback. This is safe, since
existing code would already have to take this into account, lest
job_completed_txn_abort might have broken.
This also brings to light a different issue: When a callback function in
job_txn_apply moves it's job to a different AIO context, callers will
try to release the wrong lock (now that we re-acquire the lock
correctly, previously it would just continue with the old lock, leaving
the job unlocked for the rest of the return path). Fix this by not caching
the job's context.
This is only necessary for qmp_block_job_finalize, qmp_job_finalize and
job_exit, since everyone else calls through job_exit.
One test needed adapting, since it calls job_finalize directly, so it
manually needs to acquire the correct context.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2020-04-07 13:56:49 +02:00
|
|
|
AioContext *ctx;
|
2018-09-12 13:50:38 +02:00
|
|
|
|
job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different
jobs within a transaction can have different contexts, thus we need to
lock each one individually before applying the callback function.
Similar to job_completed_txn_abort this also requires releasing the
caller's context before and reacquiring it after to avoid recursive
locks which might break AIO_WAIT_WHILE in the callback. This is safe, since
existing code would already have to take this into account, lest
job_completed_txn_abort might have broken.
This also brings to light a different issue: When a callback function in
job_txn_apply moves it's job to a different AIO context, callers will
try to release the wrong lock (now that we re-acquire the lock
correctly, previously it would just continue with the old lock, leaving
the job unlocked for the rest of the return path). Fix this by not caching
the job's context.
This is only necessary for qmp_block_job_finalize, qmp_job_finalize and
job_exit, since everyone else calls through job_exit.
One test needed adapting, since it calls job_finalize directly, so it
manually needs to acquire the correct context.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2020-04-07 13:56:49 +02:00
|
|
|
job_ref(job);
|
|
|
|
aio_context_acquire(job->aio_context);
|
2018-09-07 15:31:22 +02:00
|
|
|
|
|
|
|
/* This is a lie, we're not quiescent, but still doing the completion
|
|
|
|
* callbacks. However, completion callbacks tend to involve operations that
|
|
|
|
* drain block nodes, and if .drained_poll still returned true, we would
|
|
|
|
* deadlock. */
|
|
|
|
job->busy = false;
|
|
|
|
job_event_idle(job);
|
|
|
|
|
2018-09-06 09:02:20 -04:00
|
|
|
job_completed(job);
|
2018-09-07 15:31:22 +02:00
|
|
|
|
job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different
jobs within a transaction can have different contexts, thus we need to
lock each one individually before applying the callback function.
Similar to job_completed_txn_abort this also requires releasing the
caller's context before and reacquiring it after to avoid recursive
locks which might break AIO_WAIT_WHILE in the callback. This is safe, since
existing code would already have to take this into account, lest
job_completed_txn_abort might have broken.
This also brings to light a different issue: When a callback function in
job_txn_apply moves it's job to a different AIO context, callers will
try to release the wrong lock (now that we re-acquire the lock
correctly, previously it would just continue with the old lock, leaving
the job unlocked for the rest of the return path). Fix this by not caching
the job's context.
This is only necessary for qmp_block_job_finalize, qmp_job_finalize and
job_exit, since everyone else calls through job_exit.
One test needed adapting, since it calls job_finalize directly, so it
manually needs to acquire the correct context.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2020-04-07 13:56:49 +02:00
|
|
|
/*
|
|
|
|
* Note that calling job_completed can move the job to a different
|
|
|
|
* aio_context, so we cannot cache from above. job_txn_apply takes care of
|
|
|
|
* acquiring the new lock, and we ref/unref to avoid job_completed freeing
|
|
|
|
* the job underneath us.
|
|
|
|
*/
|
|
|
|
ctx = job->aio_context;
|
|
|
|
job_unref(job);
|
2018-09-12 13:50:38 +02:00
|
|
|
aio_context_release(ctx);
|
2018-09-06 09:02:20 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* All jobs must allow a pause point before entering their job proper. This
|
|
|
|
* ensures that jobs can be paused prior to being started, then resumed later.
|
|
|
|
*/
|
|
|
|
static void coroutine_fn job_co_entry(void *opaque)
|
|
|
|
{
|
|
|
|
Job *job = opaque;
|
|
|
|
|
|
|
|
assert(job && job->driver && job->driver->run);
|
|
|
|
job_pause_point(job);
|
|
|
|
job->ret = job->driver->run(job, &job->err);
|
|
|
|
job->deferred_to_main_loop = true;
|
2018-09-07 15:31:22 +02:00
|
|
|
job->busy = true;
|
2018-09-06 09:02:20 -04:00
|
|
|
aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Start @job: create its coroutine, drop the initial pause, move the job
 * to JOB_STATUS_RUNNING and enter the coroutine in the job's AioContext.
 *
 * The job must not have been started yet, must currently be paused and
 * must have a driver providing a run callback.
 */
void job_start(Job *job)
{
    assert(job && !job_started(job) && job->paused &&
           job->driver && job->driver->run);

    job->co = qemu_coroutine_create(job_co_entry, job);

    /* Undo the pause the job was holding while it was not yet started. */
    job->pause_count -= 1;
    job->paused = false;
    job->busy = true;

    job_state_transition(job, JOB_STATUS_RUNNING);
    aio_co_enter(job->aio_context, job->co);
}
|
|
|
|
|
2018-04-24 16:13:52 +02:00
|
|
|
/*
 * Request cancellation of @job; @force selects a force-cancel.
 *
 * A job that is already CONCLUDED is simply dismissed.  Otherwise the
 * cancellation is requested asynchronously and then, depending on how far
 * the job has progressed:
 *  - a job that was never started is completed right away;
 *  - a job that is still running is entered so it can react;
 *  - a job already deferred to the main loop has its transaction aborted,
 *    but only if it really ended up cancelled.
 */
void job_cancel(Job *job, bool force)
{
    if (job->status == JOB_STATUS_CONCLUDED) {
        job_do_dismiss(job);
        return;
    }

    job_cancel_async(job, force);

    if (!job_started(job)) {
        job_completed(job);
        return;
    }

    if (!job->deferred_to_main_loop) {
        job_enter(job);
        return;
    }

    /*
     * job_cancel_async() ignores soft-cancel requests for jobs that are
     * already done (i.e. deferred to the main loop), so check again
     * whether the job is really cancelled.
     *
     * job_cancel_requested() and job_is_cancelled() are equivalent here,
     * because job_cancel_async() turns soft-cancel requests into no-ops
     * when deferred_to_main_loop is true.  job_is_cancelled() is used to
     * show that job_completed_txn_abort() is invoked only for
     * force-cancelled jobs.
     */
    if (job_is_cancelled(job)) {
        job_completed_txn_abort(job);
    }
}
|
|
|
|
|
|
|
|
/*
 * Cancel @job on behalf of a user.
 *
 * The cancel verb is checked first via job_apply_verb(); when that call
 * returns non-zero (with @errp presumably set by it), nothing is
 * cancelled.  Otherwise the request is forwarded to job_cancel() with
 * the given @force.
 */
void job_user_cancel(Job *job, bool force, Error **errp)
{
    if (!job_apply_verb(job, JOB_VERB_CANCEL, errp)) {
        job_cancel(job, force);
    }
}
|
|
|
|
|
|
|
|
/* A wrapper around job_cancel() taking an Error ** parameter so it may be
|
|
|
|
* used with job_finish_sync() without the need for (rather nasty) function
|
|
|
|
* pointer casts there. */
|
|
|
|
static void job_cancel_err(Job *job, Error **errp)
|
|
|
|
{
|
|
|
|
job_cancel(job, false);
|
|
|
|
}
|
|
|
|
|
2021-10-06 17:19:32 +02:00
|
|
|
/**
|
|
|
|
* Same as job_cancel_err(), but force-cancel.
|
|
|
|
*/
|
|
|
|
static void job_force_cancel_err(Job *job, Error **errp)
|
2018-04-24 16:13:52 +02:00
|
|
|
{
|
2021-10-06 17:19:32 +02:00
|
|
|
job_cancel(job, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Synchronously cancel @job.
 *
 * Picks the force-cancel or soft-cancel callback according to @force and
 * hands it to job_finish_sync(), whose return value is passed through.
 * No Error object is requested from job_finish_sync().
 */
int job_cancel_sync(Job *job, bool force)
{
    void (*cancel_cb)(Job *, Error **) =
        force ? &job_force_cancel_err : &job_cancel_err;

    return job_finish_sync(job, cancel_cb, NULL);
}
|
|
|
|
|
|
|
|
void job_cancel_sync_all(void)
|
|
|
|
{
|
|
|
|
Job *job;
|
|
|
|
AioContext *aio_context;
|
|
|
|
|
|
|
|
while ((job = job_next(NULL))) {
|
|
|
|
aio_context = job->aio_context;
|
|
|
|
aio_context_acquire(aio_context);
|
2021-10-06 17:19:32 +02:00
|
|
|
job_cancel_sync(job, true);
|
2018-04-24 16:13:52 +02:00
|
|
|
aio_context_release(aio_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Synchronously complete @job: job_complete() is used as the finish
 * callback of job_finish_sync(), and that function's return value is
 * handed back to the caller.  Errors are reported through @errp.
 */
int job_complete_sync(Job *job, Error **errp)
{
    int ret = job_finish_sync(job, job_complete, errp);

    return ret;
}
|
|
|
|
|
2018-04-23 12:24:16 +02:00
|
|
|
/*
 * Ask @job to complete.
 *
 * Fails (setting @errp) when the complete verb is not applicable, when a
 * cancellation of any kind has already been requested for the job, or
 * when the job's driver has no complete callback.  Otherwise the
 * driver's complete callback is invoked with @errp.
 */
void job_complete(Job *job, Error **errp)
{
    /* Should not be reachable via external interface for internal jobs */
    assert(job->id);

    if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
        return;
    }

    /*
     * Once cancellation has been requested (soft or forced), the job must
     * not be switched over to normal completion any more.
     */
    if (!job_cancel_requested(job) && job->driver->complete) {
        job->driver->complete(job, errp);
        return;
    }

    error_setg(errp, "The active block job '%s' cannot be completed",
               job->id);
}
|
|
|
|
|
2018-04-20 15:33:57 +02:00
|
|
|
/*
 * Run @finish on @job (if given) and then wait until the job has
 * completed.
 *
 * A reference to the job is held for the duration of the call so that the
 * job's result can still be read once it has completed.
 *
 * Returns -EBUSY (propagating the error to @errp) if @finish reported an
 * error, -ECANCELED if the job was cancelled and its own return value is
 * 0, and the job's return value otherwise.
 */
int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp)
{
    Error *finish_err = NULL;
    int result;

    job_ref(job);

    if (finish) {
        finish(job, &finish_err);
        if (finish_err) {
            error_propagate(errp, finish_err);
            job_unref(job);
            return -EBUSY;
        }
    }

    /* Keep kicking the job on every iteration until it has completed. */
    AIO_WAIT_WHILE(job->aio_context,
                   (job_enter(job), !job_is_completed(job)));

    if (job_is_cancelled(job) && job->ret == 0) {
        result = -ECANCELED;
    } else {
        result = job->ret;
    }

    job_unref(job);
    return result;
}
|