Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block

Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags.  But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan.  Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun.  This is the bulk
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow.  Hence I will be AWOL for the remainder of this week,
  but back at work again on Monday the 15th.  CC'ing Tejun, since any
  potential "surprises" will most likely be from the blk-cgroup work.
  But it's been brewing for a while and sitting in my tree and
  linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
This commit is contained in:
Linus Torvalds 2013-07-11 13:03:24 -07:00
commit 36805aaea5
12 changed files with 910 additions and 439 deletions

View File

@ -94,11 +94,13 @@ Throttling/Upper Limit policy
Hierarchical Cgroups
====================
- Currently only CFQ supports hierarchical groups. For throttling,
cgroup interface does allow creation of hierarchical cgroups and
internally it treats them as flat hierarchy.
If somebody created a hierarchy like as follows.
Both CFQ and throttling implement hierarchy support; however,
throttling's hierarchy support is enabled iff "sane_behavior" is
enabled from cgroup side, which currently is a development option and
not publicly available.
If somebody created a hierarchy as follows:
root
/ \
@ -106,21 +108,20 @@ Hierarchical Cgroups
|
test3
CFQ will handle the hierarchy correctly but and throttling will
practically treat all groups at same level. For details on CFQ
hierarchy support, refer to Documentation/block/cfq-iosched.txt.
Throttling will treat the hierarchy as if it looks like the
following.
CFQ by default and throttling with "sane_behavior" will handle the
hierarchy correctly. For details on CFQ hierarchy support, refer to
Documentation/block/cfq-iosched.txt. For throttling, all limits apply
to the whole subtree while all statistics are local to the IOs
directly generated by tasks in that cgroup.
Throttling without "sane_behavior" enabled from cgroup side will
practically treat all groups as being at the same level, as if the
hierarchy looked like the following.
pivot
/ / \ \
root test1 test2 test3
Nesting cgroups, while allowed, isn't officially supported and blkio
genereates warning when cgroups nest. Once throttling implements
hierarchy support, hierarchy will be supported and the warning will
be removed.
Various user visible config options
===================================
CONFIG_BLK_CGROUP

View File

@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
* subtree.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (!blkg)
return;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd = blkg->pd[i];
if (!pd)
continue;
if (pol && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
kfree(pd);
}
for (i = 0; i < BLKCG_MAX_POLS; i++)
kfree(blkg->pd[i]);
blk_exit_rl(&blkg->rl);
kfree(blkg);
@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->pd[i] = pd;
pd->blkg = blkg;
pd->plid = i;
/* invoke per-policy init */
if (pol->pd_init_fn)
pol->pd_init_fn(blkg);
}
return blkg;
@ -158,8 +124,8 @@ err_free:
* @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success.
*/
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, bool update_hint)
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint)
{
struct blkcg_gq *blkg;
@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
}
blkg = new_blkg;
/* link parent and insert */
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
blkg = ERR_PTR(-EINVAL);
ret = -EINVAL;
goto err_put_css;
}
blkg_get(blkg->parent);
}
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_init_fn)
pol->pd_init_fn(blkg);
}
/* insert */
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) {
@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
q->root_rl.blkg = NULL;
}
static void blkg_rcu_free(struct rcu_head *rcu_head)
/*
* A group is RCU protected, but having an rcu lock does not mean that one
* can access all the fields of blkg and assume these are valid. For
* example, don't try to follow throtl_data and request queue links.
*
* Having a reference to blkg under an rcu allows accesses to only values
* local to groups like group stats and group rate limits.
*/
void __blkg_release_rcu(struct rcu_head *rcu_head)
{
blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}
struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
int i;
/* tell policies that this one is being freed */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
}
void __blkg_release(struct blkcg_gq *blkg)
{
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
if (blkg->parent) {
spin_lock_irq(blkg->q->queue_lock);
blkg_put(blkg->parent);
spin_unlock_irq(blkg->q->queue_lock);
}
/*
* A group is freed in rcu manner. But having an rcu lock does not
* mean that one can access all the fields of blkg and assume these
* are valid. For example, don't try to follow throtl_data and
* request queue links.
*
* Having a reference to blkg under an rcu allows acess to only
* values local to groups like group stats and group rate limits
*/
call_rcu(&blkg->rcu_head, blkg_rcu_free);
blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release);
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
.subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
.module = THIS_MODULE,
/*
* blkio subsystem is utterly broken in terms of hierarchy support.
* It treats all cgroups equally regardless of where they're
* located in the hierarchy - all cgroups are treated as if they're
* right below the root. Fix it and remove the following.
*/
.broken_hierarchy = true,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

View File

@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
blkg->refcnt++;
}
void __blkg_release(struct blkcg_gq *blkg);
void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
lockdep_assert_held(blkg->q->queue_lock);
WARN_ON_ONCE(blkg->refcnt <= 0);
if (!--blkg->refcnt)
__blkg_release(blkg);
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
* subtree.
*
* Each visited cgroup is looked up on @p_blkg->q with __blkg_lookup()
* (without updating the lookup hint); the loop body runs only for
* positions where that lookup returns a non-NULL blkg.
*
* Note that the expansion ends in an unbraced "if", so an "else" placed
* directly after the loop body would bind to that "if".
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Similar to blkg_for_each_descendant_pre() but performs post-order
* traversal instead. Synchronization rules are the same.
*
* Each visited cgroup is looked up on @p_blkg->q with __blkg_lookup()
* (without updating the lookup hint); the loop body runs only for
* positions where that lookup returns a non-NULL blkg.
*
* Note that the expansion ends in an unbraced "if", so an "else" placed
* directly after the loop body would bind to that "if".
*/
#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest

View File

@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
*/
max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) {
max_depth -= 2;
if (!max_depth)
switch (max_depth) {
case 2:
max_depth = 1;
break;
case 3:
max_depth = 2;
break;
default:
max_depth -= 2;
}
if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1;
}

File diff suppressed because it is too large Load Diff

View File

@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
kfree(cfqd);
}
static int cfq_init_queue(struct request_queue *q)
static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct cfq_data *cfqd;
struct blkcg_gq *blkg __maybe_unused;
int i, ret;
struct elevator_queue *eq;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = cfqd;
cfqd->queue = q;
q->elevator->elevator_data = cfqd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
out_free:
kfree(cfqd);
kobject_put(&eq->kobj);
return ret;
}

View File

@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
static int deadline_init_queue(struct request_queue *q)
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!dd)
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
q->elevator->elevator_data = dd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}

View File

@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
static struct kobj_type elv_ktype;
static struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e)
{
struct elevator_queue *eq;
@ -170,6 +170,7 @@ err:
elevator_put(e);
return NULL;
}
EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj)
{
@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
}
}
q->elevator = elevator_alloc(q, e);
if (!q->elevator)
return -ENOMEM;
err = e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
return err;
}
err = e->ops.elevator_init_fn(q, e);
return 0;
}
EXPORT_SYMBOL(elevator_init);
@ -935,17 +927,10 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
spin_unlock_irq(q->queue_lock);
/* allocate, init and register new elevator */
err = -ENOMEM;
q->elevator = elevator_alloc(q, new_e);
if (!q->elevator)
err = new_e->ops.elevator_init_fn(q, new_e);
if (err)
goto fail_init;
err = new_e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
goto fail_init;
}
if (registered) {
err = elv_register_queue(q);
if (err)

View File

@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
return list_entry(rq->queuelist.next, struct request, queuelist);
}
static int noop_init_queue(struct request_queue *q)
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
struct elevator_queue *eq;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd)
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
q->elevator->elevator_data = nd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}

View File

@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
struct backing_dev_info *old = inode->i_data.backing_dev_info;
bool wakeup_bdi = false;
if (unlikely(dst == old)) /* deadlock avoidance */
return;
bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
if (inode->i_state & I_DIRTY) {
if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
}
spin_unlock(&inode->i_lock);
spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */

View File

@ -278,6 +278,8 @@ enum {
*
* - memcg: use_hierarchy is on by default and the cgroup file for
* the flag is not created.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*/
CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),

View File

@ -7,6 +7,7 @@
#ifdef CONFIG_BLOCK
struct io_cq;
struct elevator_type;
typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
struct bio *);
@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *);
typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
typedef int (elevator_init_fn) (struct request_queue *);
typedef int (elevator_init_fn) (struct request_queue *,
struct elevator_type *e);
typedef void (elevator_exit_fn) (struct elevator_queue *);
struct elevator_ops
@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *);
extern void elevator_exit(struct elevator_queue *);
extern int elevator_change(struct request_queue *, const char *);
extern bool elv_rq_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
struct elevator_type *);
/*
* Helper functions.