diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 5f172703eb4f..178c23508d3d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -104,30 +104,6 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); -DEFINE_WRITEBACK_EVENT(balance_dirty_start); -DEFINE_WRITEBACK_EVENT(balance_dirty_wait); - -TRACE_EVENT(balance_dirty_written, - - TP_PROTO(struct backing_dev_info *bdi, int written), - - TP_ARGS(bdi, written), - - TP_STRUCT__entry( - __array(char, name, 32) - __field(int, written) - ), - - TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); - __entry->written = written; - ), - - TP_printk("bdi %s written %d", - __entry->name, - __entry->written - ) -); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index daff320d263f..f32f25092c66 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -250,50 +250,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, numerator, denominator); } -static inline void task_dirties_fraction(struct task_struct *tsk, - long *numerator, long *denominator) -{ - prop_fraction_single(&vm_dirties, &tsk->dirties, - numerator, denominator); -} - -/* - * task_dirty_limit - scale down dirty throttling threshold for one task - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - * - * To protect light/slow dirtying tasks from heavier/fast ones, we start - * throttling individual tasks before reaching the bdi dirty limit. - * Relatively low thresholds will be allocated to heavy dirtiers. So when - * dirty pages grow large, heavy dirtiers will be throttled first, which will - * effectively curb the growth of dirty pages. Light dirtiers with high enough - * dirty threshold may never get throttled. - */ -#define TASK_LIMIT_FRACTION 8 -static unsigned long task_dirty_limit(struct task_struct *tsk, - unsigned long bdi_dirty) -{ - long numerator, denominator; - unsigned long dirty = bdi_dirty; - u64 inv = dirty / TASK_LIMIT_FRACTION; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - - return max(dirty, bdi_dirty/2); -} - -/* Minimum limit for any task */ -static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) -{ - return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; -} - /* * */ @@ -986,30 +942,36 @@ static unsigned long dirty_poll_interval(unsigned long dirty, /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force - * the caller to perform writeback if the system is over `vm_dirty_ratio'. + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, - unsigned long write_chunk) + unsigned long pages_dirtied) { - unsigned long nr_reclaimable, bdi_nr_reclaimable; + unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ unsigned long bdi_dirty; unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long task_bdi_thresh; - unsigned long min_task_bdi_thresh; - unsigned long pages_written = 0; - unsigned long pause = 1; + long pause = 0; bool dirty_exceeded = false; - bool clear_dirty_exceeded = true; + unsigned long task_ratelimit; + unsigned long dirty_ratelimit; + unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long start_time = jiffies; for (;;) { + /* + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); @@ -1026,9 +988,23 @@ static void balance_dirty_pages(struct address_space *mapping, if (nr_dirty <= freerun) break; + if (unlikely(!writeback_in_progress(bdi))) + bdi_start_background_writeback(bdi); + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + */ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); - task_bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -1040,57 +1016,41 @@ static void balance_dirty_pages(struct address_space *mapping, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_nr_reclaimable + + if (bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_dirty = bdi_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_nr_reclaimable + + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_dirty = bdi_reclaimable + bdi_stat(bdi, BDI_WRITEBACK); } - /* - * The bdi thresh is somehow "soft" limit derived from the - * global "hard" limit. The former helps to prevent heavy IO - * bdi or process from holding back light ones; The latter is - * the last resort safeguard. - */ - dirty_exceeded = (bdi_dirty > task_bdi_thresh) || + dirty_exceeded = (bdi_dirty > bdi_thresh) || (nr_dirty > dirty_thresh); - clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && - (nr_dirty <= dirty_thresh); - - if (!dirty_exceeded) - break; - - if (!bdi->dirty_exceeded) + if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, nr_dirty, bdi_thresh, bdi_dirty, start_time); - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. - */ - trace_balance_dirty_start(bdi); - if (bdi_nr_reclaimable > task_bdi_thresh) { - pages_written += writeback_inodes_wb(&bdi->wb, - write_chunk); - trace_balance_dirty_written(bdi, pages_written); - if (pages_written >= write_chunk) - break; /* We've done our duty */ + dirty_ratelimit = bdi->dirty_ratelimit; + pos_ratio = bdi_position_ratio(bdi, dirty_thresh, + background_thresh, nr_dirty, + bdi_thresh, bdi_dirty); + if (unlikely(pos_ratio == 0)) { + pause = MAX_PAUSE; + goto pause; } + task_ratelimit = (u64)dirty_ratelimit * + pos_ratio >> RATELIMIT_CALC_SHIFT; + pause = (HZ * pages_dirtied) / (task_ratelimit | 1); + pause = min_t(long, pause, MAX_PAUSE); + +pause: __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); - trace_balance_dirty_wait(bdi); dirty_thresh = hard_dirty_limit(dirty_thresh); /* @@ -1099,22 +1059,11 @@ static void balance_dirty_pages(struct address_space *mapping, * 200ms is typically more than enough to curb heavy dirtiers; * (b) the pause time limit makes the dirtiers more responsive. */ - if (nr_dirty < dirty_thresh && - bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && - time_after(jiffies, start_time + MAX_PAUSE)) + if (nr_dirty < dirty_thresh) break; - - /* - * Increase the delay for each loop, up to our previous - * default of taking a 100ms nap. - */ - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; } - /* Clear dirty_exceeded flag only when no task can exceed the limit */ - if (clear_dirty_exceeded && bdi->dirty_exceeded) + if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; current->nr_dirtied = 0; @@ -1131,8 +1080,10 @@ static void balance_dirty_pages(struct address_space *mapping, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + if (laptop_mode) + return; + + if (nr_reclaimable > background_thresh) bdi_start_background_writeback(bdi); }