698b1b3064
Memory compaction can currently be performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attempts - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overall memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more difficult to evaluate. The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. 
This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and compact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> Cc: Rik van Riel <riel@redhat.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: David Rientjes <rientjes@google.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
412 lines
9.0 KiB
C
412 lines
9.0 KiB
C
#undef TRACE_SYSTEM
|
|
#define TRACE_SYSTEM compaction
|
|
|
|
#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
#define _TRACE_COMPACTION_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/tracepoint.h>
|
|
#include <trace/events/mmflags.h>
|
|
|
|
/*
 * Table pairing each compaction status constant with the string printed
 * for it.  Every row goes through EM(), the last one through EMe(), so the
 * meaning of the table depends on how those two helpers are defined at the
 * point where COMPACTION_STATUS is expanded.
 */
#define COMPACTION_STATUS					\
	EM( COMPACT_DEFERRED,		"deferred")		\
	EM( COMPACT_SKIPPED,		"skipped")		\
	EM( COMPACT_CONTINUE,		"continue")		\
	EM( COMPACT_PARTIAL,		"partial")		\
	EM( COMPACT_COMPLETE,		"complete")		\
	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
	EMe(COMPACT_CONTENDED,		"contended")
|
|
|
|
/*
 * IFDEF_ZONE_DMA(X) expands to X when CONFIG_ZONE_DMA is defined and to
 * nothing otherwise.
 */
#ifndef CONFIG_ZONE_DMA
#define IFDEF_ZONE_DMA(X)
#else
#define IFDEF_ZONE_DMA(X) X
#endif
|
|
|
|
/*
 * IFDEF_ZONE_DMA32(X) expands to X when CONFIG_ZONE_DMA32 is defined and
 * to nothing otherwise.
 */
#ifndef CONFIG_ZONE_DMA32
#define IFDEF_ZONE_DMA32(X)
#else
#define IFDEF_ZONE_DMA32(X) X
#endif
|
|
|
|
/*
 * IFDEF_ZONE_HIGHMEM(X) expands to X when CONFIG_HIGHMEM is defined and to
 * nothing otherwise.
 */
#ifndef CONFIG_HIGHMEM
#define IFDEF_ZONE_HIGHMEM(X)
#else
#define IFDEF_ZONE_HIGHMEM(X) X
#endif
|
|
|
|
/*
 * Table pairing each zone type constant with the string printed for it.
 * Rows for ZONE_DMA, ZONE_DMA32 and ZONE_HIGHMEM are wrapped in the
 * matching IFDEF_ZONE_*() helper, so they are present only when the
 * corresponding config option is enabled.  As with the status table, rows
 * use EM() and the final row uses EMe().
 */
#define ZONE_TYPE						\
	IFDEF_ZONE_DMA(		EM (ZONE_DMA,	 "DMA"))	\
	IFDEF_ZONE_DMA32(	EM (ZONE_DMA32,	 "DMA32"))	\
				EM (ZONE_NORMAL, "Normal")	\
	IFDEF_ZONE_HIGHMEM(	EM (ZONE_HIGHMEM,"HighMem"))	\
				EMe(ZONE_MOVABLE,"Movable")
|
|
|
|
/*
|
|
* First define the enums in the above macros to be exported to userspace
|
|
* via TRACE_DEFINE_ENUM().
|
|
*/
|
|
#undef EM
|
|
#undef EMe
|
|
#define EM(a, b) TRACE_DEFINE_ENUM(a);
|
|
#define EMe(a, b) TRACE_DEFINE_ENUM(a);
|
|
|
|
COMPACTION_STATUS
|
|
ZONE_TYPE
|
|
|
|
/*
 * Switch EM()/EMe() over to producing {value, "string"} pairs, which is the
 * form the tables need when they are handed to __print_symbolic() in the
 * events below.  EM() emits a trailing comma; EMe(), used for the last row
 * of a table, does not.
 */
#undef EM
#undef EMe
#define EM(a, b)	{a, b},
#define EMe(a, b)	{a, b}
|
|
|
|
DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
|
|
|
|
TP_PROTO(
|
|
unsigned long start_pfn,
|
|
unsigned long end_pfn,
|
|
unsigned long nr_scanned,
|
|
unsigned long nr_taken),
|
|
|
|
TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, start_pfn)
|
|
__field(unsigned long, end_pfn)
|
|
__field(unsigned long, nr_scanned)
|
|
__field(unsigned long, nr_taken)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->start_pfn = start_pfn;
|
|
__entry->end_pfn = end_pfn;
|
|
__entry->nr_scanned = nr_scanned;
|
|
__entry->nr_taken = nr_taken;
|
|
),
|
|
|
|
TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu",
|
|
__entry->start_pfn,
|
|
__entry->end_pfn,
|
|
__entry->nr_scanned,
|
|
__entry->nr_taken)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
|
|
|
|
TP_PROTO(
|
|
unsigned long start_pfn,
|
|
unsigned long end_pfn,
|
|
unsigned long nr_scanned,
|
|
unsigned long nr_taken),
|
|
|
|
TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
|
|
|
|
TP_PROTO(
|
|
unsigned long start_pfn,
|
|
unsigned long end_pfn,
|
|
unsigned long nr_scanned,
|
|
unsigned long nr_taken),
|
|
|
|
TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
|
|
);
|
|
|
|
TRACE_EVENT(mm_compaction_migratepages,
|
|
|
|
TP_PROTO(unsigned long nr_all,
|
|
int migrate_rc,
|
|
struct list_head *migratepages),
|
|
|
|
TP_ARGS(nr_all, migrate_rc, migratepages),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, nr_migrated)
|
|
__field(unsigned long, nr_failed)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
unsigned long nr_failed = 0;
|
|
struct list_head *page_lru;
|
|
|
|
/*
|
|
* migrate_pages() returns either a non-negative number
|
|
* with the number of pages that failed migration, or an
|
|
* error code, in which case we need to count the remaining
|
|
* pages manually
|
|
*/
|
|
if (migrate_rc >= 0)
|
|
nr_failed = migrate_rc;
|
|
else
|
|
list_for_each(page_lru, migratepages)
|
|
nr_failed++;
|
|
|
|
__entry->nr_migrated = nr_all - nr_failed;
|
|
__entry->nr_failed = nr_failed;
|
|
),
|
|
|
|
TP_printk("nr_migrated=%lu nr_failed=%lu",
|
|
__entry->nr_migrated,
|
|
__entry->nr_failed)
|
|
);
|
|
|
|
TRACE_EVENT(mm_compaction_begin,
|
|
TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
|
|
unsigned long free_pfn, unsigned long zone_end, bool sync),
|
|
|
|
TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, zone_start)
|
|
__field(unsigned long, migrate_pfn)
|
|
__field(unsigned long, free_pfn)
|
|
__field(unsigned long, zone_end)
|
|
__field(bool, sync)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->zone_start = zone_start;
|
|
__entry->migrate_pfn = migrate_pfn;
|
|
__entry->free_pfn = free_pfn;
|
|
__entry->zone_end = zone_end;
|
|
__entry->sync = sync;
|
|
),
|
|
|
|
TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s",
|
|
__entry->zone_start,
|
|
__entry->migrate_pfn,
|
|
__entry->free_pfn,
|
|
__entry->zone_end,
|
|
__entry->sync ? "sync" : "async")
|
|
);
|
|
|
|
TRACE_EVENT(mm_compaction_end,
|
|
TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
|
|
unsigned long free_pfn, unsigned long zone_end, bool sync,
|
|
int status),
|
|
|
|
TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, zone_start)
|
|
__field(unsigned long, migrate_pfn)
|
|
__field(unsigned long, free_pfn)
|
|
__field(unsigned long, zone_end)
|
|
__field(bool, sync)
|
|
__field(int, status)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->zone_start = zone_start;
|
|
__entry->migrate_pfn = migrate_pfn;
|
|
__entry->free_pfn = free_pfn;
|
|
__entry->zone_end = zone_end;
|
|
__entry->sync = sync;
|
|
__entry->status = status;
|
|
),
|
|
|
|
TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s",
|
|
__entry->zone_start,
|
|
__entry->migrate_pfn,
|
|
__entry->free_pfn,
|
|
__entry->zone_end,
|
|
__entry->sync ? "sync" : "async",
|
|
__print_symbolic(__entry->status, COMPACTION_STATUS))
|
|
);
|
|
|
|
TRACE_EVENT(mm_compaction_try_to_compact_pages,
|
|
|
|
TP_PROTO(
|
|
int order,
|
|
gfp_t gfp_mask,
|
|
enum migrate_mode mode),
|
|
|
|
TP_ARGS(order, gfp_mask, mode),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(int, order)
|
|
__field(gfp_t, gfp_mask)
|
|
__field(enum migrate_mode, mode)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->order = order;
|
|
__entry->gfp_mask = gfp_mask;
|
|
__entry->mode = mode;
|
|
),
|
|
|
|
TP_printk("order=%d gfp_mask=0x%x mode=%d",
|
|
__entry->order,
|
|
__entry->gfp_mask,
|
|
(int)__entry->mode)
|
|
);
|
|
|
|
DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
|
|
|
|
TP_PROTO(struct zone *zone,
|
|
int order,
|
|
int ret),
|
|
|
|
TP_ARGS(zone, order, ret),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(int, nid)
|
|
__field(enum zone_type, idx)
|
|
__field(int, order)
|
|
__field(int, ret)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->nid = zone_to_nid(zone);
|
|
__entry->idx = zone_idx(zone);
|
|
__entry->order = order;
|
|
__entry->ret = ret;
|
|
),
|
|
|
|
TP_printk("node=%d zone=%-8s order=%d ret=%s",
|
|
__entry->nid,
|
|
__print_symbolic(__entry->idx, ZONE_TYPE),
|
|
__entry->order,
|
|
__print_symbolic(__entry->ret, COMPACTION_STATUS))
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
|
|
|
|
TP_PROTO(struct zone *zone,
|
|
int order,
|
|
int ret),
|
|
|
|
TP_ARGS(zone, order, ret)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable,
|
|
|
|
TP_PROTO(struct zone *zone,
|
|
int order,
|
|
int ret),
|
|
|
|
TP_ARGS(zone, order, ret)
|
|
);
|
|
|
|
#ifdef CONFIG_COMPACTION
|
|
DECLARE_EVENT_CLASS(mm_compaction_defer_template,
|
|
|
|
TP_PROTO(struct zone *zone, int order),
|
|
|
|
TP_ARGS(zone, order),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(int, nid)
|
|
__field(enum zone_type, idx)
|
|
__field(int, order)
|
|
__field(unsigned int, considered)
|
|
__field(unsigned int, defer_shift)
|
|
__field(int, order_failed)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->nid = zone_to_nid(zone);
|
|
__entry->idx = zone_idx(zone);
|
|
__entry->order = order;
|
|
__entry->considered = zone->compact_considered;
|
|
__entry->defer_shift = zone->compact_defer_shift;
|
|
__entry->order_failed = zone->compact_order_failed;
|
|
),
|
|
|
|
TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
|
|
__entry->nid,
|
|
__print_symbolic(__entry->idx, ZONE_TYPE),
|
|
__entry->order,
|
|
__entry->order_failed,
|
|
__entry->considered,
|
|
1UL << __entry->defer_shift)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred,
|
|
|
|
TP_PROTO(struct zone *zone, int order),
|
|
|
|
TP_ARGS(zone, order)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction,
|
|
|
|
TP_PROTO(struct zone *zone, int order),
|
|
|
|
TP_ARGS(zone, order)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
|
|
|
|
TP_PROTO(struct zone *zone, int order),
|
|
|
|
TP_ARGS(zone, order)
|
|
);
|
|
#endif
|
|
|
|
TRACE_EVENT(mm_compaction_kcompactd_sleep,
|
|
|
|
TP_PROTO(int nid),
|
|
|
|
TP_ARGS(nid),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(int, nid)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->nid = nid;
|
|
),
|
|
|
|
TP_printk("nid=%d", __entry->nid)
|
|
);
|
|
|
|
DECLARE_EVENT_CLASS(kcompactd_wake_template,
|
|
|
|
TP_PROTO(int nid, int order, enum zone_type classzone_idx),
|
|
|
|
TP_ARGS(nid, order, classzone_idx),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(int, nid)
|
|
__field(int, order)
|
|
__field(enum zone_type, classzone_idx)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->nid = nid;
|
|
__entry->order = order;
|
|
__entry->classzone_idx = classzone_idx;
|
|
),
|
|
|
|
TP_printk("nid=%d order=%d classzone_idx=%-8s",
|
|
__entry->nid,
|
|
__entry->order,
|
|
__print_symbolic(__entry->classzone_idx, ZONE_TYPE))
|
|
);
|
|
|
|
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd,
|
|
|
|
TP_PROTO(int nid, int order, enum zone_type classzone_idx),
|
|
|
|
TP_ARGS(nid, order, classzone_idx)
|
|
);
|
|
|
|
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
|
|
|
|
TP_PROTO(int nid, int order, enum zone_type classzone_idx),
|
|
|
|
TP_ARGS(nid, order, classzone_idx)
|
|
);
|
|
|
|
#endif /* _TRACE_COMPACTION_H */
|
|
|
|
/* This part must be outside protection */
|
|
#include <trace/define_trace.h>
|