diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt index 488773018152..938ea22f2cc0 100644 --- a/Documentation/lockdep-design.txt +++ b/Documentation/lockdep-design.txt @@ -27,33 +27,37 @@ lock-class. State ----- -The validator tracks lock-class usage history into 5 separate state bits: +The validator tracks lock-class usage history into 4n + 1 separate state bits: -- 'ever held in hardirq context' [ == hardirq-safe ] -- 'ever held in softirq context' [ == softirq-safe ] -- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] -- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] +- 'ever held in STATE context' +- 'ever head as readlock in STATE context' +- 'ever head with STATE enabled' +- 'ever head as readlock with STATE enabled' + +Where STATE can be either one of (kernel/lockdep_states.h) + - hardirq + - softirq + - reclaim_fs - 'ever used' [ == !unused ] -When locking rules are violated, these 4 state bits are presented in the -locking error messages, inside curlies. A contrived example: +When locking rules are violated, these state bits are presented in the +locking error messages, inside curlies. A contrived example: modprobe/2287 is trying to acquire lock: - (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 but task is already holding lock: - (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 -The bit position indicates hardirq, softirq, hardirq-read, -softirq-read respectively, and the character displayed in each -indicates: +The bit position indicates STATE, STATE-read, for each of the states listed +above, and the character displayed in each indicates: '.' acquired while irqs disabled '+' acquired in irq context '-' acquired with irqs enabled - '?' read acquired in irq context with irqs enabled. + '?' acquired in irq context with irqs enabled. Unused mutexes cannot be part of the cause of an error. diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 23bf02fb124f..5a58ea3e91e9 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -20,43 +20,10 @@ struct lockdep_map; #include /* - * Lock-class usage-state bits: + * We'd rather not expose kernel/lockdep_states.h this wide, but we do need + * the total number of states... :-( */ -enum lock_usage_bit -{ - LOCK_USED = 0, - LOCK_USED_IN_HARDIRQ, - LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, - LOCK_ENABLED_HARDIRQS, - LOCK_USED_IN_HARDIRQ_READ, - LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS_READ, - LOCK_ENABLED_HARDIRQS_READ, - LOCK_USAGE_STATES -}; - -/* - * Usage-state bitmasks: - */ -#define LOCKF_USED (1 << LOCK_USED) -#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) -#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) -#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) -#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) - -#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) - -#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) -#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) -#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) -#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) - -#define LOCKF_ENABLED_IRQS_READ \ - (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define XXX_LOCK_USAGE_STATES (1+3*4) #define MAX_LOCKDEP_SUBCLASSES 8UL @@ -97,7 +64,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct stack_trace usage_traces[LOCK_USAGE_STATES]; + struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES]; /* * These fields represent a directed graph of lock dependencies, @@ -324,7 +291,11 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } -# define INIT_LOCKDEP .lockdep_recursion = 0, +extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); +extern void lockdep_clear_current_reclaim_state(void); +extern void lockdep_trace_alloc(gfp_t mask); + +# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -342,6 +313,9 @@ static inline void lockdep_on(void) # define lock_release(l, n, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) +# define lockdep_set_current_reclaim_state(g) do { } while (0) +# define lockdep_clear_current_reclaim_state() do { } while (0) +# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 7a0e5c4f8072..3069ec7e0ab8 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f5e793d69bd3..79fcbc4b09d6 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +size_t ring_buffer_page_len(void *page); + + /* * The below functions are fine to use outside the tracing facility. */ @@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; } void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full); +int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, + size_t len, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/include/linux/sched.h b/include/linux/sched.h index 698ce978e97a..0237d124089f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -333,7 +333,9 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; @@ -1330,6 +1332,7 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; + gfp_t lockdep_reclaim_gfp; #endif /* journalling filesystem info */ diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685b861c..51774eb87cc6 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -5,6 +5,7 @@ #include #include #include +#include struct tvec_base; @@ -21,52 +22,126 @@ struct timer_list { char start_comm[16]; int start_pid; #endif +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif }; extern struct tvec_base boot_tvec_bases; +#ifdef CONFIG_LOCKDEP +/* + * NB: because we have to copy the lockdep_map, setting the lockdep_map key + * (second argument) here is required, otherwise it could be initialised to + * the copy of the lockdep_map later! We use the pointer to and the string + * ":" as the key resp. the name of the lockdep_map. + */ +#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ + .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), +#else +#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) +#endif + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .entry = { .prev = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ .base = &boot_tvec_bases, \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ } #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) -void init_timer(struct timer_list *timer); -void init_timer_deferrable(struct timer_list *timer); +void init_timer_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); +void init_timer_deferrable_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); + +#ifdef CONFIG_LOCKDEP +#define init_timer(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_key((timer), #timer, &__key); \ + } while (0) + +#define init_timer_deferrable(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_deferrable_key((timer), #timer, &__key); \ + } while (0) + +#define init_timer_on_stack(timer) \ + do { \ + static struct lock_class_key __key; \ + init_timer_on_stack_key((timer), #timer, &__key); \ + } while (0) + +#define setup_timer(timer, fn, data) \ + do { \ + static struct lock_class_key __key; \ + setup_timer_key((timer), #timer, &__key, (fn), (data));\ + } while (0) + +#define setup_timer_on_stack(timer, fn, data) \ + do { \ + static struct lock_class_key __key; \ + setup_timer_on_stack_key((timer), #timer, &__key, \ + (fn), (data)); \ + } while (0) +#else +#define init_timer(timer)\ + init_timer_key((timer), NULL, NULL) +#define init_timer_deferrable(timer)\ + init_timer_deferrable_key((timer), NULL, NULL) +#define init_timer_on_stack(timer)\ + init_timer_on_stack_key((timer), NULL, NULL) +#define setup_timer(timer, fn, data)\ + setup_timer_key((timer), NULL, NULL, (fn), (data)) +#define setup_timer_on_stack(timer, fn, data)\ + setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data)) +#endif #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -extern void init_timer_on_stack(struct timer_list *timer); +extern void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key); extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } -static inline void init_timer_on_stack(struct timer_list *timer) +static inline void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { - init_timer(timer); + init_timer_key(timer, name, key); } #endif -static inline void setup_timer(struct timer_list * timer, +static inline void setup_timer_key(struct timer_list * timer, + const char *name, + struct lock_class_key *key, void (*function)(unsigned long), unsigned long data) { timer->function = function; timer->data = data; - init_timer(timer); + init_timer_key(timer, name, key); } -static inline void setup_timer_on_stack(struct timer_list *timer, +static inline void setup_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, void (*function)(unsigned long), unsigned long data) { timer->function = function; timer->data = data; - init_timer_on_stack(timer); + init_timer_on_stack_key(timer, name, key); } /** diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h new file mode 100644 index 000000000000..5ca67df87f2a --- /dev/null +++ b/include/trace/lockdep.h @@ -0,0 +1,9 @@ +#ifndef _TRACE_LOCKDEP_H +#define _TRACE_LOCKDEP_H + +#include +#include + +#include + +#endif diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h new file mode 100644 index 000000000000..f713d74a82b4 --- /dev/null +++ b/include/trace/lockdep_event_types.h @@ -0,0 +1,44 @@ + +#ifndef TRACE_EVENT_FORMAT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TPPROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TPARGS(lock, subclass, trylock, read, check, next_lock, ip), + TPFMT("%s%s%s", trylock ? "try " : "", + read ? "read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TPARGS(lock, nested, ip), + TPFMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +TRACE_FORMAT(lock_acquired, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +#endif +#endif + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33c8ed5ccb6c..df56f5694be6 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index ea2ef2051762..fd13750ca4ba 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06b0c3568f0b..cb70c1db85d0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include @@ -310,12 +312,14 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 +# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 +# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE /* * Quick filtering for interesting events: */ @@ -443,17 +447,18 @@ atomic_t nr_find_usage_backwards_recursions; * Locking printouts: */ +#define __USAGE(__STATE) \ + [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ + [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ + [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ + [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", + static const char *usage_str[] = { - [LOCK_USED] = "initial-use ", - [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", - [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", - [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", - [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", - [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", - [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", - [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", - [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", +#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + [LOCK_USED] = "INITIAL USE", }; const char * __get_key_name(struct lockdep_subclass_key *key, char *str) @@ -461,46 +466,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str) return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } -void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +static inline unsigned long lock_flag(enum lock_usage_bit bit) { - *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; + return 1UL << bit; +} - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) - *c1 = '+'; - else - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) - *c1 = '-'; +static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) +{ + char c = '.'; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) - *c2 = '+'; - else - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) - *c2 = '-'; - - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) - *c3 = '-'; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { - *c3 = '+'; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) - *c3 = '?'; + if (class->usage_mask & lock_flag(bit + 2)) + c = '+'; + if (class->usage_mask & lock_flag(bit)) { + c = '-'; + if (class->usage_mask & lock_flag(bit + 2)) + c = '?'; } - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) - *c4 = '-'; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { - *c4 = '+'; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) - *c4 = '?'; - } + return c; +} + +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) +{ + int i = 0; + +#define LOCKDEP_STATE(__STATE) \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + usage[i] = '\0'; } static void print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN], c1, c2, c3, c4; + char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; const char *name; - get_usage_chars(class, &c1, &c2, &c3, &c4); + get_usage_chars(class, usage); name = class->name; if (!name) { @@ -513,7 +517,7 @@ static void print_lock_name(struct lock_class *class) if (class->subclass) printk("/%d", class->subclass); } - printk("){%c%c%c%c}", c1, c2, c3, c4); + printk("){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -1263,9 +1267,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev, bit_backwards, bit_forwards, irqclass); } -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static const char *state_names[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE), +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static const char *state_rnames[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE)"-READ", +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline const char *state_name(enum lock_usage_bit bit) +{ + return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +} + +static int exclusive_bit(int new_bit) +{ + /* + * USED_IN + * USED_IN_READ + * ENABLED + * ENABLED_READ + * + * bit 0 - write/read + * bit 1 - used_in/enabled + * bit 2+ state + */ + + int state = new_bit & ~3; + int dir = new_bit & 2; + + /* + * keep state, bit flip the direction and strip read. + */ + return state | (dir ^ 2); +} + +static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit) { /* * Prove that the new dependency does not connect a hardirq-safe @@ -1273,38 +1317,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, * the backwards-subgraph starting at , and the * forwards-subgraph starting at : */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) return 0; + bit++; /* _READ */ + /* * Prove that the new dependency does not connect a hardirq-safe-read * lock with a hardirq-unsafe lock - to achieve this we search * the backwards-subgraph starting at , and the * forwards-subgraph starting at : */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) return 0; - /* - * Prove that the new dependency does not connect a softirq-safe - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - /* - * Prove that the new dependency does not connect a softirq-safe-read - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) + return 1; +} + +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ +#define LOCKDEP_STATE(__STATE) \ + if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ return 0; +#include "lockdep_states.h" +#undef LOCKDEP_STATE return 1; } @@ -1861,9 +1901,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); @@ -1933,7 +1973,7 @@ void print_irqtrace_events(struct task_struct *curr) print_ip_sym(curr->softirq_disable_ip); } -static int hardirq_verbose(struct lock_class *class) +static int HARDIRQ_verbose(struct lock_class *class) { #if HARDIRQ_VERBOSE return class_filter(class); @@ -1941,7 +1981,7 @@ static int hardirq_verbose(struct lock_class *class) return 0; } -static int softirq_verbose(struct lock_class *class) +static int SOFTIRQ_verbose(struct lock_class *class) { #if SOFTIRQ_VERBOSE return class_filter(class); @@ -1949,185 +1989,95 @@ static int softirq_verbose(struct lock_class *class) return 0; } +static int RECLAIM_FS_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE + return class_filter(class); +#endif + return 0; +} + #define STRICT_READ_CHECKS 1 -static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, +static int (*state_verbose_f[])(struct lock_class *class) = { +#define LOCKDEP_STATE(__STATE) \ + __STATE##_verbose, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline int state_verbose(enum lock_usage_bit bit, + struct lock_class *class) +{ + return state_verbose_f[bit >> 2](class); +} + +typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, + enum lock_usage_bit bit, const char *name); + +static int +mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - int ret = 1; + int excl_bit = exclusive_bit(new_bit); + int read = new_bit & 1; + int dir = new_bit & 2; - switch(new_bit) { - case LOCK_USED_IN_HARDIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + * + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + check_usage_f usage = dir ? + check_usage_backwards : check_usage_forwards; + + /* + * Validate that this particular lock does not have conflicting + * usage states. + */ + if (!valid_state(curr, this, new_bit, excl_bit)) + return 0; + + /* + * Validate that the lock dependencies don't have conflicting usage + * states. + */ + if ((!read || !dir || STRICT_READ_CHECKS) && + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + return 0; + + /* + * Check for read in write conflicts + */ + if (!read) { + if (!valid_state(curr, this, new_bit, excl_bit + 1)) return 0; - if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_HARDIRQS_READ)) + + if (STRICT_READ_CHECKS && + !usage(curr, this, excl_bit + 1, + state_name(new_bit + 1))) return 0; - /* - * just marked it hardirq-safe, check that this lock - * took no hardirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-safe, check that this lock - * took no hardirq-unsafe-read lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS_READ, "hard-read")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_SOFTIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - /* - * just marked it softirq-safe, check that this lock - * took no softirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-safe, check that this lock - * took no softirq-unsafe-read lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_HARDIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) - return 0; - /* - * just marked it hardirq-read-safe, check that this lock - * took no hardirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_SOFTIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) - return 0; - /* - * just marked it softirq-read-safe, check that this lock - * took no softirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_HARDIRQS: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - /* - * just marked it hardirq-unsafe, check that no hardirq-safe - * lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ, "hard")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-unsafe, check that no - * hardirq-safe-read lock in the system ever took - * it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ_READ, "hard-read")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_SOFTIRQS: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - /* - * just marked it softirq-unsafe, check that no softirq-safe - * lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ, "soft")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-unsafe, check that no - * softirq-safe-read lock in the system ever took - * it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_HARDIRQS_READ: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-read-unsafe, check that no - * hardirq-safe lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ, "hard")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_SOFTIRQS_READ: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-read-unsafe, check that no - * softirq-safe lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ, "soft")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - default: - WARN_ON(1); - break; } - return ret; + if (state_verbose(new_bit, hlock_class(this))) + return 2; + + return 1; } +enum mark_type { +#define LOCKDEP_STATE(__STATE) __STATE, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq) +mark_held_locks(struct task_struct *curr, enum mark_type mark) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -2136,17 +2086,12 @@ mark_held_locks(struct task_struct *curr, int hardirq) for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; - if (hardirq) { - if (hlock->read) - usage_bit = LOCK_ENABLED_HARDIRQS_READ; - else - usage_bit = LOCK_ENABLED_HARDIRQS; - } else { - if (hlock->read) - usage_bit = LOCK_ENABLED_SOFTIRQS_READ; - else - usage_bit = LOCK_ENABLED_SOFTIRQS; - } + usage_bit = 2 + (mark << 2); /* ENABLED */ + if (hlock->read) + usage_bit += 1; /* READ */ + + BUG_ON(usage_bit >= LOCK_USAGE_STATES); + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -2200,7 +2145,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1)) + if (!mark_held_locks(curr, HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2208,7 +2153,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0)) + if (!mark_held_locks(curr, SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2288,7 +2233,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); + mark_held_locks(curr, SOFTIRQ); } /* @@ -2317,6 +2262,31 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(&redundant_softirqs_off); } +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_WAIT)) + return; + + /* this guy won't enter reclaim */ + if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return; + + if (DEBUG_LOCKS_WARN_ON(irqs_disabled())) + return; + + mark_held_locks(curr, RECLAIM_FS); +} + static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2345,19 +2315,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) if (!hlock->hardirqs_off) { if (hlock->read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) + LOCK_ENABLED_HARDIRQ_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) + LOCK_ENABLED_SOFTIRQ_READ)) return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) + LOCK_ENABLED_HARDIRQ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) + LOCK_ENABLED_SOFTIRQ)) + return 0; + } + } + + /* + * We reuse the irq context infrastructure more broadly as a general + * context checking code. This tests GFP_FS recursion (a lock taken + * during reclaim for a GFP_FS allocation is held over a GFP_FS + * allocation). + */ + if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { + if (hlock->read) { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) return 0; } } @@ -2412,6 +2398,10 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } +void lockdep_trace_alloc(gfp_t gfp_mask) +{ +} + #endif /* @@ -2445,14 +2435,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; switch (new_bit) { - case LOCK_USED_IN_HARDIRQ: - case LOCK_USED_IN_SOFTIRQ: - case LOCK_USED_IN_HARDIRQ_READ: - case LOCK_USED_IN_SOFTIRQ_READ: - case LOCK_ENABLED_HARDIRQS: - case LOCK_ENABLED_SOFTIRQS: - case LOCK_ENABLED_HARDIRQS_READ: - case LOCK_ENABLED_SOFTIRQS_READ: +#define LOCKDEP_STATE(__STATE) \ + case LOCK_USED_IN_##__STATE: \ + case LOCK_USED_IN_##__STATE##_READ: \ + case LOCK_ENABLED_##__STATE: \ + case LOCK_ENABLED_##__STATE##_READ: +#include "lockdep_states.h" +#undef LOCKDEP_STATE ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; @@ -2925,6 +2914,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +DEFINE_TRACE(lock_acquire); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2935,6 +2926,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -2949,11 +2942,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); +DEFINE_TRACE(lock_release); + void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + trace_lock_release(lock, nested, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -2966,6 +2963,16 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +void lockdep_set_current_reclaim_state(gfp_t gfp_mask) +{ + current->lockdep_reclaim_gfp = gfp_mask; +} + +void lockdep_clear_current_reclaim_state(void) +{ + current->lockdep_reclaim_gfp = 0; +} + #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, @@ -3092,10 +3099,14 @@ found_it: lock->ip = ip; } +DEFINE_TRACE(lock_contended); + void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_contended(lock, ip); + if (unlikely(!lock_stat)) return; @@ -3111,10 +3122,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); +DEFINE_TRACE(lock_acquired); + void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_acquired(lock, ip); + if (unlikely(!lock_stat)) return; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 56b196932c08..a2cc7e9a6e84 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -6,6 +6,45 @@ * lockdep subsystem internal functions and variables. */ +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { +#define LOCKDEP_STATE(__STATE) \ + LOCK_USED_IN_##__STATE, \ + LOCK_USED_IN_##__STATE##_READ, \ + LOCK_ENABLED_##__STATE, \ + LOCK_ENABLED_##__STATE##_READ, +#include "lockdep_states.h" +#undef LOCKDEP_STATE + LOCK_USED, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE), + +enum { +#define LOCKDEP_STATE(__STATE) \ + __LOCKF(USED_IN_##__STATE) \ + __LOCKF(USED_IN_##__STATE##_READ) \ + __LOCKF(ENABLED_##__STATE) \ + __LOCKF(ENABLED_##__STATE##_READ) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + __LOCKF(USED) +}; + +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_ENABLED_IRQ_READ \ + (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. @@ -31,8 +70,10 @@ extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -extern void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, + char usage[LOCK_USAGE_CHARS]); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 13716b813896..d7135aa2d2c4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v) { struct lock_class *class = v; struct lock_list *entry; - char c1, c2, c3, c4; + char usage[LOCK_USAGE_CHARS]; if (v == SEQ_START_TOKEN) { seq_printf(m, "all lock classes:\n"); @@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v) seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); #endif - get_usage_chars(class, &c1, &c2, &c3, &c4); - seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); seq_printf(m, ": "); print_name(m, class); @@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_uncategorized++; if (class->usage_mask & LOCKF_USED_IN_IRQ) nr_irq_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS) + if (class->usage_mask & LOCKF_ENABLED_IRQ) nr_irq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) nr_softirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) nr_softirq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) nr_hardirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) nr_hardirq_unsafe++; if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) nr_irq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) nr_irq_read_unsafe++; if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) nr_softirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) nr_softirq_read_unsafe++; if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) nr_hardirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; #ifdef CONFIG_PROVE_LOCKING @@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) static void seq_header(struct seq_file *m) { seq_printf(m, "lock_stat version 0.3\n"); + + if (unlikely(!debug_locks)) + seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h new file mode 100644 index 000000000000..995b0cc2b84c --- /dev/null +++ b/kernel/lockdep_states.h @@ -0,0 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ +LOCKDEP_STATE(HARDIRQ) +LOCKDEP_STATE(SOFTIRQ) +LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 1d94160eb532..50d022e5a560 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index babfbdfc534b..6b2d735846a5 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4f45d4b658ef..5d79781394a3 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock) * 'unlocked' into 'locked' state. */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock) * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } +#endif spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -196,6 +265,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -222,7 +292,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); diff --git a/kernel/mutex.h b/kernel/mutex.h index a075dafbb290..67578ca48f94 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index 328f9c7448a5..e1f676e20119 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4543,15 +4543,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4608,13 +4606,80 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b5d2c6..07bc02e99ab1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143d..ef1c385bc572 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer) debug_object_free(timer, &timer_debug_descr); } -static void __init_timer(struct timer_list *timer); +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key); -void init_timer_on_stack(struct timer_list *timer) +void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer); + __init_timer(timer, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack); +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { @@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif -static void __init_timer(struct timer_list *timer) +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); @@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer) timer->start_pid = -1; memset(timer->start_comm, 0, TASK_COMM_LEN); #endif + lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** @@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer) * init_timer() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer(struct timer_list *timer) +void init_timer_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { debug_timer_init(timer); - __init_timer(timer); + __init_timer(timer, name, key); } -EXPORT_SYMBOL(init_timer); +EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable(struct timer_list *timer) +void init_timer_deferrable_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) { - init_timer(timer); + init_timer_key(timer, name, key); timer_set_deferrable(timer); } -EXPORT_SYMBOL(init_timer_deferrable); +EXPORT_SYMBOL(init_timer_deferrable_key); static inline void detach_timer(struct timer_list *timer, int clear_pending) @@ -789,6 +800,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync); */ int del_timer_sync(struct timer_list *timer) { +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -861,10 +881,36 @@ static inline void __run_timers(struct tvec_base *base) set_running_timer(base, timer); detach_timer(timer, 1); + spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from + * inside the function that is called from + * it, this we need to take into account for + * lockdep too. To avoid bogus "held lock + * freed" warnings as well as problems when + * looking into timer->lockdep_map, make a + * copy and use that here. + */ + struct lockdep_map lockdep_map = + timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map + * around the fn() call here and in + * del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + fn(data); + + lock_map_release(&lockdep_map); + if (preempt_count != preempt_count()) { printk(KERN_ERR "huh, entered %p " "with preempt_count %08x, exited" diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a8c275c01e83..f2a163db52f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -61,6 +61,8 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + /** * tracing_on - enable all tracing buffers * @@ -132,7 +134,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); -#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 @@ -234,6 +236,18 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -254,7 +268,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) /* * head_page == tail_page && head == tail then buffer is empty. @@ -2378,8 +2392,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2387,6 +2401,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } @@ -2406,6 +2422,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * @@ -2418,7 +2435,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * @@ -2435,70 +2452,103 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * <0 if no data has been transferred. */ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; + unsigned int commit; unsigned int read; int ret = -1; + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + return -1; + + len -= BUF_PAGE_HDR_SIZE; + if (!data_page) - return 0; + return -1; bpage = *data_page; if (!bpage) - return 0; + return -1; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) goto out; - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - read = cpu_buffer->reader_page->read; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. */ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + if (read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int rpos = read; + unsigned int pos = 0; + unsigned int size; if (full) goto out; - /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data + read, rpage->data + read, commit - read); - /* consume what was read */ - cpu_buffer->reader_page->read = commit; + if (len > (commit - read)) + len = (commit - read); + + size = rb_event_length(event); + + if (len < size) + goto out; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + rpos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + rpos = reader->read; + pos += size; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); /* update bpage */ - local_set(&bpage->commit, commit); - if (!read) - bpage->time_stamp = rpage->time_stamp; + local_set(&bpage->commit, pos); + bpage->time_stamp = rpage->time_stamp; + + /* we copied everything to the beginning */ + read = 0; } else { /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + reader->read = 0; *data_page = bpage; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage, read); } ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea055aa21cd9..c8abbb0c8397 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -11,31 +11,30 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include -#include -#include -#include - -#include -#include -#include #include "trace.h" #include "trace_output.h" @@ -624,7 +623,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static DEFINE_SPINLOCK(trace_cmdline_lock); +static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -736,7 +735,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!spin_trylock(&trace_cmdline_lock)) + if (!__raw_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -754,7 +753,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - spin_unlock(&trace_cmdline_lock); + __raw_spin_unlock(&trace_cmdline_lock); } char *trace_find_cmdline(int pid) @@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +struct ftrace_buffer_info { + struct trace_array *tr; + void *spare; + int cpu; + unsigned int read; +}; + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + int cpu = (int)(long)inode->i_private; + struct ftrace_buffer_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = cpu; + info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + if (!info->spare) + goto out; + + filp->private_data = info; + + return 0; + + out: + kfree(info); + return -ENOMEM; +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + unsigned int pos; + ssize_t ret; + size_t size; + + /* Do we have previous read data to read? */ + if (info->read < PAGE_SIZE) + goto read; + + info->read = 0; + + ret = ring_buffer_read_page(info->tr->buffer, + &info->spare, + count, + info->cpu, 0); + if (ret < 0) + return 0; + + pos = ring_buffer_page_len(info->spare); + + if (pos < PAGE_SIZE) + memset(info->spare + pos, 0, PAGE_SIZE - pos); + +read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret) + return -EFAULT; + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + + ring_buffer_free_read_page(info->tr->buffer, info->spare); + kfree(info); + + return 0; +} + +struct buffer_ref { + struct ring_buffer *buffer; + void *page; + int ref; +}; + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + buf->private = 0; +} + +static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + ref->ref++; +} + +/* Pipe buffer operations for a buffer. */ +static struct pipe_buf_operations buffer_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = buffer_pipe_buf_release, + .steal = buffer_pipe_buf_steal, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int size, i; + size_t ret; + + /* + * We can't seek on a buffer input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + + for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + break; + + ref->buffer = info->tr->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer); + if (!ref->page) { + kfree(ref); + break; + } + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, info->cpu, 0); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, + ref->page); + kfree(ref); + break; + } + + /* + * zero out any left over data, this is going to + * user land. + */ + size = ring_buffer_page_len(ref->page); + if (size < PAGE_SIZE) + memset(ref->page + size, 0, PAGE_SIZE - size); + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + } + + spd.nr_pages = i; + + /* did we read anything? */ + if (!spd.nr_pages) { + if (flags & SPLICE_F_NONBLOCK) + ret = -EAGAIN; + else + ret = 0; + /* TODO: block */ + return ret; + } + + ret = splice_to_pipe(pipe, &spd); + + return ret; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void) static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; + struct dentry *buffers; struct dentry *entry; int cpu; @@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_marker' entry\n"); + buffers = debugfs_create_dir("binary_buffers", d_tracer); + + if (!buffers) + pr_warning("Could not create buffers directory\n"); + else { + int cpu; + char buf[64]; + + for_each_tracing_cpu(cpu) { + sprintf(buf, "%d", cpu); + + entry = debugfs_create_file(buf, 0444, buffers, + (void *)(long)cpu, + &tracing_buffers_fops); + if (!entry) + pr_warning("Could not create debugfs buffers " + "'%s' entry\n", buf); + } + } + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, @@ -3491,7 +3751,7 @@ static __init int tracer_init_debugfs(void) int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3513,7 +3773,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out; pause_graph_tracing(); - spin_lock_irqsave(&trace_buf_lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3532,7 +3793,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: preempt_enable_notrace(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e606633fb498..561bb5c5d988 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -217,6 +217,7 @@ enum trace_flag_type { */ struct trace_array_cpu { atomic_t disabled; + void *buffer_page; /* ring buffer spare */ /* these fields get copied into max-trace: */ unsigned long trace_idx; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 041789ffbac1..2c8d76c7dbed 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,7 +5,7 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, "() " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) @@ -112,7 +112,7 @@ #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ - event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ + event_trace_printk(_RET_IP_, #call ": " fmt); \ } \ \ static int ftrace_reg_event_##call(void) \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1bcf9cd4baa0..a0879b2e8b6b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -402,7 +402,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !X86 && !MIPS && !PPC + select FRAME_POINTER if !MIPS && !PPC select KALLSYMS select KALLSYMS_ALL diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c44ed49ca93..a3803ea8c27d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + lockdep_trace_alloc(gfp_mask); + might_sleep_if(wait); if (should_fail_alloc_page(gfp_mask, order)) @@ -1578,12 +1580,15 @@ nofail_alloc: */ cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; + + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); p->flags &= ~PF_MEMALLOC; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index aeeb4ecb9428..9ec66c3e6ee0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3327,6 +3327,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; @@ -3403,6 +3405,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + lockdep_trace_alloc(flags); + if (slab_should_failslab(cachep, flags)) return NULL; diff --git a/mm/slob.c b/mm/slob.c index f9cc24688232..596152926a8d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -466,6 +466,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; + lockdep_trace_alloc(flags); + if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; diff --git a/mm/slub.c b/mm/slub.c index 6de5e07c8850..816734ed8aa3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1597,6 +1597,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, unsigned long flags; unsigned int objsize; + lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); if (should_failslab(s->objsize, gfpflags)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6177e3bcd66b..ae6f4c174a12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1965,6 +1965,8 @@ static int kswapd(void *p) }; node_to_cpumask_ptr(cpumask, pgdat->node_id); + lockdep_set_current_reclaim_state(GFP_KERNEL); + if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state;