2005-04-17 00:20:36 +02:00
|
|
|
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/err.h>
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 00:31:49 +02:00
|
|
|
#include <linux/cpuset.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/unistd.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <linux/module.h>
|
2006-03-23 12:00:24 +01:00
|
|
|
#include <linux/mutex.h>
|
2009-04-15 01:39:12 +02:00
|
|
|
#include <trace/events/sched.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-05-09 11:34:32 +02:00
|
|
|
/* Held around every add to / removal from kthread_create_list. */
static DEFINE_SPINLOCK(kthread_create_lock);

/* FIFO of creation requests: kthread_create() appends, kthreadd drains. */
static LIST_HEAD(kthread_create_list);

/* Woken by kthread_create() whenever a new request has been queued. */
struct task_struct *kthreadd_task;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
struct kthread_create_info
|
|
|
|
{
|
2007-05-09 11:34:32 +02:00
|
|
|
/* Information passed to kthread() from kthreadd. */
|
2005-04-17 00:20:36 +02:00
|
|
|
int (*threadfn)(void *data);
|
|
|
|
void *data;
|
|
|
|
|
2007-05-09 11:34:32 +02:00
|
|
|
/* Result passed back to kthread_create() from kthreadd. */
|
2005-04-17 00:20:36 +02:00
|
|
|
struct task_struct *result;
|
|
|
|
struct completion done;
|
2006-11-22 15:55:48 +01:00
|
|
|
|
2007-05-09 11:34:32 +02:00
|
|
|
struct list_head list;
|
2005-04-17 00:20:36 +02:00
|
|
|
};
|
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
struct kthread {
|
|
|
|
int should_stop;
|
|
|
|
struct completion exited;
|
2005-04-17 00:20:36 +02:00
|
|
|
};
|
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
/*
 * Recover the struct kthread belonging to @tsk: kthread() stores the
 * address of the 'exited' member in tsk->vfork_done, so container_of()
 * gives back the enclosing structure.
 */
#define to_kthread(tsk)	\
	container_of((tsk)->vfork_done, struct kthread, exited)
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-06-25 14:49:19 +02:00
|
|
|
/**
|
|
|
|
* kthread_should_stop - should this kthread return now?
|
|
|
|
*
|
2007-02-10 10:45:59 +01:00
|
|
|
* When someone calls kthread_stop() on your kthread, it will be woken
|
2006-06-25 14:49:19 +02:00
|
|
|
* and this will return true. You should then return, and your return
|
|
|
|
* value will be passed through to kthread_stop().
|
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
int kthread_should_stop(void)
|
|
|
|
{
|
2009-06-18 01:27:45 +02:00
|
|
|
return to_kthread(current)->should_stop;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kthread_should_stop);
|
|
|
|
|
|
|
|
static int kthread(void *_create)
|
|
|
|
{
|
2009-06-18 01:27:45 +02:00
|
|
|
/* Copy data: it's on kthread's stack */
|
2005-04-17 00:20:36 +02:00
|
|
|
struct kthread_create_info *create = _create;
|
2009-06-18 01:27:45 +02:00
|
|
|
int (*threadfn)(void *data) = create->threadfn;
|
|
|
|
void *data = create->data;
|
|
|
|
struct kthread self;
|
|
|
|
int ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
self.should_stop = 0;
|
|
|
|
init_completion(&self.exited);
|
|
|
|
current->vfork_done = &self.exited;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* OK, tell user we're spawned, wait for stop or wakeup */
|
2007-05-23 22:57:27 +02:00
|
|
|
__set_current_state(TASK_UNINTERRUPTIBLE);
|
2009-04-09 17:50:35 +02:00
|
|
|
create->result = current;
|
2009-06-18 01:27:43 +02:00
|
|
|
complete(&create->done);
|
2005-04-17 00:20:36 +02:00
|
|
|
schedule();
|
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
ret = -EINTR;
|
|
|
|
if (!self.should_stop)
|
2005-04-17 00:20:36 +02:00
|
|
|
ret = threadfn(data);
|
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
/* we can't just return, we must preserve "self" on stack */
|
|
|
|
do_exit(ret);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-05-09 11:34:32 +02:00
|
|
|
static void create_kthread(struct kthread_create_info *create)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
int pid;
|
|
|
|
|
|
|
|
/* We want our own signal handler (we take no signals by default). */
|
|
|
|
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
|
2009-06-18 01:27:43 +02:00
|
|
|
if (pid < 0) {
|
2005-04-17 00:20:36 +02:00
|
|
|
create->result = ERR_PTR(pid);
|
2009-06-18 01:27:43 +02:00
|
|
|
complete(&create->done);
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2006-06-25 14:49:19 +02:00
|
|
|
/**
|
|
|
|
* kthread_create - create a kthread.
|
|
|
|
* @threadfn: the function to run until signal_pending(current).
|
|
|
|
* @data: data ptr for @threadfn.
|
|
|
|
* @namefmt: printf-style name for the thread.
|
|
|
|
*
|
|
|
|
* Description: This helper function creates and names a kernel
|
|
|
|
* thread. The thread will be stopped: use wake_up_process() to start
|
|
|
|
* it. See also kthread_run(), kthread_create_on_cpu().
|
|
|
|
*
|
|
|
|
* When woken, the thread will run @threadfn() with @data as its
|
2007-02-10 10:45:59 +01:00
|
|
|
* argument. @threadfn() can either call do_exit() directly if it is a
|
2006-06-25 14:49:19 +02:00
|
|
|
* standalone thread for which noone will call kthread_stop(), or
|
|
|
|
* return when 'kthread_should_stop()' is true (which means
|
|
|
|
* kthread_stop() has been called). The return value should be zero
|
|
|
|
* or a negative error number; it will be passed to kthread_stop().
|
|
|
|
*
|
|
|
|
* Returns a task_struct or ERR_PTR(-ENOMEM).
|
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
struct task_struct *kthread_create(int (*threadfn)(void *data),
|
|
|
|
void *data,
|
|
|
|
const char namefmt[],
|
|
|
|
...)
|
|
|
|
{
|
|
|
|
struct kthread_create_info create;
|
|
|
|
|
|
|
|
create.threadfn = threadfn;
|
|
|
|
create.data = data;
|
|
|
|
init_completion(&create.done);
|
2007-05-09 11:34:32 +02:00
|
|
|
|
|
|
|
spin_lock(&kthread_create_lock);
|
|
|
|
list_add_tail(&create.list, &kthread_create_list);
|
|
|
|
spin_unlock(&kthread_create_lock);
|
|
|
|
|
2008-04-29 09:59:23 +02:00
|
|
|
wake_up_process(kthreadd_task);
|
2007-05-09 11:34:32 +02:00
|
|
|
wait_for_completion(&create.done);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!IS_ERR(create.result)) {
|
2009-04-09 17:50:36 +02:00
|
|
|
struct sched_param param = { .sched_priority = 0 };
|
2005-04-17 00:20:36 +02:00
|
|
|
va_list args;
|
2009-04-09 17:50:36 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
va_start(args, namefmt);
|
|
|
|
vsnprintf(create.result->comm, sizeof(create.result->comm),
|
|
|
|
namefmt, args);
|
|
|
|
va_end(args);
|
2009-04-09 17:50:36 +02:00
|
|
|
/*
|
|
|
|
* root may have changed our (kthreadd's) priority or CPU mask.
|
|
|
|
* The kernel thread should not inherit these properties.
|
|
|
|
*/
|
|
|
|
sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m);
|
|
|
|
set_cpus_allowed_ptr(create.result, cpu_all_mask);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
return create.result;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kthread_create);
|
|
|
|
|
2006-06-25 14:49:19 +02:00
|
|
|
/**
|
|
|
|
* kthread_stop - stop a thread created by kthread_create().
|
|
|
|
* @k: thread created by kthread_create().
|
|
|
|
*
|
|
|
|
* Sets kthread_should_stop() for @k to return true, wakes it, and
|
2009-06-19 02:51:13 +02:00
|
|
|
* waits for it to exit. This can also be called after kthread_create()
|
|
|
|
* instead of calling wake_up_process(): the thread will exit without
|
|
|
|
* calling threadfn().
|
|
|
|
*
|
|
|
|
* If threadfn() may call do_exit() itself, the caller must ensure
|
|
|
|
* task_struct can't go away.
|
2006-06-25 14:49:19 +02:00
|
|
|
*
|
|
|
|
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
|
|
|
|
* was never called.
|
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
int kthread_stop(struct task_struct *k)
|
|
|
|
{
|
2009-06-18 01:27:45 +02:00
|
|
|
struct kthread *kthread;
|
2005-04-17 00:20:36 +02:00
|
|
|
int ret;
|
|
|
|
|
tracing, sched: LTTng instrumentation - scheduler
Instrument the scheduler activity (sched_switch, migration, wakeups,
wait for a task, signal delivery) and process/thread
creation/destruction (fork, exit, kthread stop). Actually, kthread
creation is not instrumented in this patch because it is architecture
dependent. It allows to connect tracers such as ftrace which detects
scheduling latencies, good/bad scheduler decisions. Tools like LTTng can
export this scheduler information along with instrumentation of the rest
of the kernel activity to perform post-mortem analysis on the scheduler
activity.
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added. See the "Tracepoints" patch header for
performance result detail.
Changelog :
- Change instrumentation location and parameter to match ftrace
instrumentation, previously done with kernel markers.
[ mingo@elte.hu: conflict resolutions ]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-18 18:16:17 +02:00
|
|
|
trace_sched_kthread_stop(k);
|
2009-06-18 01:27:45 +02:00
|
|
|
get_task_struct(k);
|
tracing, sched: LTTng instrumentation - scheduler
Instrument the scheduler activity (sched_switch, migration, wakeups,
wait for a task, signal delivery) and process/thread
creation/destruction (fork, exit, kthread stop). Actually, kthread
creation is not instrumented in this patch because it is architecture
dependent. It allows to connect tracers such as ftrace which detects
scheduling latencies, good/bad scheduler decisions. Tools like LTTng can
export this scheduler information along with instrumentation of the rest
of the kernel activity to perform post-mortem analysis on the scheduler
activity.
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added. See the "Tracepoints" patch header for
performance result detail.
Changelog :
- Change instrumentation location and parameter to match ftrace
instrumentation, previously done with kernel markers.
[ mingo@elte.hu: conflict resolutions ]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-18 18:16:17 +02:00
|
|
|
|
2009-06-18 01:27:45 +02:00
|
|
|
kthread = to_kthread(k);
|
|
|
|
barrier(); /* it might have exited */
|
|
|
|
if (k->vfork_done != NULL) {
|
|
|
|
kthread->should_stop = 1;
|
|
|
|
wake_up_process(k);
|
|
|
|
wait_for_completion(&kthread->exited);
|
|
|
|
}
|
|
|
|
ret = k->exit_code;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
put_task_struct(k);
|
tracing, sched: LTTng instrumentation - scheduler
Instrument the scheduler activity (sched_switch, migration, wakeups,
wait for a task, signal delivery) and process/thread
creation/destruction (fork, exit, kthread stop). Actually, kthread
creation is not instrumented in this patch because it is architecture
dependent. It allows to connect tracers such as ftrace which detects
scheduling latencies, good/bad scheduler decisions. Tools like LTTng can
export this scheduler information along with instrumentation of the rest
of the kernel activity to perform post-mortem analysis on the scheduler
activity.
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added. See the "Tracepoints" patch header for
performance result detail.
Changelog :
- Change instrumentation location and parameter to match ftrace
instrumentation, previously done with kernel markers.
[ mingo@elte.hu: conflict resolutions ]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-18 18:16:17 +02:00
|
|
|
trace_sched_kthread_stop_ret(ret);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return ret;
|
|
|
|
}
|
2006-07-14 09:24:05 +02:00
|
|
|
EXPORT_SYMBOL(kthread_stop);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-31 09:39:16 +02:00
|
|
|
int kthreadd(void *unused)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-05-09 11:34:32 +02:00
|
|
|
struct task_struct *tsk = current;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-31 09:39:16 +02:00
|
|
|
/* Setup a clean context for our children to inherit. */
|
2007-05-09 11:34:32 +02:00
|
|
|
set_task_comm(tsk, "kthreadd");
|
2007-05-09 11:34:37 +02:00
|
|
|
ignore_signals(tsk);
|
2009-03-31 06:05:10 +02:00
|
|
|
set_cpus_allowed_ptr(tsk, cpu_all_mask);
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 00:31:49 +02:00
|
|
|
set_mems_allowed(node_possible_map);
|
2007-05-09 11:34:32 +02:00
|
|
|
|
2008-06-11 22:04:29 +02:00
|
|
|
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
|
2007-05-09 11:34:32 +02:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
if (list_empty(&kthread_create_list))
|
|
|
|
schedule();
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
|
|
|
|
spin_lock(&kthread_create_lock);
|
|
|
|
while (!list_empty(&kthread_create_list)) {
|
|
|
|
struct kthread_create_info *create;
|
|
|
|
|
|
|
|
create = list_entry(kthread_create_list.next,
|
|
|
|
struct kthread_create_info, list);
|
|
|
|
list_del_init(&create->list);
|
|
|
|
spin_unlock(&kthread_create_lock);
|
|
|
|
|
|
|
|
create_kthread(create);
|
|
|
|
|
|
|
|
spin_lock(&kthread_create_lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&kthread_create_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|